SkBlitRow_opts_SSE2.cpp revision 275804782f7b752cc9c25cb556db2a0cfc711dd9
19272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org/*
298a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com * Copyright 2012 The Android Open Source Project
3ec3ed6a5ebf6f2c406d7bcf94b6bc34fcaeb976eepoger@google.com *
4ec3ed6a5ebf6f2c406d7bcf94b6bc34fcaeb976eepoger@google.com * Use of this source code is governed by a BSD-style license that can be
5ec3ed6a5ebf6f2c406d7bcf94b6bc34fcaeb976eepoger@google.com * found in the LICENSE file.
69272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org */
79272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org
8ec3ed6a5ebf6f2c406d7bcf94b6bc34fcaeb976eepoger@google.com
94e753558fc8cc2f77cbcd46fba80d8612e836a1esenorblanco@chromium.org#include "SkBlitRow_opts_SSE2.h"
1083ecdc3ac69c9208493c4c3fc8ea9f84b1350535caryclark@google.com#include "SkBitmapProcState_opts_SSE2.h"
119272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org#include "SkColorPriv.h"
12475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org#include "SkColor_opts_SSE2.h"
13275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org#include "SkDither.h"
14c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org#include "SkUtils.h"
159272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org
169272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org#include <emmintrin.h>
179272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org
189272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org/* SSE2 version of S32_Blend_BlitRow32()
199272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org * portable version is in core/SkBlitRow_D32.cpp
209272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org */
214e753558fc8cc2f77cbcd46fba80d8612e836a1esenorblanco@chromium.orgvoid S32_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
224e753558fc8cc2f77cbcd46fba80d8612e836a1esenorblanco@chromium.org                              const SkPMColor* SK_RESTRICT src,
234e753558fc8cc2f77cbcd46fba80d8612e836a1esenorblanco@chromium.org                              int count, U8CPU alpha) {
249272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org    SkASSERT(alpha <= 255);
259272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org    if (count <= 0) {
269272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org        return;
279272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org    }
289272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org
299272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org    uint32_t src_scale = SkAlpha255To256(alpha);
309272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org    uint32_t dst_scale = 256 - src_scale;
319272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org
32dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org    if (count >= 4) {
33dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        SkASSERT(((size_t)dst & 0x03) == 0);
34dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        while (((size_t)dst & 0x0F) != 0) {
35dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
36dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            src++;
37dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            dst++;
38dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            count--;
39dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        }
40dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
41dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        const __m128i *s = reinterpret_cast<const __m128i*>(src);
42dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        __m128i *d = reinterpret_cast<__m128i*>(dst);
43dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
4498a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com        __m128i ag_mask = _mm_set1_epi32(0xFF00FF00);
4598a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com
4698a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com        // Move scale factors to upper byte of word
4798a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com        __m128i src_scale_wide = _mm_set1_epi16(src_scale << 8);
4898a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com        __m128i dst_scale_wide = _mm_set1_epi16(dst_scale << 8);
49dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        while (count >= 4) {
50dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Load 4 pixels each of src and dest.
51dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            __m128i src_pixel = _mm_loadu_si128(s);
52dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            __m128i dst_pixel = _mm_load_si128(d);
53dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
5498a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com            // Interleave Atom port 0/1 operations based on the execution port
5598a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com            // constraints that multiply can only be executed on port 0 (while
5698a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com            // boolean operations can be executed on either port 0 or port 1)
5798a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com            // because GCC currently doesn't do a good job scheduling
5898a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com            // instructions based on these constraints.
5998a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com
60dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Get red and blue pixels into lower byte of each word.
6198a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com            // (0, r, 0, b, 0, r, 0, b, 0, r, 0, b, 0, r, 0, b)
62dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
63dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
6498a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com            // Multiply by scale.
6598a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com            // (4 x (0, rs.h, 0, bs.h))
6698a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com            // where rs.h stands for the higher byte of r * scale, and
6798a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com            // bs.h the higher byte of b * scale.
6898a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com            src_rb = _mm_mulhi_epu16(src_rb, src_scale_wide);
6998a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com
7098a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com            // Get alpha and green pixels into higher byte of each word.
7198a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com            // (a, 0, g, 0, a, 0, g, 0, a, 0, g, 0, a, 0, g, 0)
7298a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com            __m128i src_ag = _mm_and_si128(ag_mask, src_pixel);
73dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
74dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Multiply by scale.
7598a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com            // (4 x (as.h, as.l, gs.h, gs.l))
7698a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com            src_ag = _mm_mulhi_epu16(src_ag, src_scale_wide);
77dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
7898a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com            // Clear the lower byte of the a*scale and g*scale results
7998a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com            // (4 x (as.h, 0, gs.h, 0))
8098a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com            src_ag = _mm_and_si128(src_ag, ag_mask);
8198a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com
8298a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com            // Operations the destination pixels are the same as on the
8398a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com            // source pixels. See the comments above.
8498a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com            __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
8598a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com            dst_rb = _mm_mulhi_epu16(dst_rb, dst_scale_wide);
8698a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com            __m128i dst_ag = _mm_and_si128(ag_mask, dst_pixel);
8798a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com            dst_ag = _mm_mulhi_epu16(dst_ag, dst_scale_wide);
8898a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com            dst_ag = _mm_and_si128(dst_ag, ag_mask);
89dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
90dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Combine back into RGBA.
9198a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com            // (4 x (as.h, rs.h, gs.h, bs.h))
92dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            src_pixel = _mm_or_si128(src_rb, src_ag);
93dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            dst_pixel = _mm_or_si128(dst_rb, dst_ag);
94dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
95dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Add result
96dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
97dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            _mm_store_si128(d, result);
98dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            s++;
99dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            d++;
100dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            count -= 4;
101dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        }
102dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        src = reinterpret_cast<const SkPMColor*>(s);
103dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        dst = reinterpret_cast<SkPMColor*>(d);
1049272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org    }
1059272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org
1064e753558fc8cc2f77cbcd46fba80d8612e836a1esenorblanco@chromium.org    while (count > 0) {
1079272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org        *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
1089272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org        src++;
1099272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org        dst++;
1109272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org        count--;
1119272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org    }
1129272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org}
1139272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org
1144e753558fc8cc2f77cbcd46fba80d8612e836a1esenorblanco@chromium.orgvoid S32A_Opaque_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
1154e753558fc8cc2f77cbcd46fba80d8612e836a1esenorblanco@chromium.org                                const SkPMColor* SK_RESTRICT src,
1164e753558fc8cc2f77cbcd46fba80d8612e836a1esenorblanco@chromium.org                                int count, U8CPU alpha) {
1179272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org    SkASSERT(alpha == 255);
1189272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org    if (count <= 0) {
1199272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org        return;
1209272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org    }
121dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
122dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org    if (count >= 4) {
123dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        SkASSERT(((size_t)dst & 0x03) == 0);
124dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        while (((size_t)dst & 0x0F) != 0) {
125dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            *dst = SkPMSrcOver(*src, *dst);
126dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            src++;
127dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            dst++;
128dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            count--;
129dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        }
130dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
131dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        const __m128i *s = reinterpret_cast<const __m128i*>(src);
132dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        __m128i *d = reinterpret_cast<__m128i*>(dst);
1339272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org#ifdef SK_USE_ACCURATE_BLENDING
134dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
135dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        __m128i c_128 = _mm_set1_epi16(128);  // 8 copies of 128 (16-bit)
136dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        __m128i c_255 = _mm_set1_epi16(255);  // 8 copies of 255 (16-bit)
137dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        while (count >= 4) {
138dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Load 4 pixels
139dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            __m128i src_pixel = _mm_loadu_si128(s);
140dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            __m128i dst_pixel = _mm_load_si128(d);
141dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
142dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
143f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org            __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
144dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Shift alphas down to lower 8 bits of each quad.
145dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            __m128i alpha = _mm_srli_epi32(src_pixel, 24);
146dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
147dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Copy alpha to upper 3rd byte of each quad
148dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            alpha = _mm_or_si128(alpha, _mm_slli_epi32(alpha, 16));
149dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
150dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Subtract alphas from 255, to get 0..255
151dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            alpha = _mm_sub_epi16(c_255, alpha);
152dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
153dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Multiply by red and blue by src alpha.
154dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            dst_rb = _mm_mullo_epi16(dst_rb, alpha);
155dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Multiply by alpha and green by src alpha.
156dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            dst_ag = _mm_mullo_epi16(dst_ag, alpha);
157dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
158dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // dst_rb_low = (dst_rb >> 8)
159dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            __m128i dst_rb_low = _mm_srli_epi16(dst_rb, 8);
160dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            __m128i dst_ag_low = _mm_srli_epi16(dst_ag, 8);
161dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
162dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // dst_rb = (dst_rb + dst_rb_low + 128) >> 8
163dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            dst_rb = _mm_add_epi16(dst_rb, dst_rb_low);
164dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            dst_rb = _mm_add_epi16(dst_rb, c_128);
165dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            dst_rb = _mm_srli_epi16(dst_rb, 8);
166dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
167dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // dst_ag = (dst_ag + dst_ag_low + 128) & ag_mask
168dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            dst_ag = _mm_add_epi16(dst_ag, dst_ag_low);
169dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            dst_ag = _mm_add_epi16(dst_ag, c_128);
170dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
171dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
172dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Combine back into RGBA.
173dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            dst_pixel = _mm_or_si128(dst_rb, dst_ag);
174dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
175dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Add result
176dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
177dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            _mm_store_si128(d, result);
178dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            s++;
179dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            d++;
180dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            count -= 4;
181dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        }
182dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org    #else
183dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
184dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        __m128i c_256 = _mm_set1_epi16(0x0100);  // 8 copies of 256 (16-bit)
185dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        while (count >= 4) {
186dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Load 4 pixels
187dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            __m128i src_pixel = _mm_loadu_si128(s);
188dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            __m128i dst_pixel = _mm_load_si128(d);
189dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
190dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
191f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org            __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
192dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
193f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org            // (a0, g0, a1, g1, a2, g2, a3, g3)  (low byte of each word)
194f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org            __m128i alpha = _mm_srli_epi16(src_pixel, 8);
195f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org
196f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org            // (a0, a0, a1, a1, a2, g2, a3, g3)
197f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org            alpha = _mm_shufflehi_epi16(alpha, 0xF5);
198f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org
199f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org            // (a0, a0, a1, a1, a2, a2, a3, a3)
200f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org            alpha = _mm_shufflelo_epi16(alpha, 0xF5);
201dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
202dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Subtract alphas from 256, to get 1..256
203dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            alpha = _mm_sub_epi16(c_256, alpha);
204dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
205dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Multiply by red and blue by src alpha.
206dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            dst_rb = _mm_mullo_epi16(dst_rb, alpha);
207dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Multiply by alpha and green by src alpha.
208dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            dst_ag = _mm_mullo_epi16(dst_ag, alpha);
209dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
210dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Divide by 256.
211dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            dst_rb = _mm_srli_epi16(dst_rb, 8);
212dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
213dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Mask out high bits (already in the right place)
214dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
215dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
216dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Combine back into RGBA.
217dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            dst_pixel = _mm_or_si128(dst_rb, dst_ag);
218dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
219dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Add result
220dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
221dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            _mm_store_si128(d, result);
222dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            s++;
223dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            d++;
224dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            count -= 4;
225dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        }
2269272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org#endif
227dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        src = reinterpret_cast<const SkPMColor*>(s);
228dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        dst = reinterpret_cast<SkPMColor*>(d);
229dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org    }
2309272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org
2319272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org    while (count > 0) {
2329272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org        *dst = SkPMSrcOver(*src, *dst);
2339272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org        src++;
2349272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org        dst++;
2359272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org        count--;
2369272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org    }
2379272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org}
2389272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org
2394e753558fc8cc2f77cbcd46fba80d8612e836a1esenorblanco@chromium.orgvoid S32A_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
2404e753558fc8cc2f77cbcd46fba80d8612e836a1esenorblanco@chromium.org                               const SkPMColor* SK_RESTRICT src,
2414e753558fc8cc2f77cbcd46fba80d8612e836a1esenorblanco@chromium.org                               int count, U8CPU alpha) {
2429272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org    SkASSERT(alpha <= 255);
2439272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org    if (count <= 0) {
2449272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org        return;
2459272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org    }
2469272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org
247dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org    if (count >= 4) {
248dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        while (((size_t)dst & 0x0F) != 0) {
249dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            *dst = SkBlendARGB32(*src, *dst, alpha);
250dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            src++;
251dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            dst++;
252dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            count--;
253dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        }
254dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
255dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        uint32_t src_scale = SkAlpha255To256(alpha);
256dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
257dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        const __m128i *s = reinterpret_cast<const __m128i*>(src);
258dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        __m128i *d = reinterpret_cast<__m128i*>(dst);
25998a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com        __m128i src_scale_wide = _mm_set1_epi16(src_scale << 8);
260dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
261dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        __m128i c_256 = _mm_set1_epi16(256);  // 8 copies of 256 (16-bit)
262dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        while (count >= 4) {
263dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Load 4 pixels each of src and dest.
264dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            __m128i src_pixel = _mm_loadu_si128(s);
265dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            __m128i dst_pixel = _mm_load_si128(d);
266dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
267dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Get red and blue pixels into lower byte of each word.
268dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
269dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
270dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
271dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Get alpha and green into lower byte of each word.
272dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
273dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            __m128i src_ag = _mm_srli_epi16(src_pixel, 8);
274dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
275dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Put per-pixel alpha in low byte of each word.
27698a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com            // After the following two statements, the dst_alpha looks like
27798a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com            // (0, a0, 0, a0, 0, a1, 0, a1, 0, a2, 0, a2, 0, a3, 0, a3)
278dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            __m128i dst_alpha = _mm_shufflehi_epi16(src_ag, 0xF5);
279dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            dst_alpha = _mm_shufflelo_epi16(dst_alpha, 0xF5);
280dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
281dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // dst_alpha = dst_alpha * src_scale
28298a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com            // Because src_scales are in the higher byte of each word and
28398a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com            // we use mulhi here, the resulting alpha values are already
28498a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com            // in the right place and don't need to be divided by 256.
28598a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com            // (0, sa0, 0, sa0, 0, sa1, 0, sa1, 0, sa2, 0, sa2, 0, sa3, 0, sa3)
28698a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com            dst_alpha = _mm_mulhi_epu16(dst_alpha, src_scale_wide);
287dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
288dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Subtract alphas from 256, to get 1..256
289dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            dst_alpha = _mm_sub_epi16(c_256, dst_alpha);
290dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
291dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Multiply red and blue by dst pixel alpha.
292dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            dst_rb = _mm_mullo_epi16(dst_rb, dst_alpha);
293dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Multiply alpha and green by dst pixel alpha.
294dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            dst_ag = _mm_mullo_epi16(dst_ag, dst_alpha);
295dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
296dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Multiply red and blue by global alpha.
29798a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com            // (4 x (0, rs.h, 0, bs.h))
29898a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com            // where rs.h stands for the higher byte of r * src_scale,
29998a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com            // and bs.h the higher byte of b * src_scale.
30098a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com            // Again, because we use mulhi, the resuling red and blue
30198a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com            // values are already in the right place and don't need to
30298a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com            // be divided by 256.
30398a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com            src_rb = _mm_mulhi_epu16(src_rb, src_scale_wide);
304dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Multiply alpha and green by global alpha.
30598a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com            // (4 x (0, as.h, 0, gs.h))
30698a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com            src_ag = _mm_mulhi_epu16(src_ag, src_scale_wide);
307dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
308dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Divide by 256.
309dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            dst_rb = _mm_srli_epi16(dst_rb, 8);
310dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
311dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Mask out low bits (goodies already in the right place; no need to divide)
312dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
31398a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com            // Shift alpha and green to higher byte of each word.
31498a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com            // (4 x (as.h, 0, gs.h, 0))
31598a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com            src_ag = _mm_slli_epi16(src_ag, 8);
316dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
317dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Combine back into RGBA.
318dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            dst_pixel = _mm_or_si128(dst_rb, dst_ag);
319dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            src_pixel = _mm_or_si128(src_rb, src_ag);
320dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
321dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Add two pixels into result.
322dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
323dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            _mm_store_si128(d, result);
324dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            s++;
325dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            d++;
326dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            count -= 4;
327dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        }
328dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        src = reinterpret_cast<const SkPMColor*>(s);
329dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        dst = reinterpret_cast<SkPMColor*>(d);
3309272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org    }
331dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
3329272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org    while (count > 0) {
3339272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org        *dst = SkBlendARGB32(*src, *dst, alpha);
3349272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org        src++;
3359272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org        dst++;
3369272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org        count--;
3379272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org    }
3389272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org}
339c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org
340c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org/* SSE2 version of Color32()
341c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org * portable version is in core/SkBlitRow_D32.cpp
342c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org */
343c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.orgvoid Color32_SSE2(SkPMColor dst[], const SkPMColor src[], int count,
344c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org                  SkPMColor color) {
345c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org
346c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org    if (count <= 0) {
347c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org        return;
348c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org    }
349c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org
350c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org    if (0 == color) {
351c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org        if (src != dst) {
352c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org            memcpy(dst, src, count * sizeof(SkPMColor));
353c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org        }
354c909a1ecadd422d91ff97d10ce08865290223b14reed@google.com        return;
355c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org    }
356c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org
357c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org    unsigned colorA = SkGetPackedA32(color);
358c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org    if (255 == colorA) {
359c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org        sk_memset32(dst, color, count);
360c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org    } else {
361c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org        unsigned scale = 256 - SkAlpha255To256(colorA);
362c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org
363c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org        if (count >= 4) {
364c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org            SkASSERT(((size_t)dst & 0x03) == 0);
365c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org            while (((size_t)dst & 0x0F) != 0) {
366c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org                *dst = color + SkAlphaMulQ(*src, scale);
367c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org                src++;
368c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org                dst++;
369c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org                count--;
370c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org            }
371c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org
372c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org            const __m128i *s = reinterpret_cast<const __m128i*>(src);
373c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org            __m128i *d = reinterpret_cast<__m128i*>(dst);
374c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org            __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
375c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org            __m128i src_scale_wide = _mm_set1_epi16(scale);
376c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org            __m128i color_wide = _mm_set1_epi32(color);
377c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org            while (count >= 4) {
378c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org                // Load 4 pixels each of src and dest.
379c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org                __m128i src_pixel = _mm_loadu_si128(s);
380c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org
381c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org                // Get red and blue pixels into lower byte of each word.
382c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org                __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
383981d4798007b91e2e19c13b171583927a56df63breed@google.com
384c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org                // Get alpha and green into lower byte of each word.
385c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org                __m128i src_ag = _mm_srli_epi16(src_pixel, 8);
386c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org
387c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org                // Multiply by scale.
388c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org                src_rb = _mm_mullo_epi16(src_rb, src_scale_wide);
389c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org                src_ag = _mm_mullo_epi16(src_ag, src_scale_wide);
390c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org
391c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org                // Divide by 256.
392c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org                src_rb = _mm_srli_epi16(src_rb, 8);
393c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org                src_ag = _mm_andnot_si128(rb_mask, src_ag);
394c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org
395c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org                // Combine back into RGBA.
396c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org                src_pixel = _mm_or_si128(src_rb, src_ag);
397c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org
398c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org                // Add color to result.
399c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org                __m128i result = _mm_add_epi8(color_wide, src_pixel);
400c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org
401c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org                // Store result.
402c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org                _mm_store_si128(d, result);
403c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org                s++;
404c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org                d++;
405c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org                count -= 4;
406c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org            }
407c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org            src = reinterpret_cast<const SkPMColor*>(s);
408c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org            dst = reinterpret_cast<SkPMColor*>(d);
409c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org         }
410c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org
411c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org        while (count > 0) {
412c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org            *dst = color + SkAlphaMulQ(*src, scale);
413c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org            src += 1;
414c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org            dst += 1;
415c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org            count--;
416981d4798007b91e2e19c13b171583927a56df63breed@google.com        }
417c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org    }
418c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org}
419981d4798007b91e2e19c13b171583927a56df63breed@google.com
420edb606cb999887d54629f361bcbf57c5fede1bb0reed@google.comvoid SkARGB32_A8_BlitMask_SSE2(void* device, size_t dstRB, const void* maskPtr,
421edb606cb999887d54629f361bcbf57c5fede1bb0reed@google.com                               size_t maskRB, SkColor origColor,
422d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com                               int width, int height) {
423ee467ee79d449ebe6ae7f7946e613cc70a479c69reed@google.com    SkPMColor color = SkPreMultiplyColor(origColor);
424981d4798007b91e2e19c13b171583927a56df63breed@google.com    size_t dstOffset = dstRB - (width << 2);
425981d4798007b91e2e19c13b171583927a56df63breed@google.com    size_t maskOffset = maskRB - width;
426981d4798007b91e2e19c13b171583927a56df63breed@google.com    SkPMColor* dst = (SkPMColor *)device;
427edb606cb999887d54629f361bcbf57c5fede1bb0reed@google.com    const uint8_t* mask = (const uint8_t*)maskPtr;
428981d4798007b91e2e19c13b171583927a56df63breed@google.com    do {
429981d4798007b91e2e19c13b171583927a56df63breed@google.com        int count = width;
430981d4798007b91e2e19c13b171583927a56df63breed@google.com        if (count >= 4) {
431981d4798007b91e2e19c13b171583927a56df63breed@google.com            while (((size_t)dst & 0x0F) != 0 && (count > 0)) {
432981d4798007b91e2e19c13b171583927a56df63breed@google.com                *dst = SkBlendARGB32(color, *dst, *mask);
433981d4798007b91e2e19c13b171583927a56df63breed@google.com                mask++;
434981d4798007b91e2e19c13b171583927a56df63breed@google.com                dst++;
435981d4798007b91e2e19c13b171583927a56df63breed@google.com                count--;
436981d4798007b91e2e19c13b171583927a56df63breed@google.com            }
437981d4798007b91e2e19c13b171583927a56df63breed@google.com            __m128i *d = reinterpret_cast<__m128i*>(dst);
438981d4798007b91e2e19c13b171583927a56df63breed@google.com            __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
439981d4798007b91e2e19c13b171583927a56df63breed@google.com            __m128i c_256 = _mm_set1_epi16(256);
440981d4798007b91e2e19c13b171583927a56df63breed@google.com            __m128i c_1 = _mm_set1_epi16(1);
441981d4798007b91e2e19c13b171583927a56df63breed@google.com            __m128i src_pixel = _mm_set1_epi32(color);
442981d4798007b91e2e19c13b171583927a56df63breed@google.com            while (count >= 4) {
443981d4798007b91e2e19c13b171583927a56df63breed@google.com                // Load 4 pixels each of src and dest.
444981d4798007b91e2e19c13b171583927a56df63breed@google.com                __m128i dst_pixel = _mm_load_si128(d);
445981d4798007b91e2e19c13b171583927a56df63breed@google.com
446981d4798007b91e2e19c13b171583927a56df63breed@google.com                //set the aphla value
447981d4798007b91e2e19c13b171583927a56df63breed@google.com                __m128i src_scale_wide =  _mm_set_epi8(0, *(mask+3),\
448981d4798007b91e2e19c13b171583927a56df63breed@google.com                                0, *(mask+3),0, \
449981d4798007b91e2e19c13b171583927a56df63breed@google.com                                *(mask+2),0, *(mask+2),\
450981d4798007b91e2e19c13b171583927a56df63breed@google.com                                0,*(mask+1), 0,*(mask+1),\
451981d4798007b91e2e19c13b171583927a56df63breed@google.com                                0, *mask,0,*mask);
452981d4798007b91e2e19c13b171583927a56df63breed@google.com
453981d4798007b91e2e19c13b171583927a56df63breed@google.com                //call SkAlpha255To256()
454981d4798007b91e2e19c13b171583927a56df63breed@google.com                src_scale_wide = _mm_add_epi16(src_scale_wide, c_1);
455981d4798007b91e2e19c13b171583927a56df63breed@google.com
456981d4798007b91e2e19c13b171583927a56df63breed@google.com                // Get red and blue pixels into lower byte of each word.
457981d4798007b91e2e19c13b171583927a56df63breed@google.com                __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
458981d4798007b91e2e19c13b171583927a56df63breed@google.com                __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
459981d4798007b91e2e19c13b171583927a56df63breed@google.com
460981d4798007b91e2e19c13b171583927a56df63breed@google.com                // Get alpha and green into lower byte of each word.
461981d4798007b91e2e19c13b171583927a56df63breed@google.com                __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
462981d4798007b91e2e19c13b171583927a56df63breed@google.com                __m128i src_ag = _mm_srli_epi16(src_pixel, 8);
463981d4798007b91e2e19c13b171583927a56df63breed@google.com
464981d4798007b91e2e19c13b171583927a56df63breed@google.com                // Put per-pixel alpha in low byte of each word.
465981d4798007b91e2e19c13b171583927a56df63breed@google.com                __m128i dst_alpha = _mm_shufflehi_epi16(src_ag, 0xF5);
466981d4798007b91e2e19c13b171583927a56df63breed@google.com                dst_alpha = _mm_shufflelo_epi16(dst_alpha, 0xF5);
467981d4798007b91e2e19c13b171583927a56df63breed@google.com
468981d4798007b91e2e19c13b171583927a56df63breed@google.com                // dst_alpha = dst_alpha * src_scale
469981d4798007b91e2e19c13b171583927a56df63breed@google.com                dst_alpha = _mm_mullo_epi16(dst_alpha, src_scale_wide);
470981d4798007b91e2e19c13b171583927a56df63breed@google.com
471981d4798007b91e2e19c13b171583927a56df63breed@google.com                // Divide by 256.
472981d4798007b91e2e19c13b171583927a56df63breed@google.com                dst_alpha = _mm_srli_epi16(dst_alpha, 8);
473981d4798007b91e2e19c13b171583927a56df63breed@google.com
474981d4798007b91e2e19c13b171583927a56df63breed@google.com                // Subtract alphas from 256, to get 1..256
475981d4798007b91e2e19c13b171583927a56df63breed@google.com                dst_alpha = _mm_sub_epi16(c_256, dst_alpha);
476981d4798007b91e2e19c13b171583927a56df63breed@google.com                // Multiply red and blue by dst pixel alpha.
477981d4798007b91e2e19c13b171583927a56df63breed@google.com                dst_rb = _mm_mullo_epi16(dst_rb, dst_alpha);
478981d4798007b91e2e19c13b171583927a56df63breed@google.com                // Multiply alpha and green by dst pixel alpha.
479981d4798007b91e2e19c13b171583927a56df63breed@google.com                dst_ag = _mm_mullo_epi16(dst_ag, dst_alpha);
480981d4798007b91e2e19c13b171583927a56df63breed@google.com
481981d4798007b91e2e19c13b171583927a56df63breed@google.com                // Multiply red and blue by global alpha.
482981d4798007b91e2e19c13b171583927a56df63breed@google.com                src_rb = _mm_mullo_epi16(src_rb, src_scale_wide);
483981d4798007b91e2e19c13b171583927a56df63breed@google.com                // Multiply alpha and green by global alpha.
484981d4798007b91e2e19c13b171583927a56df63breed@google.com                src_ag = _mm_mullo_epi16(src_ag, src_scale_wide);
485981d4798007b91e2e19c13b171583927a56df63breed@google.com                // Divide by 256.
486981d4798007b91e2e19c13b171583927a56df63breed@google.com                dst_rb = _mm_srli_epi16(dst_rb, 8);
487981d4798007b91e2e19c13b171583927a56df63breed@google.com                src_rb = _mm_srli_epi16(src_rb, 8);
488981d4798007b91e2e19c13b171583927a56df63breed@google.com
489981d4798007b91e2e19c13b171583927a56df63breed@google.com                // Mask out low bits (goodies already in the right place; no need to divide)
490981d4798007b91e2e19c13b171583927a56df63breed@google.com                dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
491981d4798007b91e2e19c13b171583927a56df63breed@google.com                src_ag = _mm_andnot_si128(rb_mask, src_ag);
492981d4798007b91e2e19c13b171583927a56df63breed@google.com
493981d4798007b91e2e19c13b171583927a56df63breed@google.com                // Combine back into RGBA.
494981d4798007b91e2e19c13b171583927a56df63breed@google.com                dst_pixel = _mm_or_si128(dst_rb, dst_ag);
495981d4798007b91e2e19c13b171583927a56df63breed@google.com                __m128i tmp_src_pixel = _mm_or_si128(src_rb, src_ag);
496981d4798007b91e2e19c13b171583927a56df63breed@google.com
497981d4798007b91e2e19c13b171583927a56df63breed@google.com                // Add two pixels into result.
498981d4798007b91e2e19c13b171583927a56df63breed@google.com                __m128i result = _mm_add_epi8(tmp_src_pixel, dst_pixel);
499981d4798007b91e2e19c13b171583927a56df63breed@google.com                _mm_store_si128(d, result);
500981d4798007b91e2e19c13b171583927a56df63breed@google.com                // load the next 4 pixel
501981d4798007b91e2e19c13b171583927a56df63breed@google.com                mask = mask + 4;
502981d4798007b91e2e19c13b171583927a56df63breed@google.com                d++;
503981d4798007b91e2e19c13b171583927a56df63breed@google.com                count -= 4;
504981d4798007b91e2e19c13b171583927a56df63breed@google.com            }
505981d4798007b91e2e19c13b171583927a56df63breed@google.com            dst = reinterpret_cast<SkPMColor *>(d);
506981d4798007b91e2e19c13b171583927a56df63breed@google.com        }
507981d4798007b91e2e19c13b171583927a56df63breed@google.com        while(count > 0) {
508981d4798007b91e2e19c13b171583927a56df63breed@google.com            *dst= SkBlendARGB32(color, *dst, *mask);
509981d4798007b91e2e19c13b171583927a56df63breed@google.com            dst += 1;
510981d4798007b91e2e19c13b171583927a56df63breed@google.com            mask++;
511981d4798007b91e2e19c13b171583927a56df63breed@google.com            count --;
512981d4798007b91e2e19c13b171583927a56df63breed@google.com        }
513981d4798007b91e2e19c13b171583927a56df63breed@google.com        dst = (SkPMColor *)((char*)dst + dstOffset);
514981d4798007b91e2e19c13b171583927a56df63breed@google.com        mask += maskOffset;
515981d4798007b91e2e19c13b171583927a56df63breed@google.com    } while (--height != 0);
516981d4798007b91e2e19c13b171583927a56df63breed@google.com}
517d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com
// Signed bit distance that moves the top 5 bits of each RGB16 mask component
// onto the low 5 bits of the matching SkPMColor component. A positive value
// means a left shift, a negative value a right shift.
// The component order of the RGB16 mask may differ from the SkPMColor order,
// so each channel gets its own distance.
#define SK_R16x5_R32x5_SHIFT (SK_R32_SHIFT - SK_R16_SHIFT - SK_R16_BITS + 5)
#define SK_G16x5_G32x5_SHIFT (SK_G32_SHIFT - SK_G16_SHIFT - SK_G16_BITS + 5)
#define SK_B16x5_B32x5_SHIFT (SK_B32_SHIFT - SK_B16_SHIFT - SK_B16_BITS + 5)

// SkPacked?16x5ToUnmasked?32x5_SSE2(x): apply the channel's shift to every
// 32-bit lane of x. The result is not masked; callers isolate the 5 bits.

// Red
#if SK_R16x5_R32x5_SHIFT > 0
    #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (_mm_slli_epi32(x, SK_R16x5_R32x5_SHIFT))
#elif SK_R16x5_R32x5_SHIFT < 0
    #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (_mm_srli_epi32(x, -SK_R16x5_R32x5_SHIFT))
#else
    #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (x)
#endif

// Green
#if SK_G16x5_G32x5_SHIFT > 0
    #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (_mm_slli_epi32(x, SK_G16x5_G32x5_SHIFT))
#elif SK_G16x5_G32x5_SHIFT < 0
    #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (_mm_srli_epi32(x, -SK_G16x5_G32x5_SHIFT))
#else
    #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (x)
#endif

// Blue
#if SK_B16x5_B32x5_SHIFT > 0
    #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_slli_epi32(x, SK_B16x5_B32x5_SHIFT))
#elif SK_B16x5_B32x5_SHIFT < 0
    #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_srli_epi32(x, -SK_B16x5_B32x5_SHIFT))
#else
    #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (x)
#endif
5488cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com
54976e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.orgstatic __m128i SkBlendLCD16_SSE2(__m128i &src, __m128i &dst,
55076e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org                                 __m128i &mask, __m128i &srcA) {
55176e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // In the following comments, the components of src, dst and mask are
55276e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked
55376e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // by an R, G, B, or A suffix. Components of one of the four pixels that
55476e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for
55576e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // example is the blue channel of the second destination pixel. Memory
55676e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // layout is shown for an ARGB byte order in a color value.
55776e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org
55876e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // src and srcA store 8-bit values interleaved with zeros.
55976e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // src  = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
56076e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // srcA = (srcA, 0, srcA, 0, srcA, 0, srcA, 0,
56176e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    //         srcA, 0, srcA, 0, srcA, 0, srcA, 0)
56276e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // mask stores 16-bit values (compressed three channels) interleaved with zeros.
56376e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // Lo and Hi denote the low and high bytes of a 16-bit value, respectively.
56476e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
56576e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    //         m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
56676e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org
567d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com    // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.
56876e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0)
5698cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com    __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask),
5708cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com                              _mm_set1_epi32(0x1F << SK_R32_SHIFT));
5718cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com
57276e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0)
5738cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com    __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask),
5748cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com                              _mm_set1_epi32(0x1F << SK_G32_SHIFT));
575fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com
57676e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B)
5778cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com    __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask),
5788cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com                              _mm_set1_epi32(0x1F << SK_B32_SHIFT));
579fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com
580d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com    // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3)
58176e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an
58276e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // 8-bit position
58376e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // mask = (0, m0R, m0G, m0B, 0, m1R, m1G, m1B,
58476e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    //         0, m2R, m2G, m2B, 0, m3R, m3G, m3B)
585d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com    mask = _mm_or_si128(_mm_or_si128(r, g), b);
586d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com
587fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com    // Interleave R,G,B into the lower byte of word.
58876e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // i.e. split the sixteen 8-bit values from mask into two sets of eight
58976e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // 16-bit values, padded by zero.
590d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com    __m128i maskLo, maskHi;
59176e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // maskLo = (0, 0, m0R, 0, m0G, 0, m0B, 0, 0, 0, m1R, 0, m1G, 0, m1B, 0)
592d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com    maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128());
59376e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // maskHi = (0, 0, m2R, 0, m2G, 0, m2B, 0, 0, 0, m3R, 0, m3G, 0, m3B, 0)
594d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com    maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128());
595d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com
59676e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // Upscale from 0..31 to 0..32
59776e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // (allows to replace division by left-shift further down)
59876e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // Left-shift each component by 4 and add the result back to that component,
59976e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32
600d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com    maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4));
601d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com    maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4));
602d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com
60376e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // Multiply each component of maskLo and maskHi by srcA
60476e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    maskLo = _mm_mullo_epi16(maskLo, srcA);
60576e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    maskHi = _mm_mullo_epi16(maskHi, srcA);
606d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com
60776e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // Left shift mask components by 8 (divide by 256)
608d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com    maskLo = _mm_srli_epi16(maskLo, 8);
609d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com    maskHi = _mm_srli_epi16(maskHi, 8);
610d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com
61176e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // Interleave R,G,B into the lower byte of the word
61276e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // dstLo = (0, 0, d0R, 0, d0G, 0, d0B, 0, 0, 0, d1R, 0, d1G, 0, d1B, 0)
613d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com    __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128());
61476e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // dstLo = (0, 0, d2R, 0, d2G, 0, d2B, 0, 0, 0, d3R, 0, d3G, 0, d3B, 0)
615d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com    __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128());
616d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com
61776e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // mask = (src - dst) * mask
61876e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo));
61976e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi));
620d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com
62176e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // mask = (src - dst) * mask >> 5
622d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com    maskLo = _mm_srai_epi16(maskLo, 5);
623d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com    maskHi = _mm_srai_epi16(maskHi, 5);
624d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com
625d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com    // Add two pixels into result.
62676e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // result = dst + ((src - dst) * mask >> 5)
627d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com    __m128i resultLo = _mm_add_epi16(dstLo, maskLo);
628d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com    __m128i resultHi = _mm_add_epi16(dstHi, maskHi);
629d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com
63076e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // Pack into 4 32bit dst pixels.
63176e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // resultLo and resultHi contain eight 16-bit components (two pixels) each.
63276e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // Merge into one SSE register with sixteen 8-bit values (four pixels),
63376e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // clamping to 255 if necessary.
634d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com    return _mm_packus_epi16(resultLo, resultHi);
635d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com}
636d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com
// Blends four 32-bit destination pixels toward one source color, weighting
// each color channel by a 16-bit LCD mask, and forces the result alpha to 0xFF.
//   src:  source color expanded to 16-bit lanes (layout described below).
//   dst:  four packed 32-bit destination pixels.
//   mask: four 16-bit masks, each zero-extended to 32 bits. Taken by
//         non-const reference and overwritten with the repacked components.
// Returns the four blended pixels packed back into 8-bit components.
63776e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.orgstatic __m128i SkBlendLCD16Opaque_SSE2(__m128i &src, __m128i &dst,
638d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com                                       __m128i &mask) {
63976e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // In the following comments, the components of src, dst and mask are
64076e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked
64176e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // by an R, G, B, or A suffix. Components of one of the four pixels that
64276e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for
64376e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // example is the blue channel of the second destination pixel. Memory
64476e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // layout is shown for an ARGB byte order in a color value.
64576e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org
64676e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // src stores 8-bit values interleaved with zeros.
64776e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // src  = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
64876e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // mask stores 16-bit values (shown as high and low bytes) interleaved with
64976e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // zeros
65076e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
65176e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    //         m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
65276e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org
653d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com    // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.
65476e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0)
6558cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com    __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask),
6568cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com                              _mm_set1_epi32(0x1F << SK_R32_SHIFT));
657d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com
65876e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0)
6598cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com    __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask),
6608cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com                              _mm_set1_epi32(0x1F << SK_G32_SHIFT));
661fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com
66276e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B)
6638cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com    __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask),
6648cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com                              _mm_set1_epi32(0x1F << SK_B32_SHIFT));
665d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com
666d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com    // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3)
66776e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an
66876e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // 8-bit position
66976e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // mask = (0, m0R, m0G, m0B, 0, m1R, m1G, m1B,
67076e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    //         0, m2R, m2G, m2B, 0, m3R, m3G, m3B)
671d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com    mask = _mm_or_si128(_mm_or_si128(r, g), b);
672d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com
673fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com    // Interleave R,G,B into the lower byte of word.
67476e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // i.e. split the sixteen 8-bit values from mask into two sets of eight
67576e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // 16-bit values, padded by zero.
676d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com    __m128i maskLo, maskHi;
67776e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // maskLo = (0, 0, m0R, 0, m0G, 0, m0B, 0, 0, 0, m1R, 0, m1G, 0, m1B, 0)
678d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com    maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128());
67976e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // maskHi = (0, 0, m2R, 0, m2G, 0, m2B, 0, 0, 0, m3R, 0, m3G, 0, m3B, 0)
680d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com    maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128());
681d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com
68276e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // Upscale from 0..31 to 0..32
68376e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // (allows the division to be replaced by a right-shift further down)
68476e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // Right-shift each component by 4 and add the result back to that component,
68576e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32
686d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com    maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4));
687d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com    maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4));
688d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com
68976e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // Interleave R,G,B into the lower byte of the word
69076e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // dstLo = (0, 0, d0R, 0, d0G, 0, d0B, 0, 0, 0, d1R, 0, d1G, 0, d1B, 0)
691d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com    __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128());
69276e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // dstHi = (0, 0, d2R, 0, d2G, 0, d2B, 0, 0, 0, d3R, 0, d3G, 0, d3B, 0)
693d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com    __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128());
694d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com
69576e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // mask = (src - dst) * mask
69676e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo));
69776e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi));
698d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com
69976e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // mask = (src - dst) * mask >> 5
    // (arithmetic shift, because src - dst may be negative)
700d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com    maskLo = _mm_srai_epi16(maskLo, 5);
701d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com    maskHi = _mm_srai_epi16(maskHi, 5);
702d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com
703d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com    // Add two pixels into result.
70476e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // result = dst + ((src - dst) * mask >> 5)
705d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com    __m128i resultLo = _mm_add_epi16(dstLo, maskLo);
706d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com    __m128i resultHi = _mm_add_epi16(dstHi, maskHi);
707d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com
70827123cd59fb6d6fc240d327efd9fc068c0e3a495bungeman@google.com    // Pack into 4 32bit dst pixels and force opaque.
70976e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // resultLo and resultHi contain eight 16-bit components (two pixels) each.
71076e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // Merge into one SSE register with sixteen 8-bit values (four pixels),
71176e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // clamping to 255 if necessary. Set alpha components to 0xFF.
71227123cd59fb6d6fc240d327efd9fc068c0e3a495bungeman@google.com    return _mm_or_si128(_mm_packus_epi16(resultLo, resultHi),
71327123cd59fb6d6fc240d327efd9fc068c0e3a495bungeman@google.com                        _mm_set1_epi32(SK_A32_MASK << SK_A32_SHIFT));
714d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com}
715d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com
// Blends the color src into width pixels of dst, using one 16-bit LCD mask
// per pixel together with the alpha taken from src. The trailing SkPMColor
// argument is unnamed and unused.
// When at least four pixels are requested, leading pixels are blended one at
// a time with SkBlendLCD16() until dst reaches a 16-byte boundary, then four
// at a time with SkBlendLCD16_SSE2(). Leftover pixels are blended one at a
// time again.
71676e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.orgvoid SkBlitLCD16Row_SSE2(SkPMColor dst[], const uint16_t mask[],
71776e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org                         SkColor src, int width, SkPMColor) {
718d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com    if (width <= 0) {
719d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com        return;
720d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com    }
721d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com
72276e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    int srcA = SkColorGetA(src);
72376e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    int srcR = SkColorGetR(src);
72476e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    int srcG = SkColorGetG(src);
72576e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    int srcB = SkColorGetB(src);
726fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com
    // NOTE(review): presumably rescales alpha from 0..255 to 0..256 so the
    // blend can divide by 256 with a shift -- confirm against SkAlpha255To256().
727d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com    srcA = SkAlpha255To256(srcA);
728d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com
729d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com    if (width >= 4) {
        // Blend leading pixels one at a time until dst is 16-byte aligned,
        // which _mm_load_si128()/_mm_store_si128() below require.
730d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com        SkASSERT(((size_t)dst & 0x03) == 0);
731d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com        while (((size_t)dst & 0x0F) != 0) {
73276e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org            *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *mask);
73376e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org            mask++;
734d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com            dst++;
735d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com            width--;
736d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com        }
737d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com
738d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com        __m128i *d = reinterpret_cast<__m128i*>(dst);
73976e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org        // Set alpha to 0xFF and replicate source four times in SSE register.
74076e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org        __m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB));
74176e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org        // Interleave with zeros to get two sets of four 16-bit values.
74276e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org        src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128());
74376e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org        // src_sse=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
74476e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org        // Set srcA_sse to contain eight 16-bit copies of srcA.
74576e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org        __m128i srcA_sse = _mm_set1_epi16(srcA);
746d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com        while (width >= 4) {
74776e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org            // Load four destination pixels into dst_sse.
74876e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org            __m128i dst_sse = _mm_load_si128(d);
74976e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org            // Load four 16-bit masks into lower half of mask_sse.
75076e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org            __m128i mask_sse = _mm_loadl_epi64(
75176e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org                                   reinterpret_cast<const __m128i*>(mask));
75276e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org
75376e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org            // Check whether masks are equal to 0 and get the highest bit
75476e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org            // of each byte of result, if masks are all zero, we will get
755d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com            // pack_cmp to 0xFFFF
            // (the upper eight bytes of mask_sse were zeroed by the 64-bit
            // load, so they always compare equal to zero)
75676e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org            int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse,
757d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com                                             _mm_setzero_si128()));
758d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com
759d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com            // if mask pixels are not all zero, we will blend the dst pixels
760d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com            if (pack_cmp != 0xFFFF) {
761fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com                // Unpack 4 16bit mask pixels to
76276e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org                // mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
76376e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org                //             m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
76476e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org                mask_sse = _mm_unpacklo_epi16(mask_sse,
76576e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org                                              _mm_setzero_si128());
766d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com
767d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com                // Process 4 32bit dst pixels
76876e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org                __m128i result = SkBlendLCD16_SSE2(src_sse, dst_sse,
76976e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org                                                   mask_sse, srcA_sse);
770d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com                _mm_store_si128(d, result);
771d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com            }
772d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com
773d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com            d++;
77476e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org            mask += 4;
775d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com            width -= 4;
776d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com        }
777d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com
778d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com        dst = reinterpret_cast<SkPMColor*>(d);
779d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com    }
780d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com
    // Blend whatever is left (fewer than four pixels, or the whole row when
    // it was shorter than four pixels to begin with) one pixel at a time.
781d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com    while (width > 0) {
78276e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org        *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *mask);
78376e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org        mask++;
784d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com        dst++;
785fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com        width--;
786d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com    }
787d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com}
788d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com
// Blends the color src into width pixels of dst, using one 16-bit LCD mask
// per pixel. The alpha of src is not read here. opaqueDst is only forwarded to
// the scalar SkBlendLCD16Opaque() calls.
// When at least four pixels are requested, leading pixels are blended one at
// a time until dst reaches a 16-byte boundary, then four at a time with
// SkBlendLCD16Opaque_SSE2(). Leftover pixels are blended one at a time again.
78976e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.orgvoid SkBlitLCD16OpaqueRow_SSE2(SkPMColor dst[], const uint16_t mask[],
79076e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org                               SkColor src, int width, SkPMColor opaqueDst) {
791d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com    if (width <= 0) {
792d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com        return;
793d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com    }
794d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com
79576e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    int srcR = SkColorGetR(src);
79676e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    int srcG = SkColorGetG(src);
79776e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    int srcB = SkColorGetB(src);
798d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com
799d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com    if (width >= 4) {
        // Blend leading pixels one at a time until dst is 16-byte aligned,
        // which _mm_load_si128()/_mm_store_si128() below require.
800d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com        SkASSERT(((size_t)dst & 0x03) == 0);
801d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com        while (((size_t)dst & 0x0F) != 0) {
80276e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org            *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst);
80376e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org            mask++;
804d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com            dst++;
805d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com            width--;
806d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com        }
807d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com
808d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com        __m128i *d = reinterpret_cast<__m128i*>(dst);
80976e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org        // Set alpha to 0xFF and replicate source four times in SSE register.
81076e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org        __m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB));
81176e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org        // Interleave with zeros to get two sets of four 16-bit values:
81276e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org        // src_sse=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
81376e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org        src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128());
814d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com        while (width >= 4) {
81576e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org            // Load four destination pixels into dst_sse.
81676e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org            __m128i dst_sse = _mm_load_si128(d);
81776e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org            // Load four 16-bit masks into lower half of mask_sse.
81876e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org            __m128i mask_sse = _mm_loadl_epi64(
81976e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org                                   reinterpret_cast<const __m128i*>(mask));
82076e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org
82176e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org            // Check whether masks are equal to 0 and get the highest bit
82276e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org            // of each byte of result, if masks are all zero, we will get
823d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com            // pack_cmp to 0xFFFF
            // (the upper eight bytes of mask_sse were zeroed by the 64-bit
            // load, so they always compare equal to zero)
82476e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org            int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse,
825d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com                                             _mm_setzero_si128()));
826d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com
827d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com            // if mask pixels are not all zero, we will blend the dst pixels
828d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com            if (pack_cmp != 0xFFFF) {
829fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com                // Unpack 4 16bit mask pixels to
83076e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org                // mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
83176e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org                //             m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
83276e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org                mask_sse = _mm_unpacklo_epi16(mask_sse,
83376e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org                                              _mm_setzero_si128());
834d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com
835d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com                // Process 4 32bit dst pixels
83676e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org                __m128i result = SkBlendLCD16Opaque_SSE2(src_sse, dst_sse,
83776e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org                                                         mask_sse);
838d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com                _mm_store_si128(d, result);
839d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com            }
840d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com
841d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com            d++;
84276e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org            mask += 4;
843d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com            width -= 4;
844d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com        }
845d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com
846d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com        dst = reinterpret_cast<SkPMColor*>(d);
847d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com    }
848d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com
    // Blend whatever is left (fewer than four pixels, or the whole row when
    // it was shorter than four pixels to begin with) one pixel at a time.
849d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com    while (width > 0) {
85076e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org        *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst);
85176e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org        mask++;
852d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com        dst++;
853fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com        width--;
854d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com    }
855d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com}
856475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org
85739ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org/* SSE2 version of S32_D565_Opaque()
85839ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org * portable version is in core/SkBlitRow_D16.cpp
 *
 * Converts count 32-bit pixels read from src into 16-bit pixels written to
 * dst. alpha is only checked by the assert below (it must be 255); the last
 * two int arguments are unnamed and unused.
 * When at least eight pixels are requested, leading pixels are converted one
 * at a time until dst reaches a 16-byte boundary, then eight at a time with
 * SSE2. Leftover pixels are converted one at a time again.
85939ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org */
86039ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.orgvoid S32_D565_Opaque_SSE2(uint16_t* SK_RESTRICT dst,
86139ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org                          const SkPMColor* SK_RESTRICT src, int count,
86239ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org                          U8CPU alpha, int /*x*/, int /*y*/) {
86339ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org    SkASSERT(255 == alpha);
86439ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org
86539ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org    if (count <= 0) {
86639ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org        return;
86739ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org    }
86839ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org
86939ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org    if (count >= 8) {
        // Convert leading pixels one at a time until dst is 16-byte aligned,
        // which _mm_store_si128() below requires.
        // NOTE(review): this loop has no bound on count, so it only
        // terminates safely if dst is 2-byte aligned; nothing here asserts
        // that.
87039ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org        while (((size_t)dst & 0x0F) != 0) {
87139ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org            SkPMColor c = *src++;
87239ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org            SkPMColorAssert(c);
87339ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org
87439ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org            *dst++ = SkPixel32ToPixel16_ToU16(c);
87539ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org            count--;
87639ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org        }
87739ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org
87839ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org        const __m128i* s = reinterpret_cast<const __m128i*>(src);
87939ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org        __m128i* d = reinterpret_cast<__m128i*>(dst);
        // Per-channel 16-bit masks replicated into every 32-bit lane.
88039ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org        __m128i r16_mask = _mm_set1_epi32(SK_R16_MASK);
88139ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org        __m128i g16_mask = _mm_set1_epi32(SK_G16_MASK);
88239ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org        __m128i b16_mask = _mm_set1_epi32(SK_B16_MASK);
88339ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org
88439ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org        while (count >= 8) {
88539ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org            // Load 8 pixels of src.
            // (unaligned loads: only dst was aligned above)
88639ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org            __m128i src_pixel1 = _mm_loadu_si128(s++);
88739ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org            __m128i src_pixel2 = _mm_loadu_si128(s++);
88839ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org
88939ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org            // Calculate result r.
            // Shift each 32-bit pixel so the top SK_R16_BITS bits of its red
            // byte land in the low bits of the lane, mask off the rest, then
            // narrow the eight 32-bit lanes to eight 16-bit lanes.
89039ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org            __m128i r1 = _mm_srli_epi32(src_pixel1,
89139ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org                                        SK_R32_SHIFT + (8 - SK_R16_BITS));
89239ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org            r1 = _mm_and_si128(r1, r16_mask);
89339ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org            __m128i r2 = _mm_srli_epi32(src_pixel2,
89439ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org                                        SK_R32_SHIFT + (8 - SK_R16_BITS));
89539ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org            r2 = _mm_and_si128(r2, r16_mask);
89639ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org            __m128i r = _mm_packs_epi32(r1, r2);
89739ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org
89839ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org            // Calculate result g.
89939ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org            __m128i g1 = _mm_srli_epi32(src_pixel1,
90039ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org                                        SK_G32_SHIFT + (8 - SK_G16_BITS));
90139ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org            g1 = _mm_and_si128(g1, g16_mask);
90239ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org            __m128i g2 = _mm_srli_epi32(src_pixel2,
90339ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org                                        SK_G32_SHIFT + (8 - SK_G16_BITS));
90439ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org            g2 = _mm_and_si128(g2, g16_mask);
90539ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org            __m128i g = _mm_packs_epi32(g1, g2);
90639ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org
90739ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org            // Calculate result b.
90839ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org            __m128i b1 = _mm_srli_epi32(src_pixel1,
90939ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org                                        SK_B32_SHIFT + (8 - SK_B16_BITS));
91039ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org            b1 = _mm_and_si128(b1, b16_mask);
91139ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org            __m128i b2 = _mm_srli_epi32(src_pixel2,
91239ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org                                        SK_B32_SHIFT + (8 - SK_B16_BITS));
91339ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org            b2 = _mm_and_si128(b2, b16_mask);
91439ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org            __m128i b = _mm_packs_epi32(b1, b2);
91539ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org
91639ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org            // Store 8 16-bit colors in dst.
91739ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org            __m128i d_pixel = SkPackRGB16_SSE(r, g, b);
91839ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org            _mm_store_si128(d++, d_pixel);
91939ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org            count -= 8;
92039ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org        }
        // Hand the advanced positions back to the scalar pointers for the tail.
92139ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org        src = reinterpret_cast<const SkPMColor*>(s);
92239ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org        dst = reinterpret_cast<uint16_t*>(d);
92339ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org    }
92439ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org
    // Convert any remaining pixels one at a time.
92539ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org    if (count > 0) {
92639ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org        do {
92739ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org            SkPMColor c = *src++;
92839ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org            SkPMColorAssert(c);
92939ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org            *dst++ = SkPixel32ToPixel16_ToU16(c);
93039ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org        } while (--count != 0);
93139ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org    }
93239ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org}
93339ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org
934475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org/* SSE2 version of S32A_D565_Opaque()
935475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org * portable version is in core/SkBlitRow_D16.cpp
936475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org */
937475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.orgvoid S32A_D565_Opaque_SSE2(uint16_t* SK_RESTRICT dst,
938475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org                           const SkPMColor* SK_RESTRICT src,
939475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org                           int count, U8CPU alpha, int /*x*/, int /*y*/) {
940475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org    SkASSERT(255 == alpha);
941475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org
942475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org    if (count <= 0) {
943475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org        return;
944475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org    }
945475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org
946475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org    if (count >= 8) {
947475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org        // Make dst 16 bytes alignment
948475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org        while (((size_t)dst & 0x0F) != 0) {
949475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            SkPMColor c = *src++;
950475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            if (c) {
951475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org              *dst = SkSrcOver32To16(c, *dst);
952475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            }
953475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            dst += 1;
954475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            count--;
955475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org        }
956475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org
957475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org        const __m128i* s = reinterpret_cast<const __m128i*>(src);
958475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org        __m128i* d = reinterpret_cast<__m128i*>(dst);
959475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org        __m128i var255 = _mm_set1_epi16(255);
960475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org        __m128i r16_mask = _mm_set1_epi16(SK_R16_MASK);
961475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org        __m128i g16_mask = _mm_set1_epi16(SK_G16_MASK);
962475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org        __m128i b16_mask = _mm_set1_epi16(SK_B16_MASK);
963475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org
964475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org        while (count >= 8) {
965475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            // Load 8 pixels of src.
966475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            __m128i src_pixel1 = _mm_loadu_si128(s++);
967475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            __m128i src_pixel2 = _mm_loadu_si128(s++);
968475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org
969475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            // Check whether src pixels are equal to 0 and get the highest bit
970475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            // of each byte of result, if src pixels are all zero, src_cmp1 and
971475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            // src_cmp2 will be 0xFFFF.
972475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            int src_cmp1 = _mm_movemask_epi8(_mm_cmpeq_epi16(src_pixel1,
973475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org                                             _mm_setzero_si128()));
974475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            int src_cmp2 = _mm_movemask_epi8(_mm_cmpeq_epi16(src_pixel2,
975475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org                                             _mm_setzero_si128()));
976475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            if (src_cmp1 == 0xFFFF && src_cmp2 == 0xFFFF) {
977475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org                d++;
978475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org                count -= 8;
979475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org                continue;
980475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            }
981475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org
982475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            // Load 8 pixels of dst.
983475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            __m128i dst_pixel = _mm_load_si128(d);
984475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org
985475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            // Extract A from src.
986475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            __m128i sa1 = _mm_slli_epi32(src_pixel1,(24 - SK_A32_SHIFT));
987475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            sa1 = _mm_srli_epi32(sa1, 24);
988475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            __m128i sa2 = _mm_slli_epi32(src_pixel2,(24 - SK_A32_SHIFT));
989475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            sa2 = _mm_srli_epi32(sa2, 24);
990475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            __m128i sa = _mm_packs_epi32(sa1, sa2);
991475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org
992475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            // Extract R from src.
993475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            __m128i sr1 = _mm_slli_epi32(src_pixel1,(24 - SK_R32_SHIFT));
994475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            sr1 = _mm_srli_epi32(sr1, 24);
995475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            __m128i sr2 = _mm_slli_epi32(src_pixel2,(24 - SK_R32_SHIFT));
996475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            sr2 = _mm_srli_epi32(sr2, 24);
997475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            __m128i sr = _mm_packs_epi32(sr1, sr2);
998475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org
999475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            // Extract G from src.
1000475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            __m128i sg1 = _mm_slli_epi32(src_pixel1,(24 - SK_G32_SHIFT));
1001475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            sg1 = _mm_srli_epi32(sg1, 24);
1002475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            __m128i sg2 = _mm_slli_epi32(src_pixel2,(24 - SK_G32_SHIFT));
1003475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            sg2 = _mm_srli_epi32(sg2, 24);
1004475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            __m128i sg = _mm_packs_epi32(sg1, sg2);
1005475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org
1006475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            // Extract B from src.
1007475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            __m128i sb1 = _mm_slli_epi32(src_pixel1,(24 - SK_B32_SHIFT));
1008475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            sb1 = _mm_srli_epi32(sb1, 24);
1009475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            __m128i sb2 = _mm_slli_epi32(src_pixel2,(24 - SK_B32_SHIFT));
1010475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            sb2 = _mm_srli_epi32(sb2, 24);
1011475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            __m128i sb = _mm_packs_epi32(sb1, sb2);
1012475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org
1013475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            // Extract R G B from dst.
1014475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            __m128i dr = _mm_srli_epi16(dst_pixel,SK_R16_SHIFT);
1015475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            dr = _mm_and_si128(dr, r16_mask);
1016475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            __m128i dg = _mm_srli_epi16(dst_pixel,SK_G16_SHIFT);
1017475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            dg = _mm_and_si128(dg, g16_mask);
1018475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            __m128i db = _mm_srli_epi16(dst_pixel,SK_B16_SHIFT);
1019475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            db = _mm_and_si128(db, b16_mask);
1020475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org
1021475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            __m128i isa = _mm_sub_epi16(var255, sa); // 255 -sa
1022475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org
1023475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            // Calculate R G B of result.
1024475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            // Original algorithm is in SkSrcOver32To16().
1025475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            dr = _mm_add_epi16(sr, SkMul16ShiftRound_SSE(dr, isa, SK_R16_BITS));
1026475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            dr = _mm_srli_epi16(dr, 8 - SK_R16_BITS);
1027475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            dg = _mm_add_epi16(sg, SkMul16ShiftRound_SSE(dg, isa, SK_G16_BITS));
1028475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            dg = _mm_srli_epi16(dg, 8 - SK_G16_BITS);
1029475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            db = _mm_add_epi16(sb, SkMul16ShiftRound_SSE(db, isa, SK_B16_BITS));
1030475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            db = _mm_srli_epi16(db, 8 - SK_B16_BITS);
1031475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org
1032475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            // Pack R G B into 16-bit color.
1033475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            __m128i d_pixel = SkPackRGB16_SSE(dr, dg, db);
1034475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org
1035475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            // Store 8 16-bit colors in dst.
1036475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            _mm_store_si128(d++, d_pixel);
1037475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            count -= 8;
1038475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org        }
1039475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org
1040475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org        src = reinterpret_cast<const SkPMColor*>(s);
1041475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org        dst = reinterpret_cast<uint16_t*>(d);
1042475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org    }
1043475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org
1044475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org    if (count > 0) {
1045475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org        do {
1046475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            SkPMColor c = *src++;
1047475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            SkPMColorAssert(c);
1048475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            if (c) {
1049475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org                *dst = SkSrcOver32To16(c, *dst);
1050475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            }
1051475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            dst += 1;
1052475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org        } while (--count != 0);
1053475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org    }
1054475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org}
1055275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org
1056275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.orgvoid S32_D565_Opaque_Dither_SSE2(uint16_t* SK_RESTRICT dst,
1057275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org                                 const SkPMColor* SK_RESTRICT src,
1058275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org                                 int count, U8CPU alpha, int x, int y) {
1059275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org    SkASSERT(255 == alpha);
1060275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org
1061275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org    if (count <= 0) {
1062275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org        return;
1063275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org    }
1064275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org
1065275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org    if (count >= 8) {
1066275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org        while (((size_t)dst & 0x0F) != 0) {
1067275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org            DITHER_565_SCAN(y);
1068275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org            SkPMColor c = *src++;
1069275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org            SkPMColorAssert(c);
1070275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org
1071275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org            unsigned dither = DITHER_VALUE(x);
1072275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org            *dst++ = SkDitherRGB32To565(c, dither);
1073275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org            DITHER_INC_X(x);
1074275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org            count--;
1075275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org        }
1076275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org
1077275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org        unsigned short dither_value[8];
1078275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org        __m128i dither;
1079275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org#ifdef ENABLE_DITHER_MATRIX_4X4
1080275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org        const uint8_t* dither_scan = gDitherMatrix_3Bit_4X4[(y) & 3];
1081275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org        dither_value[0] = dither_value[4] = dither_scan[(x) & 3];
1082275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org        dither_value[1] = dither_value[5] = dither_scan[(x + 1) & 3];
1083275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org        dither_value[2] = dither_value[6] = dither_scan[(x + 2) & 3];
1084275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org        dither_value[3] = dither_value[7] = dither_scan[(x + 3) & 3];
1085275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org#else
1086275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org        const uint16_t dither_scan = gDitherMatrix_3Bit_16[(y) & 3];
1087275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org        dither_value[0] = dither_value[4] = (dither_scan
1088275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org                                             >> (((x) & 3) << 2)) & 0xF;
1089275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org        dither_value[1] = dither_value[5] = (dither_scan
1090275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org                                             >> (((x + 1) & 3) << 2)) & 0xF;
1091275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org        dither_value[2] = dither_value[6] = (dither_scan
1092275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org                                             >> (((x + 2) & 3) << 2)) & 0xF;
1093275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org        dither_value[3] = dither_value[7] = (dither_scan
1094275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org                                             >> (((x + 3) & 3) << 2)) & 0xF;
1095275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org#endif
1096275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org        dither = _mm_loadu_si128((__m128i*) dither_value);
1097275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org
1098275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org        const __m128i* s = reinterpret_cast<const __m128i*>(src);
1099275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org        __m128i* d = reinterpret_cast<__m128i*>(dst);
1100275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org
1101275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org        while (count >= 8) {
1102275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org            // Load 8 pixels of src.
1103275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org            __m128i src_pixel1 = _mm_loadu_si128(s++);
1104275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org            __m128i src_pixel2 = _mm_loadu_si128(s++);
1105275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org
1106275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org            // Extract R from src.
1107275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org            __m128i sr1 = _mm_slli_epi32(src_pixel1, (24 - SK_R32_SHIFT));
1108275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org            sr1 = _mm_srli_epi32(sr1, 24);
1109275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org            __m128i sr2 = _mm_slli_epi32(src_pixel2, (24 - SK_R32_SHIFT));
1110275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org            sr2 = _mm_srli_epi32(sr2, 24);
1111275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org            __m128i sr = _mm_packs_epi32(sr1, sr2);
1112275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org
1113275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org            // SkDITHER_R32To565(sr, dither)
1114275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org            __m128i sr_offset = _mm_srli_epi16(sr, 5);
1115275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org            sr = _mm_add_epi16(sr, dither);
1116275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org            sr = _mm_sub_epi16(sr, sr_offset);
1117275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org            sr = _mm_srli_epi16(sr, SK_R32_BITS - SK_R16_BITS);
1118275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org
1119275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org            // Extract G from src.
1120275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org            __m128i sg1 = _mm_slli_epi32(src_pixel1, (24 - SK_G32_SHIFT));
1121275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org            sg1 = _mm_srli_epi32(sg1, 24);
1122275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org            __m128i sg2 = _mm_slli_epi32(src_pixel2, (24 - SK_G32_SHIFT));
1123275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org            sg2 = _mm_srli_epi32(sg2, 24);
1124275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org            __m128i sg = _mm_packs_epi32(sg1, sg2);
1125275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org
1126275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org            // SkDITHER_R32To565(sg, dither)
1127275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org            __m128i sg_offset = _mm_srli_epi16(sg, 6);
1128275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org            sg = _mm_add_epi16(sg, _mm_srli_epi16(dither, 1));
1129275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org            sg = _mm_sub_epi16(sg, sg_offset);
1130275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org            sg = _mm_srli_epi16(sg, SK_G32_BITS - SK_G16_BITS);
1131275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org
1132275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org            // Extract B from src.
1133275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org            __m128i sb1 = _mm_slli_epi32(src_pixel1, (24 - SK_B32_SHIFT));
1134275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org            sb1 = _mm_srli_epi32(sb1, 24);
1135275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org            __m128i sb2 = _mm_slli_epi32(src_pixel2, (24 - SK_B32_SHIFT));
1136275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org            sb2 = _mm_srli_epi32(sb2, 24);
1137275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org            __m128i sb = _mm_packs_epi32(sb1, sb2);
1138275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org
1139275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org            // SkDITHER_R32To565(sb, dither)
1140275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org            __m128i sb_offset = _mm_srli_epi16(sb, 5);
1141275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org            sb = _mm_add_epi16(sb, dither);
1142275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org            sb = _mm_sub_epi16(sb, sb_offset);
1143275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org            sb = _mm_srli_epi16(sb, SK_B32_BITS - SK_B16_BITS);
1144275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org
1145275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org            // Pack and store 16-bit dst pixel.
1146275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org            __m128i d_pixel = SkPackRGB16_SSE(sr, sg, sb);
1147275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org            _mm_store_si128(d++, d_pixel);
1148275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org
1149275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org            count -= 8;
1150275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org            x += 8;
1151275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org        }
1152275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org
1153275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org        src = reinterpret_cast<const SkPMColor*>(s);
1154275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org        dst = reinterpret_cast<uint16_t*>(d);
1155275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org    }
1156275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org
1157275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org    if (count > 0) {
1158275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org        DITHER_565_SCAN(y);
1159275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org        do {
1160275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org            SkPMColor c = *src++;
1161275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org            SkPMColorAssert(c);
1162275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org
1163275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org            unsigned dither = DITHER_VALUE(x);
1164275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org            *dst++ = SkDitherRGB32To565(c, dither);
1165275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org            DITHER_INC_X(x);
1166275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org        } while (--count != 0);
1167275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org    }
1168275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org}
1169