SkBlitRow_opts_SSE2.cpp revision 39ce33a1facae795eb2f02e35674702de7eb23b5
19272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org/*
298a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com * Copyright 2012 The Android Open Source Project
3ec3ed6a5ebf6f2c406d7bcf94b6bc34fcaeb976eepoger@google.com *
4ec3ed6a5ebf6f2c406d7bcf94b6bc34fcaeb976eepoger@google.com * Use of this source code is governed by a BSD-style license that can be
5ec3ed6a5ebf6f2c406d7bcf94b6bc34fcaeb976eepoger@google.com * found in the LICENSE file.
69272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org */
79272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org
8ec3ed6a5ebf6f2c406d7bcf94b6bc34fcaeb976eepoger@google.com
94e753558fc8cc2f77cbcd46fba80d8612e836a1esenorblanco@chromium.org#include "SkBlitRow_opts_SSE2.h"
1083ecdc3ac69c9208493c4c3fc8ea9f84b1350535caryclark@google.com#include "SkBitmapProcState_opts_SSE2.h"
119272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org#include "SkColorPriv.h"
12475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org#include "SkColor_opts_SSE2.h"
13c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org#include "SkUtils.h"
149272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org
159272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org#include <emmintrin.h>
169272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org
179272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org/* SSE2 version of S32_Blend_BlitRow32()
189272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org * portable version is in core/SkBlitRow_D32.cpp
199272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org */
204e753558fc8cc2f77cbcd46fba80d8612e836a1esenorblanco@chromium.orgvoid S32_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
214e753558fc8cc2f77cbcd46fba80d8612e836a1esenorblanco@chromium.org                              const SkPMColor* SK_RESTRICT src,
224e753558fc8cc2f77cbcd46fba80d8612e836a1esenorblanco@chromium.org                              int count, U8CPU alpha) {
239272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org    SkASSERT(alpha <= 255);
249272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org    if (count <= 0) {
259272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org        return;
269272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org    }
279272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org
289272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org    uint32_t src_scale = SkAlpha255To256(alpha);
299272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org    uint32_t dst_scale = 256 - src_scale;
309272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org
31dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org    if (count >= 4) {
32dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        SkASSERT(((size_t)dst & 0x03) == 0);
33dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        while (((size_t)dst & 0x0F) != 0) {
34dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
35dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            src++;
36dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            dst++;
37dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            count--;
38dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        }
39dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
40dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        const __m128i *s = reinterpret_cast<const __m128i*>(src);
41dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        __m128i *d = reinterpret_cast<__m128i*>(dst);
42dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
4398a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com        __m128i ag_mask = _mm_set1_epi32(0xFF00FF00);
4498a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com
4598a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com        // Move scale factors to upper byte of word
4698a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com        __m128i src_scale_wide = _mm_set1_epi16(src_scale << 8);
4798a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com        __m128i dst_scale_wide = _mm_set1_epi16(dst_scale << 8);
48dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        while (count >= 4) {
49dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Load 4 pixels each of src and dest.
50dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            __m128i src_pixel = _mm_loadu_si128(s);
51dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            __m128i dst_pixel = _mm_load_si128(d);
52dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
5398a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com            // Interleave Atom port 0/1 operations based on the execution port
5498a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com            // constraints that multiply can only be executed on port 0 (while
5598a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com            // boolean operations can be executed on either port 0 or port 1)
5698a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com            // because GCC currently doesn't do a good job scheduling
5798a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com            // instructions based on these constraints.
5898a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com
59dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Get red and blue pixels into lower byte of each word.
6098a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com            // (0, r, 0, b, 0, r, 0, b, 0, r, 0, b, 0, r, 0, b)
61dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
62dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
6398a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com            // Multiply by scale.
6498a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com            // (4 x (0, rs.h, 0, bs.h))
6598a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com            // where rs.h stands for the higher byte of r * scale, and
6698a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com            // bs.h the higher byte of b * scale.
6798a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com            src_rb = _mm_mulhi_epu16(src_rb, src_scale_wide);
6898a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com
6998a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com            // Get alpha and green pixels into higher byte of each word.
7098a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com            // (a, 0, g, 0, a, 0, g, 0, a, 0, g, 0, a, 0, g, 0)
7198a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com            __m128i src_ag = _mm_and_si128(ag_mask, src_pixel);
72dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
73dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Multiply by scale.
7498a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com            // (4 x (as.h, as.l, gs.h, gs.l))
7598a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com            src_ag = _mm_mulhi_epu16(src_ag, src_scale_wide);
76dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
7798a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com            // Clear the lower byte of the a*scale and g*scale results
7898a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com            // (4 x (as.h, 0, gs.h, 0))
7998a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com            src_ag = _mm_and_si128(src_ag, ag_mask);
8098a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com
8198a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com            // Operations the destination pixels are the same as on the
8298a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com            // source pixels. See the comments above.
8398a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com            __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
8498a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com            dst_rb = _mm_mulhi_epu16(dst_rb, dst_scale_wide);
8598a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com            __m128i dst_ag = _mm_and_si128(ag_mask, dst_pixel);
8698a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com            dst_ag = _mm_mulhi_epu16(dst_ag, dst_scale_wide);
8798a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com            dst_ag = _mm_and_si128(dst_ag, ag_mask);
88dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
89dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Combine back into RGBA.
9098a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com            // (4 x (as.h, rs.h, gs.h, bs.h))
91dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            src_pixel = _mm_or_si128(src_rb, src_ag);
92dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            dst_pixel = _mm_or_si128(dst_rb, dst_ag);
93dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
94dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Add result
95dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
96dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            _mm_store_si128(d, result);
97dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            s++;
98dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            d++;
99dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            count -= 4;
100dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        }
101dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        src = reinterpret_cast<const SkPMColor*>(s);
102dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        dst = reinterpret_cast<SkPMColor*>(d);
1039272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org    }
1049272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org
1054e753558fc8cc2f77cbcd46fba80d8612e836a1esenorblanco@chromium.org    while (count > 0) {
1069272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org        *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
1079272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org        src++;
1089272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org        dst++;
1099272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org        count--;
1109272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org    }
1119272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org}
1129272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org
1134e753558fc8cc2f77cbcd46fba80d8612e836a1esenorblanco@chromium.orgvoid S32A_Opaque_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
1144e753558fc8cc2f77cbcd46fba80d8612e836a1esenorblanco@chromium.org                                const SkPMColor* SK_RESTRICT src,
1154e753558fc8cc2f77cbcd46fba80d8612e836a1esenorblanco@chromium.org                                int count, U8CPU alpha) {
1169272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org    SkASSERT(alpha == 255);
1179272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org    if (count <= 0) {
1189272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org        return;
1199272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org    }
120dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
121dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org    if (count >= 4) {
122dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        SkASSERT(((size_t)dst & 0x03) == 0);
123dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        while (((size_t)dst & 0x0F) != 0) {
124dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            *dst = SkPMSrcOver(*src, *dst);
125dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            src++;
126dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            dst++;
127dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            count--;
128dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        }
129dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
130dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        const __m128i *s = reinterpret_cast<const __m128i*>(src);
131dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        __m128i *d = reinterpret_cast<__m128i*>(dst);
1329272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org#ifdef SK_USE_ACCURATE_BLENDING
133dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
134dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        __m128i c_128 = _mm_set1_epi16(128);  // 8 copies of 128 (16-bit)
135dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        __m128i c_255 = _mm_set1_epi16(255);  // 8 copies of 255 (16-bit)
136dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        while (count >= 4) {
137dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Load 4 pixels
138dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            __m128i src_pixel = _mm_loadu_si128(s);
139dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            __m128i dst_pixel = _mm_load_si128(d);
140dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
141dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
142f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org            __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
143dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Shift alphas down to lower 8 bits of each quad.
144dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            __m128i alpha = _mm_srli_epi32(src_pixel, 24);
145dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
146dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Copy alpha to upper 3rd byte of each quad
147dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            alpha = _mm_or_si128(alpha, _mm_slli_epi32(alpha, 16));
148dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
149dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Subtract alphas from 255, to get 0..255
150dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            alpha = _mm_sub_epi16(c_255, alpha);
151dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
152dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Multiply by red and blue by src alpha.
153dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            dst_rb = _mm_mullo_epi16(dst_rb, alpha);
154dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Multiply by alpha and green by src alpha.
155dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            dst_ag = _mm_mullo_epi16(dst_ag, alpha);
156dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
157dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // dst_rb_low = (dst_rb >> 8)
158dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            __m128i dst_rb_low = _mm_srli_epi16(dst_rb, 8);
159dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            __m128i dst_ag_low = _mm_srli_epi16(dst_ag, 8);
160dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
161dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // dst_rb = (dst_rb + dst_rb_low + 128) >> 8
162dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            dst_rb = _mm_add_epi16(dst_rb, dst_rb_low);
163dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            dst_rb = _mm_add_epi16(dst_rb, c_128);
164dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            dst_rb = _mm_srli_epi16(dst_rb, 8);
165dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
166dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // dst_ag = (dst_ag + dst_ag_low + 128) & ag_mask
167dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            dst_ag = _mm_add_epi16(dst_ag, dst_ag_low);
168dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            dst_ag = _mm_add_epi16(dst_ag, c_128);
169dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
170dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
171dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Combine back into RGBA.
172dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            dst_pixel = _mm_or_si128(dst_rb, dst_ag);
173dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
174dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Add result
175dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
176dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            _mm_store_si128(d, result);
177dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            s++;
178dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            d++;
179dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            count -= 4;
180dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        }
181dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org    #else
182dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
183dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        __m128i c_256 = _mm_set1_epi16(0x0100);  // 8 copies of 256 (16-bit)
184dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        while (count >= 4) {
185dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Load 4 pixels
186dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            __m128i src_pixel = _mm_loadu_si128(s);
187dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            __m128i dst_pixel = _mm_load_si128(d);
188dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
189dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
190f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org            __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
191dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
192f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org            // (a0, g0, a1, g1, a2, g2, a3, g3)  (low byte of each word)
193f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org            __m128i alpha = _mm_srli_epi16(src_pixel, 8);
194f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org
195f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org            // (a0, a0, a1, a1, a2, g2, a3, g3)
196f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org            alpha = _mm_shufflehi_epi16(alpha, 0xF5);
197f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org
198f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org            // (a0, a0, a1, a1, a2, a2, a3, a3)
199f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org            alpha = _mm_shufflelo_epi16(alpha, 0xF5);
200dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
201dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Subtract alphas from 256, to get 1..256
202dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            alpha = _mm_sub_epi16(c_256, alpha);
203dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
204dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Multiply by red and blue by src alpha.
205dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            dst_rb = _mm_mullo_epi16(dst_rb, alpha);
206dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Multiply by alpha and green by src alpha.
207dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            dst_ag = _mm_mullo_epi16(dst_ag, alpha);
208dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
209dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Divide by 256.
210dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            dst_rb = _mm_srli_epi16(dst_rb, 8);
211dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
212dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Mask out high bits (already in the right place)
213dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
214dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
215dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Combine back into RGBA.
216dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            dst_pixel = _mm_or_si128(dst_rb, dst_ag);
217dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
218dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Add result
219dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
220dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            _mm_store_si128(d, result);
221dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            s++;
222dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            d++;
223dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            count -= 4;
224dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        }
2259272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org#endif
226dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        src = reinterpret_cast<const SkPMColor*>(s);
227dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        dst = reinterpret_cast<SkPMColor*>(d);
228dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org    }
2299272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org
2309272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org    while (count > 0) {
2319272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org        *dst = SkPMSrcOver(*src, *dst);
2329272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org        src++;
2339272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org        dst++;
2349272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org        count--;
2359272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org    }
2369272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org}
2379272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org
2384e753558fc8cc2f77cbcd46fba80d8612e836a1esenorblanco@chromium.orgvoid S32A_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
2394e753558fc8cc2f77cbcd46fba80d8612e836a1esenorblanco@chromium.org                               const SkPMColor* SK_RESTRICT src,
2404e753558fc8cc2f77cbcd46fba80d8612e836a1esenorblanco@chromium.org                               int count, U8CPU alpha) {
2419272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org    SkASSERT(alpha <= 255);
2429272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org    if (count <= 0) {
2439272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org        return;
2449272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org    }
2459272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org
246dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org    if (count >= 4) {
247dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        while (((size_t)dst & 0x0F) != 0) {
248dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            *dst = SkBlendARGB32(*src, *dst, alpha);
249dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            src++;
250dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            dst++;
251dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            count--;
252dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        }
253dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
254dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        uint32_t src_scale = SkAlpha255To256(alpha);
255dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
256dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        const __m128i *s = reinterpret_cast<const __m128i*>(src);
257dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        __m128i *d = reinterpret_cast<__m128i*>(dst);
25898a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com        __m128i src_scale_wide = _mm_set1_epi16(src_scale << 8);
259dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
260dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        __m128i c_256 = _mm_set1_epi16(256);  // 8 copies of 256 (16-bit)
261dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        while (count >= 4) {
262dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Load 4 pixels each of src and dest.
263dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            __m128i src_pixel = _mm_loadu_si128(s);
264dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            __m128i dst_pixel = _mm_load_si128(d);
265dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
266dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Get red and blue pixels into lower byte of each word.
267dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
268dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
269dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
270dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Get alpha and green into lower byte of each word.
271dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
272dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            __m128i src_ag = _mm_srli_epi16(src_pixel, 8);
273dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
274dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Put per-pixel alpha in low byte of each word.
27598a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com            // After the following two statements, the dst_alpha looks like
27698a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com            // (0, a0, 0, a0, 0, a1, 0, a1, 0, a2, 0, a2, 0, a3, 0, a3)
277dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            __m128i dst_alpha = _mm_shufflehi_epi16(src_ag, 0xF5);
278dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            dst_alpha = _mm_shufflelo_epi16(dst_alpha, 0xF5);
279dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
280dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // dst_alpha = dst_alpha * src_scale
28198a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com            // Because src_scales are in the higher byte of each word and
28298a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com            // we use mulhi here, the resulting alpha values are already
28398a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com            // in the right place and don't need to be divided by 256.
28498a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com            // (0, sa0, 0, sa0, 0, sa1, 0, sa1, 0, sa2, 0, sa2, 0, sa3, 0, sa3)
28598a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com            dst_alpha = _mm_mulhi_epu16(dst_alpha, src_scale_wide);
286dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
287dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Subtract alphas from 256, to get 1..256
288dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            dst_alpha = _mm_sub_epi16(c_256, dst_alpha);
289dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
290dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Multiply red and blue by dst pixel alpha.
291dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            dst_rb = _mm_mullo_epi16(dst_rb, dst_alpha);
292dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Multiply alpha and green by dst pixel alpha.
293dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            dst_ag = _mm_mullo_epi16(dst_ag, dst_alpha);
294dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
295dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Multiply red and blue by global alpha.
29698a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com            // (4 x (0, rs.h, 0, bs.h))
29798a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com            // where rs.h stands for the higher byte of r * src_scale,
29898a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com            // and bs.h the higher byte of b * src_scale.
30098a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com            // Again, because we use mulhi, the resulting red and blue
30098a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com            // values are already in the right place and don't need to
30198a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com            // be divided by 256.
30298a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com            src_rb = _mm_mulhi_epu16(src_rb, src_scale_wide);
303dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Multiply alpha and green by global alpha.
30498a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com            // (4 x (0, as.h, 0, gs.h))
30598a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com            src_ag = _mm_mulhi_epu16(src_ag, src_scale_wide);
306dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
307dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Divide by 256.
308dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            dst_rb = _mm_srli_epi16(dst_rb, 8);
309dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
310dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Mask out low bits (goodies already in the right place; no need to divide)
311dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
31298a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com            // Shift alpha and green to higher byte of each word.
31398a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com            // (4 x (as.h, 0, gs.h, 0))
31498a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com            src_ag = _mm_slli_epi16(src_ag, 8);
315dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
316dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Combine back into RGBA.
317dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            dst_pixel = _mm_or_si128(dst_rb, dst_ag);
318dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            src_pixel = _mm_or_si128(src_rb, src_ag);
319dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
320dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Add two pixels into result.
321dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
322dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            _mm_store_si128(d, result);
323dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            s++;
324dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            d++;
325dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            count -= 4;
326dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        }
327dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        src = reinterpret_cast<const SkPMColor*>(s);
328dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        dst = reinterpret_cast<SkPMColor*>(d);
3299272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org    }
330dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
3319272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org    while (count > 0) {
3329272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org        *dst = SkBlendARGB32(*src, *dst, alpha);
3339272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org        src++;
3349272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org        dst++;
3359272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org        count--;
3369272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org    }
3379272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org}
338c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org
339c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org/* SSE2 version of Color32()
340c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org * portable version is in core/SkBlitRow_D32.cpp
341c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org */
342c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.orgvoid Color32_SSE2(SkPMColor dst[], const SkPMColor src[], int count,
343c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org                  SkPMColor color) {
344c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org
345c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org    if (count <= 0) {
346c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org        return;
347c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org    }
348c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org
349c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org    if (0 == color) {
350c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org        if (src != dst) {
351c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org            memcpy(dst, src, count * sizeof(SkPMColor));
352c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org        }
353c909a1ecadd422d91ff97d10ce08865290223b14reed@google.com        return;
354c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org    }
355c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org
356c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org    unsigned colorA = SkGetPackedA32(color);
357c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org    if (255 == colorA) {
358c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org        sk_memset32(dst, color, count);
359c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org    } else {
360c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org        unsigned scale = 256 - SkAlpha255To256(colorA);
361c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org
362c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org        if (count >= 4) {
363c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org            SkASSERT(((size_t)dst & 0x03) == 0);
364c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org            while (((size_t)dst & 0x0F) != 0) {
365c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org                *dst = color + SkAlphaMulQ(*src, scale);
366c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org                src++;
367c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org                dst++;
368c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org                count--;
369c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org            }
370c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org
371c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org            const __m128i *s = reinterpret_cast<const __m128i*>(src);
372c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org            __m128i *d = reinterpret_cast<__m128i*>(dst);
373c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org            __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
374c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org            __m128i src_scale_wide = _mm_set1_epi16(scale);
375c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org            __m128i color_wide = _mm_set1_epi32(color);
376c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org            while (count >= 4) {
377c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org                // Load 4 pixels each of src and dest.
378c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org                __m128i src_pixel = _mm_loadu_si128(s);
379c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org
380c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org                // Get red and blue pixels into lower byte of each word.
381c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org                __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
382981d4798007b91e2e19c13b171583927a56df63breed@google.com
383c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org                // Get alpha and green into lower byte of each word.
384c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org                __m128i src_ag = _mm_srli_epi16(src_pixel, 8);
385c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org
386c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org                // Multiply by scale.
387c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org                src_rb = _mm_mullo_epi16(src_rb, src_scale_wide);
388c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org                src_ag = _mm_mullo_epi16(src_ag, src_scale_wide);
389c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org
390c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org                // Divide by 256.
391c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org                src_rb = _mm_srli_epi16(src_rb, 8);
392c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org                src_ag = _mm_andnot_si128(rb_mask, src_ag);
393c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org
394c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org                // Combine back into RGBA.
395c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org                src_pixel = _mm_or_si128(src_rb, src_ag);
396c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org
397c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org                // Add color to result.
398c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org                __m128i result = _mm_add_epi8(color_wide, src_pixel);
399c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org
400c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org                // Store result.
401c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org                _mm_store_si128(d, result);
402c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org                s++;
403c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org                d++;
404c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org                count -= 4;
405c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org            }
406c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org            src = reinterpret_cast<const SkPMColor*>(s);
407c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org            dst = reinterpret_cast<SkPMColor*>(d);
408c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org         }
409c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org
410c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org        while (count > 0) {
411c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org            *dst = color + SkAlphaMulQ(*src, scale);
412c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org            src += 1;
413c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org            dst += 1;
414c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org            count--;
415981d4798007b91e2e19c13b171583927a56df63breed@google.com        }
416c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org    }
417c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org}
418981d4798007b91e2e19c13b171583927a56df63breed@google.com
419edb606cb999887d54629f361bcbf57c5fede1bb0reed@google.comvoid SkARGB32_A8_BlitMask_SSE2(void* device, size_t dstRB, const void* maskPtr,
420edb606cb999887d54629f361bcbf57c5fede1bb0reed@google.com                               size_t maskRB, SkColor origColor,
421d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com                               int width, int height) {
422ee467ee79d449ebe6ae7f7946e613cc70a479c69reed@google.com    SkPMColor color = SkPreMultiplyColor(origColor);
423981d4798007b91e2e19c13b171583927a56df63breed@google.com    size_t dstOffset = dstRB - (width << 2);
424981d4798007b91e2e19c13b171583927a56df63breed@google.com    size_t maskOffset = maskRB - width;
425981d4798007b91e2e19c13b171583927a56df63breed@google.com    SkPMColor* dst = (SkPMColor *)device;
426edb606cb999887d54629f361bcbf57c5fede1bb0reed@google.com    const uint8_t* mask = (const uint8_t*)maskPtr;
427981d4798007b91e2e19c13b171583927a56df63breed@google.com    do {
428981d4798007b91e2e19c13b171583927a56df63breed@google.com        int count = width;
429981d4798007b91e2e19c13b171583927a56df63breed@google.com        if (count >= 4) {
430981d4798007b91e2e19c13b171583927a56df63breed@google.com            while (((size_t)dst & 0x0F) != 0 && (count > 0)) {
431981d4798007b91e2e19c13b171583927a56df63breed@google.com                *dst = SkBlendARGB32(color, *dst, *mask);
432981d4798007b91e2e19c13b171583927a56df63breed@google.com                mask++;
433981d4798007b91e2e19c13b171583927a56df63breed@google.com                dst++;
434981d4798007b91e2e19c13b171583927a56df63breed@google.com                count--;
435981d4798007b91e2e19c13b171583927a56df63breed@google.com            }
436981d4798007b91e2e19c13b171583927a56df63breed@google.com            __m128i *d = reinterpret_cast<__m128i*>(dst);
437981d4798007b91e2e19c13b171583927a56df63breed@google.com            __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
438981d4798007b91e2e19c13b171583927a56df63breed@google.com            __m128i c_256 = _mm_set1_epi16(256);
439981d4798007b91e2e19c13b171583927a56df63breed@google.com            __m128i c_1 = _mm_set1_epi16(1);
440981d4798007b91e2e19c13b171583927a56df63breed@google.com            __m128i src_pixel = _mm_set1_epi32(color);
441981d4798007b91e2e19c13b171583927a56df63breed@google.com            while (count >= 4) {
442981d4798007b91e2e19c13b171583927a56df63breed@google.com                // Load 4 pixels each of src and dest.
443981d4798007b91e2e19c13b171583927a56df63breed@google.com                __m128i dst_pixel = _mm_load_si128(d);
444981d4798007b91e2e19c13b171583927a56df63breed@google.com
445981d4798007b91e2e19c13b171583927a56df63breed@google.com                //set the aphla value
446981d4798007b91e2e19c13b171583927a56df63breed@google.com                __m128i src_scale_wide =  _mm_set_epi8(0, *(mask+3),\
447981d4798007b91e2e19c13b171583927a56df63breed@google.com                                0, *(mask+3),0, \
448981d4798007b91e2e19c13b171583927a56df63breed@google.com                                *(mask+2),0, *(mask+2),\
449981d4798007b91e2e19c13b171583927a56df63breed@google.com                                0,*(mask+1), 0,*(mask+1),\
450981d4798007b91e2e19c13b171583927a56df63breed@google.com                                0, *mask,0,*mask);
451981d4798007b91e2e19c13b171583927a56df63breed@google.com
452981d4798007b91e2e19c13b171583927a56df63breed@google.com                //call SkAlpha255To256()
453981d4798007b91e2e19c13b171583927a56df63breed@google.com                src_scale_wide = _mm_add_epi16(src_scale_wide, c_1);
454981d4798007b91e2e19c13b171583927a56df63breed@google.com
455981d4798007b91e2e19c13b171583927a56df63breed@google.com                // Get red and blue pixels into lower byte of each word.
456981d4798007b91e2e19c13b171583927a56df63breed@google.com                __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
457981d4798007b91e2e19c13b171583927a56df63breed@google.com                __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
458981d4798007b91e2e19c13b171583927a56df63breed@google.com
459981d4798007b91e2e19c13b171583927a56df63breed@google.com                // Get alpha and green into lower byte of each word.
460981d4798007b91e2e19c13b171583927a56df63breed@google.com                __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
461981d4798007b91e2e19c13b171583927a56df63breed@google.com                __m128i src_ag = _mm_srli_epi16(src_pixel, 8);
462981d4798007b91e2e19c13b171583927a56df63breed@google.com
463981d4798007b91e2e19c13b171583927a56df63breed@google.com                // Put per-pixel alpha in low byte of each word.
464981d4798007b91e2e19c13b171583927a56df63breed@google.com                __m128i dst_alpha = _mm_shufflehi_epi16(src_ag, 0xF5);
465981d4798007b91e2e19c13b171583927a56df63breed@google.com                dst_alpha = _mm_shufflelo_epi16(dst_alpha, 0xF5);
466981d4798007b91e2e19c13b171583927a56df63breed@google.com
467981d4798007b91e2e19c13b171583927a56df63breed@google.com                // dst_alpha = dst_alpha * src_scale
468981d4798007b91e2e19c13b171583927a56df63breed@google.com                dst_alpha = _mm_mullo_epi16(dst_alpha, src_scale_wide);
469981d4798007b91e2e19c13b171583927a56df63breed@google.com
470981d4798007b91e2e19c13b171583927a56df63breed@google.com                // Divide by 256.
471981d4798007b91e2e19c13b171583927a56df63breed@google.com                dst_alpha = _mm_srli_epi16(dst_alpha, 8);
472981d4798007b91e2e19c13b171583927a56df63breed@google.com
473981d4798007b91e2e19c13b171583927a56df63breed@google.com                // Subtract alphas from 256, to get 1..256
474981d4798007b91e2e19c13b171583927a56df63breed@google.com                dst_alpha = _mm_sub_epi16(c_256, dst_alpha);
475981d4798007b91e2e19c13b171583927a56df63breed@google.com                // Multiply red and blue by dst pixel alpha.
476981d4798007b91e2e19c13b171583927a56df63breed@google.com                dst_rb = _mm_mullo_epi16(dst_rb, dst_alpha);
477981d4798007b91e2e19c13b171583927a56df63breed@google.com                // Multiply alpha and green by dst pixel alpha.
478981d4798007b91e2e19c13b171583927a56df63breed@google.com                dst_ag = _mm_mullo_epi16(dst_ag, dst_alpha);
479981d4798007b91e2e19c13b171583927a56df63breed@google.com
480981d4798007b91e2e19c13b171583927a56df63breed@google.com                // Multiply red and blue by global alpha.
481981d4798007b91e2e19c13b171583927a56df63breed@google.com                src_rb = _mm_mullo_epi16(src_rb, src_scale_wide);
482981d4798007b91e2e19c13b171583927a56df63breed@google.com                // Multiply alpha and green by global alpha.
483981d4798007b91e2e19c13b171583927a56df63breed@google.com                src_ag = _mm_mullo_epi16(src_ag, src_scale_wide);
484981d4798007b91e2e19c13b171583927a56df63breed@google.com                // Divide by 256.
485981d4798007b91e2e19c13b171583927a56df63breed@google.com                dst_rb = _mm_srli_epi16(dst_rb, 8);
486981d4798007b91e2e19c13b171583927a56df63breed@google.com                src_rb = _mm_srli_epi16(src_rb, 8);
487981d4798007b91e2e19c13b171583927a56df63breed@google.com
488981d4798007b91e2e19c13b171583927a56df63breed@google.com                // Mask out low bits (goodies already in the right place; no need to divide)
489981d4798007b91e2e19c13b171583927a56df63breed@google.com                dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
490981d4798007b91e2e19c13b171583927a56df63breed@google.com                src_ag = _mm_andnot_si128(rb_mask, src_ag);
491981d4798007b91e2e19c13b171583927a56df63breed@google.com
492981d4798007b91e2e19c13b171583927a56df63breed@google.com                // Combine back into RGBA.
493981d4798007b91e2e19c13b171583927a56df63breed@google.com                dst_pixel = _mm_or_si128(dst_rb, dst_ag);
494981d4798007b91e2e19c13b171583927a56df63breed@google.com                __m128i tmp_src_pixel = _mm_or_si128(src_rb, src_ag);
495981d4798007b91e2e19c13b171583927a56df63breed@google.com
496981d4798007b91e2e19c13b171583927a56df63breed@google.com                // Add two pixels into result.
497981d4798007b91e2e19c13b171583927a56df63breed@google.com                __m128i result = _mm_add_epi8(tmp_src_pixel, dst_pixel);
498981d4798007b91e2e19c13b171583927a56df63breed@google.com                _mm_store_si128(d, result);
499981d4798007b91e2e19c13b171583927a56df63breed@google.com                // load the next 4 pixel
500981d4798007b91e2e19c13b171583927a56df63breed@google.com                mask = mask + 4;
501981d4798007b91e2e19c13b171583927a56df63breed@google.com                d++;
502981d4798007b91e2e19c13b171583927a56df63breed@google.com                count -= 4;
503981d4798007b91e2e19c13b171583927a56df63breed@google.com            }
504981d4798007b91e2e19c13b171583927a56df63breed@google.com            dst = reinterpret_cast<SkPMColor *>(d);
505981d4798007b91e2e19c13b171583927a56df63breed@google.com        }
506981d4798007b91e2e19c13b171583927a56df63breed@google.com        while(count > 0) {
507981d4798007b91e2e19c13b171583927a56df63breed@google.com            *dst= SkBlendARGB32(color, *dst, *mask);
508981d4798007b91e2e19c13b171583927a56df63breed@google.com            dst += 1;
509981d4798007b91e2e19c13b171583927a56df63breed@google.com            mask++;
510981d4798007b91e2e19c13b171583927a56df63breed@google.com            count --;
511981d4798007b91e2e19c13b171583927a56df63breed@google.com        }
512981d4798007b91e2e19c13b171583927a56df63breed@google.com        dst = (SkPMColor *)((char*)dst + dstOffset);
513981d4798007b91e2e19c13b171583927a56df63breed@google.com        mask += maskOffset;
514981d4798007b91e2e19c13b171583927a56df63breed@google.com    } while (--height != 0);
515981d4798007b91e2e19c13b171583927a56df63breed@google.com}
516d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com
// Shift amounts that move the top 5 bits of each component of a packed RGB16
// mask value so that they start at that component's shift (SK_x32_SHIFT) in
// an SkPMColor. A positive amount is a left shift, a negative one a right
// shift.
// Note that the mask's RGB16 order may differ from the SkPMColor order.
#define SK_R16x5_R32x5_SHIFT (SK_R32_SHIFT - SK_R16_SHIFT - SK_R16_BITS + 5)
#define SK_G16x5_G32x5_SHIFT (SK_G32_SHIFT - SK_G16_SHIFT - SK_G16_BITS + 5)
#define SK_B16x5_B32x5_SHIFT (SK_B32_SHIFT - SK_B16_SHIFT - SK_B16_BITS + 5)

// Each macro below applies the matching shift to every 32-bit lane of x.
// The result is "unmasked": bits of the other components are still present
// and must be cleared by the caller.

// Red.
#if SK_R16x5_R32x5_SHIFT > 0
    #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (_mm_slli_epi32(x, SK_R16x5_R32x5_SHIFT))
#elif SK_R16x5_R32x5_SHIFT < 0
    #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (_mm_srli_epi32(x, -SK_R16x5_R32x5_SHIFT))
#else
    #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (x)
#endif

// Green.
#if SK_G16x5_G32x5_SHIFT > 0
    #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (_mm_slli_epi32(x, SK_G16x5_G32x5_SHIFT))
#elif SK_G16x5_G32x5_SHIFT < 0
    #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (_mm_srli_epi32(x, -SK_G16x5_G32x5_SHIFT))
#else
    #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (x)
#endif

// Blue.
#if SK_B16x5_B32x5_SHIFT > 0
    #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_slli_epi32(x, SK_B16x5_B32x5_SHIFT))
#elif SK_B16x5_B32x5_SHIFT < 0
    #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_srli_epi32(x, -SK_B16x5_B32x5_SHIFT))
#else
    #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (x)
#endif
5478cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com
54876e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.orgstatic __m128i SkBlendLCD16_SSE2(__m128i &src, __m128i &dst,
54976e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org                                 __m128i &mask, __m128i &srcA) {
55076e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // In the following comments, the components of src, dst and mask are
55176e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked
55276e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // by an R, G, B, or A suffix. Components of one of the four pixels that
55376e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for
55476e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // example is the blue channel of the second destination pixel. Memory
55576e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // layout is shown for an ARGB byte order in a color value.
55676e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org
55776e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // src and srcA store 8-bit values interleaved with zeros.
55876e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // src  = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
55976e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // srcA = (srcA, 0, srcA, 0, srcA, 0, srcA, 0,
56076e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    //         srcA, 0, srcA, 0, srcA, 0, srcA, 0)
56176e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // mask stores 16-bit values (compressed three channels) interleaved with zeros.
56276e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // Lo and Hi denote the low and high bytes of a 16-bit value, respectively.
56376e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
56476e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    //         m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
56576e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org
566d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com    // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.
56776e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0)
5688cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com    __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask),
5698cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com                              _mm_set1_epi32(0x1F << SK_R32_SHIFT));
5708cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com
57176e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0)
5728cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com    __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask),
5738cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com                              _mm_set1_epi32(0x1F << SK_G32_SHIFT));
574fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com
57576e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B)
5768cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com    __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask),
5778cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com                              _mm_set1_epi32(0x1F << SK_B32_SHIFT));
578fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com
579d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com    // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3)
58076e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an
58176e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // 8-bit position
58276e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // mask = (0, m0R, m0G, m0B, 0, m1R, m1G, m1B,
58376e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    //         0, m2R, m2G, m2B, 0, m3R, m3G, m3B)
584d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com    mask = _mm_or_si128(_mm_or_si128(r, g), b);
585d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com
586fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com    // Interleave R,G,B into the lower byte of word.
58776e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // i.e. split the sixteen 8-bit values from mask into two sets of eight
58876e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // 16-bit values, padded by zero.
589d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com    __m128i maskLo, maskHi;
59076e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // maskLo = (0, 0, m0R, 0, m0G, 0, m0B, 0, 0, 0, m1R, 0, m1G, 0, m1B, 0)
591d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com    maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128());
59276e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // maskHi = (0, 0, m2R, 0, m2G, 0, m2B, 0, 0, 0, m3R, 0, m3G, 0, m3B, 0)
593d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com    maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128());
594d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com
59576e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // Upscale from 0..31 to 0..32
59676e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // (allows to replace division by left-shift further down)
59776e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // Left-shift each component by 4 and add the result back to that component,
59876e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32
599d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com    maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4));
600d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com    maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4));
601d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com
60276e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // Multiply each component of maskLo and maskHi by srcA
60376e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    maskLo = _mm_mullo_epi16(maskLo, srcA);
60476e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    maskHi = _mm_mullo_epi16(maskHi, srcA);
605d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com
60676e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // Left shift mask components by 8 (divide by 256)
607d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com    maskLo = _mm_srli_epi16(maskLo, 8);
608d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com    maskHi = _mm_srli_epi16(maskHi, 8);
609d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com
61076e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // Interleave R,G,B into the lower byte of the word
61176e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // dstLo = (0, 0, d0R, 0, d0G, 0, d0B, 0, 0, 0, d1R, 0, d1G, 0, d1B, 0)
612d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com    __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128());
61376e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // dstLo = (0, 0, d2R, 0, d2G, 0, d2B, 0, 0, 0, d3R, 0, d3G, 0, d3B, 0)
614d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com    __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128());
615d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com
61676e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // mask = (src - dst) * mask
61776e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo));
61876e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi));
619d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com
62076e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // mask = (src - dst) * mask >> 5
621d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com    maskLo = _mm_srai_epi16(maskLo, 5);
622d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com    maskHi = _mm_srai_epi16(maskHi, 5);
623d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com
624d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com    // Add two pixels into result.
62576e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // result = dst + ((src - dst) * mask >> 5)
626d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com    __m128i resultLo = _mm_add_epi16(dstLo, maskLo);
627d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com    __m128i resultHi = _mm_add_epi16(dstHi, maskHi);
628d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com
62976e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // Pack into 4 32bit dst pixels.
63076e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // resultLo and resultHi contain eight 16-bit components (two pixels) each.
63176e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // Merge into one SSE register with sixteen 8-bit values (four pixels),
63276e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // clamping to 255 if necessary.
633d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com    return _mm_packus_epi16(resultLo, resultHi);
634d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com}
635d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com
63676e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.orgstatic __m128i SkBlendLCD16Opaque_SSE2(__m128i &src, __m128i &dst,
637d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com                                       __m128i &mask) {
63876e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // In the following comments, the components of src, dst and mask are
63976e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked
64076e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // by an R, G, B, or A suffix. Components of one of the four pixels that
64176e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for
64276e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // example is the blue channel of the second destination pixel. Memory
64376e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // layout is shown for an ARGB byte order in a color value.
64476e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org
64576e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // src and srcA store 8-bit values interleaved with zeros.
64676e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // src  = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
64776e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // mask stores 16-bit values (shown as high and low bytes) interleaved with
64876e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // zeros
64976e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
65076e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    //         m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
65176e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org
652d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com    // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.
65376e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0)
6548cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com    __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask),
6558cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com                              _mm_set1_epi32(0x1F << SK_R32_SHIFT));
656d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com
65776e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0)
6588cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com    __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask),
6598cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com                              _mm_set1_epi32(0x1F << SK_G32_SHIFT));
660fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com
66176e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B)
6628cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com    __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask),
6638cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com                              _mm_set1_epi32(0x1F << SK_B32_SHIFT));
664d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com
665d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com    // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3)
66676e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an
66776e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // 8-bit position
66876e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // mask = (0, m0R, m0G, m0B, 0, m1R, m1G, m1B,
66976e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    //         0, m2R, m2G, m2B, 0, m3R, m3G, m3B)
670d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com    mask = _mm_or_si128(_mm_or_si128(r, g), b);
671d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com
672fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com    // Interleave R,G,B into the lower byte of word.
67376e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // i.e. split the sixteen 8-bit values from mask into two sets of eight
67476e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // 16-bit values, padded by zero.
675d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com    __m128i maskLo, maskHi;
67676e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // maskLo = (0, 0, m0R, 0, m0G, 0, m0B, 0, 0, 0, m1R, 0, m1G, 0, m1B, 0)
677d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com    maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128());
67876e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // maskHi = (0, 0, m2R, 0, m2G, 0, m2B, 0, 0, 0, m3R, 0, m3G, 0, m3B, 0)
679d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com    maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128());
680d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com
68176e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // Upscale from 0..31 to 0..32
68276e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // (allows to replace division by left-shift further down)
68376e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // Left-shift each component by 4 and add the result back to that component,
68476e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32
685d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com    maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4));
686d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com    maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4));
687d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com
68876e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // Interleave R,G,B into the lower byte of the word
68976e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // dstLo = (0, 0, d0R, 0, d0G, 0, d0B, 0, 0, 0, d1R, 0, d1G, 0, d1B, 0)
690d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com    __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128());
69176e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // dstLo = (0, 0, d2R, 0, d2G, 0, d2B, 0, 0, 0, d3R, 0, d3G, 0, d3B, 0)
692d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com    __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128());
693d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com
69476e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // mask = (src - dst) * mask
69576e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo));
69676e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi));
697d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com
69876e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // mask = (src - dst) * mask >> 5
699d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com    maskLo = _mm_srai_epi16(maskLo, 5);
700d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com    maskHi = _mm_srai_epi16(maskHi, 5);
701d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com
702d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com    // Add two pixels into result.
70376e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // result = dst + ((src - dst) * mask >> 5)
704d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com    __m128i resultLo = _mm_add_epi16(dstLo, maskLo);
705d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com    __m128i resultHi = _mm_add_epi16(dstHi, maskHi);
706d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com
70727123cd59fb6d6fc240d327efd9fc068c0e3a495bungeman@google.com    // Pack into 4 32bit dst pixels and force opaque.
70876e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // resultLo and resultHi contain eight 16-bit components (two pixels) each.
70976e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // Merge into one SSE regsiter with sixteen 8-bit values (four pixels),
71076e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // clamping to 255 if necessary. Set alpha components to 0xFF.
71127123cd59fb6d6fc240d327efd9fc068c0e3a495bungeman@google.com    return _mm_or_si128(_mm_packus_epi16(resultLo, resultHi),
71227123cd59fb6d6fc240d327efd9fc068c0e3a495bungeman@google.com                        _mm_set1_epi32(SK_A32_MASK << SK_A32_SHIFT));
713d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com}
714d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com
71576e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.orgvoid SkBlitLCD16Row_SSE2(SkPMColor dst[], const uint16_t mask[],
71676e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org                         SkColor src, int width, SkPMColor) {
717d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com    if (width <= 0) {
718d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com        return;
719d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com    }
720d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com
72176e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    int srcA = SkColorGetA(src);
72276e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    int srcR = SkColorGetR(src);
72376e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    int srcG = SkColorGetG(src);
72476e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    int srcB = SkColorGetB(src);
725fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com
726d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com    srcA = SkAlpha255To256(srcA);
727d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com
728d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com    if (width >= 4) {
729d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com        SkASSERT(((size_t)dst & 0x03) == 0);
730d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com        while (((size_t)dst & 0x0F) != 0) {
73176e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org            *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *mask);
73276e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org            mask++;
733d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com            dst++;
734d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com            width--;
735d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com        }
736d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com
737d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com        __m128i *d = reinterpret_cast<__m128i*>(dst);
73876e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org        // Set alpha to 0xFF and replicate source four times in SSE register.
73976e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org        __m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB));
74076e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org        // Interleave with zeros to get two sets of four 16-bit values.
74176e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org        src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128());
74276e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org        // Set srcA_sse to contain eight copies of srcA, padded with zero.
74376e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org        // src_sse=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
74476e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org        __m128i srcA_sse = _mm_set1_epi16(srcA);
745d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com        while (width >= 4) {
74676e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org            // Load four destination pixels into dst_sse.
74776e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org            __m128i dst_sse = _mm_load_si128(d);
74876e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org            // Load four 16-bit masks into lower half of mask_sse.
74976e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org            __m128i mask_sse = _mm_loadl_epi64(
75076e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org                                   reinterpret_cast<const __m128i*>(mask));
75176e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org
75276e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org            // Check whether masks are equal to 0 and get the highest bit
75376e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org            // of each byte of result, if masks are all zero, we will get
754d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com            // pack_cmp to 0xFFFF
75576e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org            int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse,
756d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com                                             _mm_setzero_si128()));
757d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com
758d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com            // if mask pixels are not all zero, we will blend the dst pixels
759d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com            if (pack_cmp != 0xFFFF) {
760fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com                // Unpack 4 16bit mask pixels to
76176e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org                // mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
76276e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org                //             m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
76376e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org                mask_sse = _mm_unpacklo_epi16(mask_sse,
76476e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org                                              _mm_setzero_si128());
765d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com
766d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com                // Process 4 32bit dst pixels
76776e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org                __m128i result = SkBlendLCD16_SSE2(src_sse, dst_sse,
76876e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org                                                   mask_sse, srcA_sse);
769d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com                _mm_store_si128(d, result);
770d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com            }
771d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com
772d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com            d++;
77376e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org            mask += 4;
774d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com            width -= 4;
775d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com        }
776d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com
777d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com        dst = reinterpret_cast<SkPMColor*>(d);
778d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com    }
779d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com
780d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com    while (width > 0) {
78176e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org        *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *mask);
78276e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org        mask++;
783d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com        dst++;
784fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com        width--;
785d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com    }
786d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com}
787d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com
78876e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.orgvoid SkBlitLCD16OpaqueRow_SSE2(SkPMColor dst[], const uint16_t mask[],
78976e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org                               SkColor src, int width, SkPMColor opaqueDst) {
790d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com    if (width <= 0) {
791d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com        return;
792d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com    }
793d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com
79476e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    int srcR = SkColorGetR(src);
79576e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    int srcG = SkColorGetG(src);
79676e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    int srcB = SkColorGetB(src);
797d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com
798d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com    if (width >= 4) {
799d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com        SkASSERT(((size_t)dst & 0x03) == 0);
800d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com        while (((size_t)dst & 0x0F) != 0) {
80176e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org            *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst);
80276e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org            mask++;
803d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com            dst++;
804d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com            width--;
805d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com        }
806d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com
807d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com        __m128i *d = reinterpret_cast<__m128i*>(dst);
80876e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org        // Set alpha to 0xFF and replicate source four times in SSE register.
80976e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org        __m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB));
81076e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org        // Set srcA_sse to contain eight copies of srcA, padded with zero.
81176e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org        // src_sse=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
81276e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org        src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128());
813d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com        while (width >= 4) {
81476e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org            // Load four destination pixels into dst_sse.
81576e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org            __m128i dst_sse = _mm_load_si128(d);
81676e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org            // Load four 16-bit masks into lower half of mask_sse.
81776e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org            __m128i mask_sse = _mm_loadl_epi64(
81876e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org                                   reinterpret_cast<const __m128i*>(mask));
81976e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org
82076e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org            // Check whether masks are equal to 0 and get the highest bit
82176e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org            // of each byte of result, if masks are all zero, we will get
822d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com            // pack_cmp to 0xFFFF
82376e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org            int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse,
824d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com                                             _mm_setzero_si128()));
825d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com
826d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com            // if mask pixels are not all zero, we will blend the dst pixels
827d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com            if (pack_cmp != 0xFFFF) {
828fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com                // Unpack 4 16bit mask pixels to
82976e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org                // mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
83076e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org                //             m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
83176e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org                mask_sse = _mm_unpacklo_epi16(mask_sse,
83276e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org                                              _mm_setzero_si128());
833d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com
834d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com                // Process 4 32bit dst pixels
83576e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org                __m128i result = SkBlendLCD16Opaque_SSE2(src_sse, dst_sse,
83676e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org                                                         mask_sse);
837d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com                _mm_store_si128(d, result);
838d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com            }
839d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com
840d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com            d++;
84176e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org            mask += 4;
842d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com            width -= 4;
843d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com        }
844d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com
845d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com        dst = reinterpret_cast<SkPMColor*>(d);
846d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com    }
847d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com
848d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com    while (width > 0) {
84976e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org        *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst);
85076e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org        mask++;
851d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com        dst++;
852fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com        width--;
853d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com    }
854d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com}
855475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org
85639ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org/* SSE2 version of S32_D565_Opaque()
85739ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org * portable version is in core/SkBlitRow_D16.cpp
85839ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org */
85939ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.orgvoid S32_D565_Opaque_SSE2(uint16_t* SK_RESTRICT dst,
86039ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org                          const SkPMColor* SK_RESTRICT src, int count,
86139ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org                          U8CPU alpha, int /*x*/, int /*y*/) {
86239ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org    SkASSERT(255 == alpha);
86339ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org
86439ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org    if (count <= 0) {
86539ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org        return;
86639ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org    }
86739ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org
86839ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org    if (count >= 8) {
86939ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org        while (((size_t)dst & 0x0F) != 0) {
87039ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org            SkPMColor c = *src++;
87139ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org            SkPMColorAssert(c);
87239ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org
87339ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org            *dst++ = SkPixel32ToPixel16_ToU16(c);
87439ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org            count--;
87539ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org        }
87639ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org
87739ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org        const __m128i* s = reinterpret_cast<const __m128i*>(src);
87839ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org        __m128i* d = reinterpret_cast<__m128i*>(dst);
87939ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org        __m128i r16_mask = _mm_set1_epi32(SK_R16_MASK);
88039ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org        __m128i g16_mask = _mm_set1_epi32(SK_G16_MASK);
88139ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org        __m128i b16_mask = _mm_set1_epi32(SK_B16_MASK);
88239ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org
88339ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org        while (count >= 8) {
88439ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org            // Load 8 pixels of src.
88539ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org            __m128i src_pixel1 = _mm_loadu_si128(s++);
88639ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org            __m128i src_pixel2 = _mm_loadu_si128(s++);
88739ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org
88839ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org            // Calculate result r.
88939ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org            __m128i r1 = _mm_srli_epi32(src_pixel1,
89039ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org                                        SK_R32_SHIFT + (8 - SK_R16_BITS));
89139ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org            r1 = _mm_and_si128(r1, r16_mask);
89239ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org            __m128i r2 = _mm_srli_epi32(src_pixel2,
89339ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org                                        SK_R32_SHIFT + (8 - SK_R16_BITS));
89439ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org            r2 = _mm_and_si128(r2, r16_mask);
89539ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org            __m128i r = _mm_packs_epi32(r1, r2);
89639ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org
89739ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org            // Calculate result g.
89839ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org            __m128i g1 = _mm_srli_epi32(src_pixel1,
89939ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org                                        SK_G32_SHIFT + (8 - SK_G16_BITS));
90039ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org            g1 = _mm_and_si128(g1, g16_mask);
90139ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org            __m128i g2 = _mm_srli_epi32(src_pixel2,
90239ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org                                        SK_G32_SHIFT + (8 - SK_G16_BITS));
90339ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org            g2 = _mm_and_si128(g2, g16_mask);
90439ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org            __m128i g = _mm_packs_epi32(g1, g2);
90539ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org
90639ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org            // Calculate result b.
90739ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org            __m128i b1 = _mm_srli_epi32(src_pixel1,
90839ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org                                        SK_B32_SHIFT + (8 - SK_B16_BITS));
90939ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org            b1 = _mm_and_si128(b1, b16_mask);
91039ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org            __m128i b2 = _mm_srli_epi32(src_pixel2,
91139ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org                                        SK_B32_SHIFT + (8 - SK_B16_BITS));
91239ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org            b2 = _mm_and_si128(b2, b16_mask);
91339ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org            __m128i b = _mm_packs_epi32(b1, b2);
91439ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org
91539ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org            // Store 8 16-bit colors in dst.
91639ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org            __m128i d_pixel = SkPackRGB16_SSE(r, g, b);
91739ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org            _mm_store_si128(d++, d_pixel);
91839ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org            count -= 8;
91939ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org        }
92039ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org        src = reinterpret_cast<const SkPMColor*>(s);
92139ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org        dst = reinterpret_cast<uint16_t*>(d);
92239ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org    }
92339ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org
92439ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org    if (count > 0) {
92539ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org        do {
92639ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org            SkPMColor c = *src++;
92739ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org            SkPMColorAssert(c);
92839ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org            *dst++ = SkPixel32ToPixel16_ToU16(c);
92939ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org        } while (--count != 0);
93039ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org    }
93139ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org}
93239ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org
933475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org/* SSE2 version of S32A_D565_Opaque()
934475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org * portable version is in core/SkBlitRow_D16.cpp
935475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org */
936475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.orgvoid S32A_D565_Opaque_SSE2(uint16_t* SK_RESTRICT dst,
937475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org                           const SkPMColor* SK_RESTRICT src,
938475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org                           int count, U8CPU alpha, int /*x*/, int /*y*/) {
939475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org    SkASSERT(255 == alpha);
940475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org
941475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org    if (count <= 0) {
942475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org        return;
943475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org    }
944475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org
945475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org    if (count >= 8) {
946475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org        // Make dst 16 bytes alignment
947475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org        while (((size_t)dst & 0x0F) != 0) {
948475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            SkPMColor c = *src++;
949475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            if (c) {
950475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org              *dst = SkSrcOver32To16(c, *dst);
951475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            }
952475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            dst += 1;
953475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            count--;
954475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org        }
955475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org
956475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org        const __m128i* s = reinterpret_cast<const __m128i*>(src);
957475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org        __m128i* d = reinterpret_cast<__m128i*>(dst);
958475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org        __m128i var255 = _mm_set1_epi16(255);
959475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org        __m128i r16_mask = _mm_set1_epi16(SK_R16_MASK);
960475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org        __m128i g16_mask = _mm_set1_epi16(SK_G16_MASK);
961475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org        __m128i b16_mask = _mm_set1_epi16(SK_B16_MASK);
962475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org
963475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org        while (count >= 8) {
964475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            // Load 8 pixels of src.
965475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            __m128i src_pixel1 = _mm_loadu_si128(s++);
966475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            __m128i src_pixel2 = _mm_loadu_si128(s++);
967475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org
968475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            // Check whether src pixels are equal to 0 and get the highest bit
969475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            // of each byte of result, if src pixels are all zero, src_cmp1 and
970475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            // src_cmp2 will be 0xFFFF.
971475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            int src_cmp1 = _mm_movemask_epi8(_mm_cmpeq_epi16(src_pixel1,
972475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org                                             _mm_setzero_si128()));
973475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            int src_cmp2 = _mm_movemask_epi8(_mm_cmpeq_epi16(src_pixel2,
974475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org                                             _mm_setzero_si128()));
975475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            if (src_cmp1 == 0xFFFF && src_cmp2 == 0xFFFF) {
976475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org                d++;
977475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org                count -= 8;
978475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org                continue;
979475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            }
980475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org
981475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            // Load 8 pixels of dst.
982475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            __m128i dst_pixel = _mm_load_si128(d);
983475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org
984475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            // Extract A from src.
985475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            __m128i sa1 = _mm_slli_epi32(src_pixel1,(24 - SK_A32_SHIFT));
986475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            sa1 = _mm_srli_epi32(sa1, 24);
987475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            __m128i sa2 = _mm_slli_epi32(src_pixel2,(24 - SK_A32_SHIFT));
988475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            sa2 = _mm_srli_epi32(sa2, 24);
989475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            __m128i sa = _mm_packs_epi32(sa1, sa2);
990475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org
991475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            // Extract R from src.
992475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            __m128i sr1 = _mm_slli_epi32(src_pixel1,(24 - SK_R32_SHIFT));
993475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            sr1 = _mm_srli_epi32(sr1, 24);
994475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            __m128i sr2 = _mm_slli_epi32(src_pixel2,(24 - SK_R32_SHIFT));
995475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            sr2 = _mm_srli_epi32(sr2, 24);
996475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            __m128i sr = _mm_packs_epi32(sr1, sr2);
997475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org
998475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            // Extract G from src.
999475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            __m128i sg1 = _mm_slli_epi32(src_pixel1,(24 - SK_G32_SHIFT));
1000475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            sg1 = _mm_srli_epi32(sg1, 24);
1001475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            __m128i sg2 = _mm_slli_epi32(src_pixel2,(24 - SK_G32_SHIFT));
1002475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            sg2 = _mm_srli_epi32(sg2, 24);
1003475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            __m128i sg = _mm_packs_epi32(sg1, sg2);
1004475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org
1005475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            // Extract B from src.
1006475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            __m128i sb1 = _mm_slli_epi32(src_pixel1,(24 - SK_B32_SHIFT));
1007475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            sb1 = _mm_srli_epi32(sb1, 24);
1008475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            __m128i sb2 = _mm_slli_epi32(src_pixel2,(24 - SK_B32_SHIFT));
1009475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            sb2 = _mm_srli_epi32(sb2, 24);
1010475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            __m128i sb = _mm_packs_epi32(sb1, sb2);
1011475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org
1012475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            // Extract R G B from dst.
1013475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            __m128i dr = _mm_srli_epi16(dst_pixel,SK_R16_SHIFT);
1014475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            dr = _mm_and_si128(dr, r16_mask);
1015475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            __m128i dg = _mm_srli_epi16(dst_pixel,SK_G16_SHIFT);
1016475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            dg = _mm_and_si128(dg, g16_mask);
1017475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            __m128i db = _mm_srli_epi16(dst_pixel,SK_B16_SHIFT);
1018475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            db = _mm_and_si128(db, b16_mask);
1019475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org
1020475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            __m128i isa = _mm_sub_epi16(var255, sa); // 255 -sa
1021475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org
1022475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            // Calculate R G B of result.
1023475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            // Original algorithm is in SkSrcOver32To16().
1024475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            dr = _mm_add_epi16(sr, SkMul16ShiftRound_SSE(dr, isa, SK_R16_BITS));
1025475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            dr = _mm_srli_epi16(dr, 8 - SK_R16_BITS);
1026475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            dg = _mm_add_epi16(sg, SkMul16ShiftRound_SSE(dg, isa, SK_G16_BITS));
1027475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            dg = _mm_srli_epi16(dg, 8 - SK_G16_BITS);
1028475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            db = _mm_add_epi16(sb, SkMul16ShiftRound_SSE(db, isa, SK_B16_BITS));
1029475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            db = _mm_srli_epi16(db, 8 - SK_B16_BITS);
1030475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org
1031475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            // Pack R G B into 16-bit color.
1032475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            __m128i d_pixel = SkPackRGB16_SSE(dr, dg, db);
1033475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org
1034475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            // Store 8 16-bit colors in dst.
1035475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            _mm_store_si128(d++, d_pixel);
1036475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            count -= 8;
1037475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org        }
1038475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org
1039475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org        src = reinterpret_cast<const SkPMColor*>(s);
1040475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org        dst = reinterpret_cast<uint16_t*>(d);
1041475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org    }
1042475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org
1043475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org    if (count > 0) {
1044475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org        do {
1045475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            SkPMColor c = *src++;
1046475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            SkPMColorAssert(c);
1047475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            if (c) {
1048475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org                *dst = SkSrcOver32To16(c, *dst);
1049475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            }
1050475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            dst += 1;
1051475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org        } while (--count != 0);
1052475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org    }
1053475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org}
1054