1f3a2fe6b359a2168af286156d4d5c2b20dda5b76senorblanco@chromium.org/*
23838d305d71baa7591e4b854e3c164e3f051b9b8tomhudson@google.com * Copyright 2012 The Android Open Source Project
3685cfc0ee13d7c355ae2f4f3d225ad45e945763fepoger@google.com *
4685cfc0ee13d7c355ae2f4f3d225ad45e945763fepoger@google.com * Use of this source code is governed by a BSD-style license that can be
5685cfc0ee13d7c355ae2f4f3d225ad45e945763fepoger@google.com * found in the LICENSE file.
6f3a2fe6b359a2168af286156d4d5c2b20dda5b76senorblanco@chromium.org */
7f3a2fe6b359a2168af286156d4d5c2b20dda5b76senorblanco@chromium.org
8685cfc0ee13d7c355ae2f4f3d225ad45e945763fepoger@google.com
997c06343a254f88ccfcdaaee157e40f3340320a6senorblanco@chromium.org#include "SkBlitRow_opts_SSE2.h"
10e642bdf5215599ce5d4a6f43502b396c3b9aef36caryclark@google.com#include "SkBitmapProcState_opts_SSE2.h"
11f3a2fe6b359a2168af286156d4d5c2b20dda5b76senorblanco@chromium.org#include "SkColorPriv.h"
121b5fde98ede97ec9dd5de4213bc9a79e91385046senorblanco@chromium.org#include "SkUtils.h"
13f3a2fe6b359a2168af286156d4d5c2b20dda5b76senorblanco@chromium.org
14f3a2fe6b359a2168af286156d4d5c2b20dda5b76senorblanco@chromium.org#include <emmintrin.h>
15f3a2fe6b359a2168af286156d4d5c2b20dda5b76senorblanco@chromium.org
16f3a2fe6b359a2168af286156d4d5c2b20dda5b76senorblanco@chromium.org/* SSE2 version of S32_Blend_BlitRow32()
17f3a2fe6b359a2168af286156d4d5c2b20dda5b76senorblanco@chromium.org * portable version is in core/SkBlitRow_D32.cpp
18f3a2fe6b359a2168af286156d4d5c2b20dda5b76senorblanco@chromium.org */
1997c06343a254f88ccfcdaaee157e40f3340320a6senorblanco@chromium.orgvoid S32_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
2097c06343a254f88ccfcdaaee157e40f3340320a6senorblanco@chromium.org                              const SkPMColor* SK_RESTRICT src,
2197c06343a254f88ccfcdaaee157e40f3340320a6senorblanco@chromium.org                              int count, U8CPU alpha) {
22f3a2fe6b359a2168af286156d4d5c2b20dda5b76senorblanco@chromium.org    SkASSERT(alpha <= 255);
23f3a2fe6b359a2168af286156d4d5c2b20dda5b76senorblanco@chromium.org    if (count <= 0) {
24f3a2fe6b359a2168af286156d4d5c2b20dda5b76senorblanco@chromium.org        return;
25f3a2fe6b359a2168af286156d4d5c2b20dda5b76senorblanco@chromium.org    }
26f3a2fe6b359a2168af286156d4d5c2b20dda5b76senorblanco@chromium.org
27f3a2fe6b359a2168af286156d4d5c2b20dda5b76senorblanco@chromium.org    uint32_t src_scale = SkAlpha255To256(alpha);
28f3a2fe6b359a2168af286156d4d5c2b20dda5b76senorblanco@chromium.org    uint32_t dst_scale = 256 - src_scale;
29f3a2fe6b359a2168af286156d4d5c2b20dda5b76senorblanco@chromium.org
301d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org    if (count >= 4) {
311d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org        SkASSERT(((size_t)dst & 0x03) == 0);
321d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org        while (((size_t)dst & 0x0F) != 0) {
331d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org            *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
341d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org            src++;
351d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org            dst++;
361d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org            count--;
371d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org        }
381d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org
391d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org        const __m128i *s = reinterpret_cast<const __m128i*>(src);
401d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org        __m128i *d = reinterpret_cast<__m128i*>(dst);
411d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org        __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
423838d305d71baa7591e4b854e3c164e3f051b9b8tomhudson@google.com        __m128i ag_mask = _mm_set1_epi32(0xFF00FF00);
433838d305d71baa7591e4b854e3c164e3f051b9b8tomhudson@google.com
443838d305d71baa7591e4b854e3c164e3f051b9b8tomhudson@google.com        // Move scale factors to upper byte of word
453838d305d71baa7591e4b854e3c164e3f051b9b8tomhudson@google.com        __m128i src_scale_wide = _mm_set1_epi16(src_scale << 8);
463838d305d71baa7591e4b854e3c164e3f051b9b8tomhudson@google.com        __m128i dst_scale_wide = _mm_set1_epi16(dst_scale << 8);
471d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org        while (count >= 4) {
481d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org            // Load 4 pixels each of src and dest.
491d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org            __m128i src_pixel = _mm_loadu_si128(s);
501d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org            __m128i dst_pixel = _mm_load_si128(d);
511d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org
523838d305d71baa7591e4b854e3c164e3f051b9b8tomhudson@google.com            // Interleave Atom port 0/1 operations based on the execution port
533838d305d71baa7591e4b854e3c164e3f051b9b8tomhudson@google.com            // constraints that multiply can only be executed on port 0 (while
543838d305d71baa7591e4b854e3c164e3f051b9b8tomhudson@google.com            // boolean operations can be executed on either port 0 or port 1)
553838d305d71baa7591e4b854e3c164e3f051b9b8tomhudson@google.com            // because GCC currently doesn't do a good job scheduling
563838d305d71baa7591e4b854e3c164e3f051b9b8tomhudson@google.com            // instructions based on these constraints.
573838d305d71baa7591e4b854e3c164e3f051b9b8tomhudson@google.com
581d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org            // Get red and blue pixels into lower byte of each word.
593838d305d71baa7591e4b854e3c164e3f051b9b8tomhudson@google.com            // (0, r, 0, b, 0, r, 0, b, 0, r, 0, b, 0, r, 0, b)
601d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org            __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
611d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org
623838d305d71baa7591e4b854e3c164e3f051b9b8tomhudson@google.com            // Multiply by scale.
633838d305d71baa7591e4b854e3c164e3f051b9b8tomhudson@google.com            // (4 x (0, rs.h, 0, bs.h))
643838d305d71baa7591e4b854e3c164e3f051b9b8tomhudson@google.com            // where rs.h stands for the higher byte of r * scale, and
653838d305d71baa7591e4b854e3c164e3f051b9b8tomhudson@google.com            // bs.h the higher byte of b * scale.
663838d305d71baa7591e4b854e3c164e3f051b9b8tomhudson@google.com            src_rb = _mm_mulhi_epu16(src_rb, src_scale_wide);
673838d305d71baa7591e4b854e3c164e3f051b9b8tomhudson@google.com
683838d305d71baa7591e4b854e3c164e3f051b9b8tomhudson@google.com            // Get alpha and green pixels into higher byte of each word.
693838d305d71baa7591e4b854e3c164e3f051b9b8tomhudson@google.com            // (a, 0, g, 0, a, 0, g, 0, a, 0, g, 0, a, 0, g, 0)
703838d305d71baa7591e4b854e3c164e3f051b9b8tomhudson@google.com            __m128i src_ag = _mm_and_si128(ag_mask, src_pixel);
711d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org
721d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org            // Multiply by scale.
733838d305d71baa7591e4b854e3c164e3f051b9b8tomhudson@google.com            // (4 x (as.h, as.l, gs.h, gs.l))
743838d305d71baa7591e4b854e3c164e3f051b9b8tomhudson@google.com            src_ag = _mm_mulhi_epu16(src_ag, src_scale_wide);
751d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org
763838d305d71baa7591e4b854e3c164e3f051b9b8tomhudson@google.com            // Clear the lower byte of the a*scale and g*scale results
773838d305d71baa7591e4b854e3c164e3f051b9b8tomhudson@google.com            // (4 x (as.h, 0, gs.h, 0))
783838d305d71baa7591e4b854e3c164e3f051b9b8tomhudson@google.com            src_ag = _mm_and_si128(src_ag, ag_mask);
793838d305d71baa7591e4b854e3c164e3f051b9b8tomhudson@google.com
803838d305d71baa7591e4b854e3c164e3f051b9b8tomhudson@google.com            // Operations the destination pixels are the same as on the
813838d305d71baa7591e4b854e3c164e3f051b9b8tomhudson@google.com            // source pixels. See the comments above.
823838d305d71baa7591e4b854e3c164e3f051b9b8tomhudson@google.com            __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
833838d305d71baa7591e4b854e3c164e3f051b9b8tomhudson@google.com            dst_rb = _mm_mulhi_epu16(dst_rb, dst_scale_wide);
843838d305d71baa7591e4b854e3c164e3f051b9b8tomhudson@google.com            __m128i dst_ag = _mm_and_si128(ag_mask, dst_pixel);
853838d305d71baa7591e4b854e3c164e3f051b9b8tomhudson@google.com            dst_ag = _mm_mulhi_epu16(dst_ag, dst_scale_wide);
863838d305d71baa7591e4b854e3c164e3f051b9b8tomhudson@google.com            dst_ag = _mm_and_si128(dst_ag, ag_mask);
871d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org
881d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org            // Combine back into RGBA.
893838d305d71baa7591e4b854e3c164e3f051b9b8tomhudson@google.com            // (4 x (as.h, rs.h, gs.h, bs.h))
901d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org            src_pixel = _mm_or_si128(src_rb, src_ag);
911d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org            dst_pixel = _mm_or_si128(dst_rb, dst_ag);
921d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org
931d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org            // Add result
941d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org            __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
951d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org            _mm_store_si128(d, result);
961d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org            s++;
971d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org            d++;
981d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org            count -= 4;
991d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org        }
1001d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org        src = reinterpret_cast<const SkPMColor*>(s);
1011d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org        dst = reinterpret_cast<SkPMColor*>(d);
102f3a2fe6b359a2168af286156d4d5c2b20dda5b76senorblanco@chromium.org    }
103f3a2fe6b359a2168af286156d4d5c2b20dda5b76senorblanco@chromium.org
10497c06343a254f88ccfcdaaee157e40f3340320a6senorblanco@chromium.org    while (count > 0) {
105f3a2fe6b359a2168af286156d4d5c2b20dda5b76senorblanco@chromium.org        *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
106f3a2fe6b359a2168af286156d4d5c2b20dda5b76senorblanco@chromium.org        src++;
107f3a2fe6b359a2168af286156d4d5c2b20dda5b76senorblanco@chromium.org        dst++;
108f3a2fe6b359a2168af286156d4d5c2b20dda5b76senorblanco@chromium.org        count--;
109f3a2fe6b359a2168af286156d4d5c2b20dda5b76senorblanco@chromium.org    }
110f3a2fe6b359a2168af286156d4d5c2b20dda5b76senorblanco@chromium.org}
111f3a2fe6b359a2168af286156d4d5c2b20dda5b76senorblanco@chromium.org
11297c06343a254f88ccfcdaaee157e40f3340320a6senorblanco@chromium.orgvoid S32A_Opaque_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
11397c06343a254f88ccfcdaaee157e40f3340320a6senorblanco@chromium.org                                const SkPMColor* SK_RESTRICT src,
11497c06343a254f88ccfcdaaee157e40f3340320a6senorblanco@chromium.org                                int count, U8CPU alpha) {
115f3a2fe6b359a2168af286156d4d5c2b20dda5b76senorblanco@chromium.org    SkASSERT(alpha == 255);
116f3a2fe6b359a2168af286156d4d5c2b20dda5b76senorblanco@chromium.org    if (count <= 0) {
117f3a2fe6b359a2168af286156d4d5c2b20dda5b76senorblanco@chromium.org        return;
118f3a2fe6b359a2168af286156d4d5c2b20dda5b76senorblanco@chromium.org    }
1191d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org
1201d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org    if (count >= 4) {
1211d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org        SkASSERT(((size_t)dst & 0x03) == 0);
1221d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org        while (((size_t)dst & 0x0F) != 0) {
1231d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org            *dst = SkPMSrcOver(*src, *dst);
1241d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org            src++;
1251d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org            dst++;
1261d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org            count--;
1271d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org        }
1281d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org
1291d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org        const __m128i *s = reinterpret_cast<const __m128i*>(src);
1301d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org        __m128i *d = reinterpret_cast<__m128i*>(dst);
131f3a2fe6b359a2168af286156d4d5c2b20dda5b76senorblanco@chromium.org#ifdef SK_USE_ACCURATE_BLENDING
1321d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org        __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
1331d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org        __m128i c_128 = _mm_set1_epi16(128);  // 8 copies of 128 (16-bit)
1341d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org        __m128i c_255 = _mm_set1_epi16(255);  // 8 copies of 255 (16-bit)
1351d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org        while (count >= 4) {
1361d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org            // Load 4 pixels
1371d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org            __m128i src_pixel = _mm_loadu_si128(s);
1381d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org            __m128i dst_pixel = _mm_load_si128(d);
1391d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org
1401d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org            __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
141444278bc14b33f80ab7e00431820a34f972776c1senorblanco@chromium.org            __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
1421d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org            // Shift alphas down to lower 8 bits of each quad.
1431d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org            __m128i alpha = _mm_srli_epi32(src_pixel, 24);
1441d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org
1451d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org            // Copy alpha to upper 3rd byte of each quad
1461d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org            alpha = _mm_or_si128(alpha, _mm_slli_epi32(alpha, 16));
1471d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org
1481d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org            // Subtract alphas from 255, to get 0..255
1491d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org            alpha = _mm_sub_epi16(c_255, alpha);
1501d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org
1511d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org            // Multiply by red and blue by src alpha.
1521d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org            dst_rb = _mm_mullo_epi16(dst_rb, alpha);
1531d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org            // Multiply by alpha and green by src alpha.
1541d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org            dst_ag = _mm_mullo_epi16(dst_ag, alpha);
1551d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org
1561d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org            // dst_rb_low = (dst_rb >> 8)
1571d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org            __m128i dst_rb_low = _mm_srli_epi16(dst_rb, 8);
1581d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org            __m128i dst_ag_low = _mm_srli_epi16(dst_ag, 8);
1591d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org
1601d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org            // dst_rb = (dst_rb + dst_rb_low + 128) >> 8
1611d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org            dst_rb = _mm_add_epi16(dst_rb, dst_rb_low);
1621d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org            dst_rb = _mm_add_epi16(dst_rb, c_128);
1631d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org            dst_rb = _mm_srli_epi16(dst_rb, 8);
1641d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org
1651d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org            // dst_ag = (dst_ag + dst_ag_low + 128) & ag_mask
1661d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org            dst_ag = _mm_add_epi16(dst_ag, dst_ag_low);
1671d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org            dst_ag = _mm_add_epi16(dst_ag, c_128);
1681d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org            dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
1691d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org
1701d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org            // Combine back into RGBA.
1711d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org            dst_pixel = _mm_or_si128(dst_rb, dst_ag);
1721d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org
1731d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org            // Add result
1741d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org            __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
1751d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org            _mm_store_si128(d, result);
1761d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org            s++;
1771d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org            d++;
1781d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org            count -= 4;
1791d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org        }
1801d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org    #else
1811d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org        __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
1821d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org        __m128i c_256 = _mm_set1_epi16(0x0100);  // 8 copies of 256 (16-bit)
1831d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org        while (count >= 4) {
1841d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org            // Load 4 pixels
1851d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org            __m128i src_pixel = _mm_loadu_si128(s);
1861d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org            __m128i dst_pixel = _mm_load_si128(d);
1871d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org
1881d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org            __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
189444278bc14b33f80ab7e00431820a34f972776c1senorblanco@chromium.org            __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
1901d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org
191444278bc14b33f80ab7e00431820a34f972776c1senorblanco@chromium.org            // (a0, g0, a1, g1, a2, g2, a3, g3)  (low byte of each word)
192444278bc14b33f80ab7e00431820a34f972776c1senorblanco@chromium.org            __m128i alpha = _mm_srli_epi16(src_pixel, 8);
193444278bc14b33f80ab7e00431820a34f972776c1senorblanco@chromium.org
194444278bc14b33f80ab7e00431820a34f972776c1senorblanco@chromium.org            // (a0, a0, a1, a1, a2, g2, a3, g3)
195444278bc14b33f80ab7e00431820a34f972776c1senorblanco@chromium.org            alpha = _mm_shufflehi_epi16(alpha, 0xF5);
196444278bc14b33f80ab7e00431820a34f972776c1senorblanco@chromium.org
197444278bc14b33f80ab7e00431820a34f972776c1senorblanco@chromium.org            // (a0, a0, a1, a1, a2, a2, a3, a3)
198444278bc14b33f80ab7e00431820a34f972776c1senorblanco@chromium.org            alpha = _mm_shufflelo_epi16(alpha, 0xF5);
1991d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org
2001d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org            // Subtract alphas from 256, to get 1..256
2011d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org            alpha = _mm_sub_epi16(c_256, alpha);
2021d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org
2031d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org            // Multiply by red and blue by src alpha.
2041d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org            dst_rb = _mm_mullo_epi16(dst_rb, alpha);
2051d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org            // Multiply by alpha and green by src alpha.
2061d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org            dst_ag = _mm_mullo_epi16(dst_ag, alpha);
2071d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org
2081d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org            // Divide by 256.
2091d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org            dst_rb = _mm_srli_epi16(dst_rb, 8);
2101d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org
2111d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org            // Mask out high bits (already in the right place)
2121d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org            dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
2131d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org
2141d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org            // Combine back into RGBA.
2151d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org            dst_pixel = _mm_or_si128(dst_rb, dst_ag);
2161d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org
2171d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org            // Add result
2181d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org            __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
2191d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org            _mm_store_si128(d, result);
2201d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org            s++;
2211d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org            d++;
2221d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org            count -= 4;
2231d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org        }
224f3a2fe6b359a2168af286156d4d5c2b20dda5b76senorblanco@chromium.org#endif
2251d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org        src = reinterpret_cast<const SkPMColor*>(s);
2261d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org        dst = reinterpret_cast<SkPMColor*>(d);
2271d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org    }
228f3a2fe6b359a2168af286156d4d5c2b20dda5b76senorblanco@chromium.org
229f3a2fe6b359a2168af286156d4d5c2b20dda5b76senorblanco@chromium.org    while (count > 0) {
230f3a2fe6b359a2168af286156d4d5c2b20dda5b76senorblanco@chromium.org        *dst = SkPMSrcOver(*src, *dst);
231f3a2fe6b359a2168af286156d4d5c2b20dda5b76senorblanco@chromium.org        src++;
232f3a2fe6b359a2168af286156d4d5c2b20dda5b76senorblanco@chromium.org        dst++;
233f3a2fe6b359a2168af286156d4d5c2b20dda5b76senorblanco@chromium.org        count--;
234f3a2fe6b359a2168af286156d4d5c2b20dda5b76senorblanco@chromium.org    }
235f3a2fe6b359a2168af286156d4d5c2b20dda5b76senorblanco@chromium.org}
236f3a2fe6b359a2168af286156d4d5c2b20dda5b76senorblanco@chromium.org
23797c06343a254f88ccfcdaaee157e40f3340320a6senorblanco@chromium.orgvoid S32A_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
23897c06343a254f88ccfcdaaee157e40f3340320a6senorblanco@chromium.org                               const SkPMColor* SK_RESTRICT src,
23997c06343a254f88ccfcdaaee157e40f3340320a6senorblanco@chromium.org                               int count, U8CPU alpha) {
240f3a2fe6b359a2168af286156d4d5c2b20dda5b76senorblanco@chromium.org    SkASSERT(alpha <= 255);
241f3a2fe6b359a2168af286156d4d5c2b20dda5b76senorblanco@chromium.org    if (count <= 0) {
242f3a2fe6b359a2168af286156d4d5c2b20dda5b76senorblanco@chromium.org        return;
243f3a2fe6b359a2168af286156d4d5c2b20dda5b76senorblanco@chromium.org    }
244f3a2fe6b359a2168af286156d4d5c2b20dda5b76senorblanco@chromium.org
2451d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org    if (count >= 4) {
2461d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org        while (((size_t)dst & 0x0F) != 0) {
2471d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org            *dst = SkBlendARGB32(*src, *dst, alpha);
2481d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org            src++;
2491d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org            dst++;
2501d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org            count--;
2511d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org        }
2521d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org
2531d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org        uint32_t src_scale = SkAlpha255To256(alpha);
2541d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org
2551d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org        const __m128i *s = reinterpret_cast<const __m128i*>(src);
2561d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org        __m128i *d = reinterpret_cast<__m128i*>(dst);
2573838d305d71baa7591e4b854e3c164e3f051b9b8tomhudson@google.com        __m128i src_scale_wide = _mm_set1_epi16(src_scale << 8);
2581d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org        __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
2591d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org        __m128i c_256 = _mm_set1_epi16(256);  // 8 copies of 256 (16-bit)
2601d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org        while (count >= 4) {
2611d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org            // Load 4 pixels each of src and dest.
2621d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org            __m128i src_pixel = _mm_loadu_si128(s);
2631d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org            __m128i dst_pixel = _mm_load_si128(d);
2641d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org
2651d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org            // Get red and blue pixels into lower byte of each word.
2661d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org            __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
2671d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org            __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
2681d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org
2691d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org            // Get alpha and green into lower byte of each word.
2701d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org            __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
2711d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org            __m128i src_ag = _mm_srli_epi16(src_pixel, 8);
2721d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org
2731d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org            // Put per-pixel alpha in low byte of each word.
2743838d305d71baa7591e4b854e3c164e3f051b9b8tomhudson@google.com            // After the following two statements, the dst_alpha looks like
2753838d305d71baa7591e4b854e3c164e3f051b9b8tomhudson@google.com            // (0, a0, 0, a0, 0, a1, 0, a1, 0, a2, 0, a2, 0, a3, 0, a3)
2761d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org            __m128i dst_alpha = _mm_shufflehi_epi16(src_ag, 0xF5);
2771d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org            dst_alpha = _mm_shufflelo_epi16(dst_alpha, 0xF5);
2781d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org
2791d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org            // dst_alpha = dst_alpha * src_scale
2803838d305d71baa7591e4b854e3c164e3f051b9b8tomhudson@google.com            // Because src_scales are in the higher byte of each word and
2813838d305d71baa7591e4b854e3c164e3f051b9b8tomhudson@google.com            // we use mulhi here, the resulting alpha values are already
2823838d305d71baa7591e4b854e3c164e3f051b9b8tomhudson@google.com            // in the right place and don't need to be divided by 256.
2833838d305d71baa7591e4b854e3c164e3f051b9b8tomhudson@google.com            // (0, sa0, 0, sa0, 0, sa1, 0, sa1, 0, sa2, 0, sa2, 0, sa3, 0, sa3)
2843838d305d71baa7591e4b854e3c164e3f051b9b8tomhudson@google.com            dst_alpha = _mm_mulhi_epu16(dst_alpha, src_scale_wide);
2851d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org
2861d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org            // Subtract alphas from 256, to get 1..256
2871d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org            dst_alpha = _mm_sub_epi16(c_256, dst_alpha);
2881d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org
2891d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org            // Multiply red and blue by dst pixel alpha.
2901d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org            dst_rb = _mm_mullo_epi16(dst_rb, dst_alpha);
2911d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org            // Multiply alpha and green by dst pixel alpha.
2921d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org            dst_ag = _mm_mullo_epi16(dst_ag, dst_alpha);
2931d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org
2941d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org            // Multiply red and blue by global alpha.
2953838d305d71baa7591e4b854e3c164e3f051b9b8tomhudson@google.com            // (4 x (0, rs.h, 0, bs.h))
2963838d305d71baa7591e4b854e3c164e3f051b9b8tomhudson@google.com            // where rs.h stands for the higher byte of r * src_scale,
2973838d305d71baa7591e4b854e3c164e3f051b9b8tomhudson@google.com            // and bs.h the higher byte of b * src_scale.
2983838d305d71baa7591e4b854e3c164e3f051b9b8tomhudson@google.com            // Again, because we use mulhi, the resuling red and blue
2993838d305d71baa7591e4b854e3c164e3f051b9b8tomhudson@google.com            // values are already in the right place and don't need to
3003838d305d71baa7591e4b854e3c164e3f051b9b8tomhudson@google.com            // be divided by 256.
3013838d305d71baa7591e4b854e3c164e3f051b9b8tomhudson@google.com            src_rb = _mm_mulhi_epu16(src_rb, src_scale_wide);
3021d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org            // Multiply alpha and green by global alpha.
3033838d305d71baa7591e4b854e3c164e3f051b9b8tomhudson@google.com            // (4 x (0, as.h, 0, gs.h))
3043838d305d71baa7591e4b854e3c164e3f051b9b8tomhudson@google.com            src_ag = _mm_mulhi_epu16(src_ag, src_scale_wide);
3051d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org
3061d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org            // Divide by 256.
3071d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org            dst_rb = _mm_srli_epi16(dst_rb, 8);
3081d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org
3091d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org            // Mask out low bits (goodies already in the right place; no need to divide)
3101d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org            dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
3113838d305d71baa7591e4b854e3c164e3f051b9b8tomhudson@google.com            // Shift alpha and green to higher byte of each word.
3123838d305d71baa7591e4b854e3c164e3f051b9b8tomhudson@google.com            // (4 x (as.h, 0, gs.h, 0))
3133838d305d71baa7591e4b854e3c164e3f051b9b8tomhudson@google.com            src_ag = _mm_slli_epi16(src_ag, 8);
3141d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org
3151d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org            // Combine back into RGBA.
3161d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org            dst_pixel = _mm_or_si128(dst_rb, dst_ag);
3171d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org            src_pixel = _mm_or_si128(src_rb, src_ag);
3181d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org
3191d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org            // Add two pixels into result.
3201d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org            __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
3211d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org            _mm_store_si128(d, result);
3221d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org            s++;
3231d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org            d++;
3241d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org            count -= 4;
3251d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org        }
3261d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org        src = reinterpret_cast<const SkPMColor*>(s);
3271d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org        dst = reinterpret_cast<SkPMColor*>(d);
328f3a2fe6b359a2168af286156d4d5c2b20dda5b76senorblanco@chromium.org    }
3291d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org
330f3a2fe6b359a2168af286156d4d5c2b20dda5b76senorblanco@chromium.org    while (count > 0) {
331f3a2fe6b359a2168af286156d4d5c2b20dda5b76senorblanco@chromium.org        *dst = SkBlendARGB32(*src, *dst, alpha);
332f3a2fe6b359a2168af286156d4d5c2b20dda5b76senorblanco@chromium.org        src++;
333f3a2fe6b359a2168af286156d4d5c2b20dda5b76senorblanco@chromium.org        dst++;
334f3a2fe6b359a2168af286156d4d5c2b20dda5b76senorblanco@chromium.org        count--;
335f3a2fe6b359a2168af286156d4d5c2b20dda5b76senorblanco@chromium.org    }
336f3a2fe6b359a2168af286156d4d5c2b20dda5b76senorblanco@chromium.org}
3371b5fde98ede97ec9dd5de4213bc9a79e91385046senorblanco@chromium.org
3381b5fde98ede97ec9dd5de4213bc9a79e91385046senorblanco@chromium.org/* SSE2 version of Color32()
3391b5fde98ede97ec9dd5de4213bc9a79e91385046senorblanco@chromium.org * portable version is in core/SkBlitRow_D32.cpp
3401b5fde98ede97ec9dd5de4213bc9a79e91385046senorblanco@chromium.org */
3411b5fde98ede97ec9dd5de4213bc9a79e91385046senorblanco@chromium.orgvoid Color32_SSE2(SkPMColor dst[], const SkPMColor src[], int count,
3421b5fde98ede97ec9dd5de4213bc9a79e91385046senorblanco@chromium.org                  SkPMColor color) {
3431b5fde98ede97ec9dd5de4213bc9a79e91385046senorblanco@chromium.org
3441b5fde98ede97ec9dd5de4213bc9a79e91385046senorblanco@chromium.org    if (count <= 0) {
3451b5fde98ede97ec9dd5de4213bc9a79e91385046senorblanco@chromium.org        return;
3461b5fde98ede97ec9dd5de4213bc9a79e91385046senorblanco@chromium.org    }
3471b5fde98ede97ec9dd5de4213bc9a79e91385046senorblanco@chromium.org
3481b5fde98ede97ec9dd5de4213bc9a79e91385046senorblanco@chromium.org    if (0 == color) {
3491b5fde98ede97ec9dd5de4213bc9a79e91385046senorblanco@chromium.org        if (src != dst) {
3501b5fde98ede97ec9dd5de4213bc9a79e91385046senorblanco@chromium.org            memcpy(dst, src, count * sizeof(SkPMColor));
3511b5fde98ede97ec9dd5de4213bc9a79e91385046senorblanco@chromium.org        }
352739ecb5e1a57a9e1ce065f892fe2c73209617e2breed@google.com        return;
3531b5fde98ede97ec9dd5de4213bc9a79e91385046senorblanco@chromium.org    }
3541b5fde98ede97ec9dd5de4213bc9a79e91385046senorblanco@chromium.org
3551b5fde98ede97ec9dd5de4213bc9a79e91385046senorblanco@chromium.org    unsigned colorA = SkGetPackedA32(color);
3561b5fde98ede97ec9dd5de4213bc9a79e91385046senorblanco@chromium.org    if (255 == colorA) {
3571b5fde98ede97ec9dd5de4213bc9a79e91385046senorblanco@chromium.org        sk_memset32(dst, color, count);
3581b5fde98ede97ec9dd5de4213bc9a79e91385046senorblanco@chromium.org    } else {
3591b5fde98ede97ec9dd5de4213bc9a79e91385046senorblanco@chromium.org        unsigned scale = 256 - SkAlpha255To256(colorA);
3601b5fde98ede97ec9dd5de4213bc9a79e91385046senorblanco@chromium.org
3611b5fde98ede97ec9dd5de4213bc9a79e91385046senorblanco@chromium.org        if (count >= 4) {
3621b5fde98ede97ec9dd5de4213bc9a79e91385046senorblanco@chromium.org            SkASSERT(((size_t)dst & 0x03) == 0);
3631b5fde98ede97ec9dd5de4213bc9a79e91385046senorblanco@chromium.org            while (((size_t)dst & 0x0F) != 0) {
3641b5fde98ede97ec9dd5de4213bc9a79e91385046senorblanco@chromium.org                *dst = color + SkAlphaMulQ(*src, scale);
3651b5fde98ede97ec9dd5de4213bc9a79e91385046senorblanco@chromium.org                src++;
3661b5fde98ede97ec9dd5de4213bc9a79e91385046senorblanco@chromium.org                dst++;
3671b5fde98ede97ec9dd5de4213bc9a79e91385046senorblanco@chromium.org                count--;
3681b5fde98ede97ec9dd5de4213bc9a79e91385046senorblanco@chromium.org            }
3691b5fde98ede97ec9dd5de4213bc9a79e91385046senorblanco@chromium.org
3701b5fde98ede97ec9dd5de4213bc9a79e91385046senorblanco@chromium.org            const __m128i *s = reinterpret_cast<const __m128i*>(src);
3711b5fde98ede97ec9dd5de4213bc9a79e91385046senorblanco@chromium.org            __m128i *d = reinterpret_cast<__m128i*>(dst);
3721b5fde98ede97ec9dd5de4213bc9a79e91385046senorblanco@chromium.org            __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
3731b5fde98ede97ec9dd5de4213bc9a79e91385046senorblanco@chromium.org            __m128i src_scale_wide = _mm_set1_epi16(scale);
3741b5fde98ede97ec9dd5de4213bc9a79e91385046senorblanco@chromium.org            __m128i color_wide = _mm_set1_epi32(color);
3751b5fde98ede97ec9dd5de4213bc9a79e91385046senorblanco@chromium.org            while (count >= 4) {
3761b5fde98ede97ec9dd5de4213bc9a79e91385046senorblanco@chromium.org                // Load 4 pixels each of src and dest.
3771b5fde98ede97ec9dd5de4213bc9a79e91385046senorblanco@chromium.org                __m128i src_pixel = _mm_loadu_si128(s);
3781b5fde98ede97ec9dd5de4213bc9a79e91385046senorblanco@chromium.org
3791b5fde98ede97ec9dd5de4213bc9a79e91385046senorblanco@chromium.org                // Get red and blue pixels into lower byte of each word.
3801b5fde98ede97ec9dd5de4213bc9a79e91385046senorblanco@chromium.org                __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
3815fe9bc095cb1fa93cffb7d081fe4bc972bc80eafreed@google.com
3821b5fde98ede97ec9dd5de4213bc9a79e91385046senorblanco@chromium.org                // Get alpha and green into lower byte of each word.
3831b5fde98ede97ec9dd5de4213bc9a79e91385046senorblanco@chromium.org                __m128i src_ag = _mm_srli_epi16(src_pixel, 8);
3841b5fde98ede97ec9dd5de4213bc9a79e91385046senorblanco@chromium.org
3851b5fde98ede97ec9dd5de4213bc9a79e91385046senorblanco@chromium.org                // Multiply by scale.
3861b5fde98ede97ec9dd5de4213bc9a79e91385046senorblanco@chromium.org                src_rb = _mm_mullo_epi16(src_rb, src_scale_wide);
3871b5fde98ede97ec9dd5de4213bc9a79e91385046senorblanco@chromium.org                src_ag = _mm_mullo_epi16(src_ag, src_scale_wide);
3881b5fde98ede97ec9dd5de4213bc9a79e91385046senorblanco@chromium.org
3891b5fde98ede97ec9dd5de4213bc9a79e91385046senorblanco@chromium.org                // Divide by 256.
3901b5fde98ede97ec9dd5de4213bc9a79e91385046senorblanco@chromium.org                src_rb = _mm_srli_epi16(src_rb, 8);
3911b5fde98ede97ec9dd5de4213bc9a79e91385046senorblanco@chromium.org                src_ag = _mm_andnot_si128(rb_mask, src_ag);
3921b5fde98ede97ec9dd5de4213bc9a79e91385046senorblanco@chromium.org
3931b5fde98ede97ec9dd5de4213bc9a79e91385046senorblanco@chromium.org                // Combine back into RGBA.
3941b5fde98ede97ec9dd5de4213bc9a79e91385046senorblanco@chromium.org                src_pixel = _mm_or_si128(src_rb, src_ag);
3951b5fde98ede97ec9dd5de4213bc9a79e91385046senorblanco@chromium.org
3961b5fde98ede97ec9dd5de4213bc9a79e91385046senorblanco@chromium.org                // Add color to result.
3971b5fde98ede97ec9dd5de4213bc9a79e91385046senorblanco@chromium.org                __m128i result = _mm_add_epi8(color_wide, src_pixel);
3981b5fde98ede97ec9dd5de4213bc9a79e91385046senorblanco@chromium.org
3991b5fde98ede97ec9dd5de4213bc9a79e91385046senorblanco@chromium.org                // Store result.
4001b5fde98ede97ec9dd5de4213bc9a79e91385046senorblanco@chromium.org                _mm_store_si128(d, result);
4011b5fde98ede97ec9dd5de4213bc9a79e91385046senorblanco@chromium.org                s++;
4021b5fde98ede97ec9dd5de4213bc9a79e91385046senorblanco@chromium.org                d++;
4031b5fde98ede97ec9dd5de4213bc9a79e91385046senorblanco@chromium.org                count -= 4;
4041b5fde98ede97ec9dd5de4213bc9a79e91385046senorblanco@chromium.org            }
4051b5fde98ede97ec9dd5de4213bc9a79e91385046senorblanco@chromium.org            src = reinterpret_cast<const SkPMColor*>(s);
4061b5fde98ede97ec9dd5de4213bc9a79e91385046senorblanco@chromium.org            dst = reinterpret_cast<SkPMColor*>(d);
4071b5fde98ede97ec9dd5de4213bc9a79e91385046senorblanco@chromium.org         }
4081b5fde98ede97ec9dd5de4213bc9a79e91385046senorblanco@chromium.org
4091b5fde98ede97ec9dd5de4213bc9a79e91385046senorblanco@chromium.org        while (count > 0) {
4101b5fde98ede97ec9dd5de4213bc9a79e91385046senorblanco@chromium.org            *dst = color + SkAlphaMulQ(*src, scale);
4111b5fde98ede97ec9dd5de4213bc9a79e91385046senorblanco@chromium.org            src += 1;
4121b5fde98ede97ec9dd5de4213bc9a79e91385046senorblanco@chromium.org            dst += 1;
4131b5fde98ede97ec9dd5de4213bc9a79e91385046senorblanco@chromium.org            count--;
4145fe9bc095cb1fa93cffb7d081fe4bc972bc80eafreed@google.com        }
4151b5fde98ede97ec9dd5de4213bc9a79e91385046senorblanco@chromium.org    }
4161b5fde98ede97ec9dd5de4213bc9a79e91385046senorblanco@chromium.org}
4175fe9bc095cb1fa93cffb7d081fe4bc972bc80eafreed@google.com
4189bed9d7780715fe1c6aae1df1ceee692a8c365a2reed@google.comvoid SkARGB32_A8_BlitMask_SSE2(void* device, size_t dstRB, const void* maskPtr,
4199bed9d7780715fe1c6aae1df1ceee692a8c365a2reed@google.com                               size_t maskRB, SkColor origColor,
42040d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com                               int width, int height) {
421def3e927ac43495ad443f186a1717cdfd15d1630reed@google.com    SkPMColor color = SkPreMultiplyColor(origColor);
4225fe9bc095cb1fa93cffb7d081fe4bc972bc80eafreed@google.com    size_t dstOffset = dstRB - (width << 2);
4235fe9bc095cb1fa93cffb7d081fe4bc972bc80eafreed@google.com    size_t maskOffset = maskRB - width;
4245fe9bc095cb1fa93cffb7d081fe4bc972bc80eafreed@google.com    SkPMColor* dst = (SkPMColor *)device;
4259bed9d7780715fe1c6aae1df1ceee692a8c365a2reed@google.com    const uint8_t* mask = (const uint8_t*)maskPtr;
4265fe9bc095cb1fa93cffb7d081fe4bc972bc80eafreed@google.com    do {
4275fe9bc095cb1fa93cffb7d081fe4bc972bc80eafreed@google.com        int count = width;
4285fe9bc095cb1fa93cffb7d081fe4bc972bc80eafreed@google.com        if (count >= 4) {
4295fe9bc095cb1fa93cffb7d081fe4bc972bc80eafreed@google.com            while (((size_t)dst & 0x0F) != 0 && (count > 0)) {
4305fe9bc095cb1fa93cffb7d081fe4bc972bc80eafreed@google.com                *dst = SkBlendARGB32(color, *dst, *mask);
4315fe9bc095cb1fa93cffb7d081fe4bc972bc80eafreed@google.com                mask++;
4325fe9bc095cb1fa93cffb7d081fe4bc972bc80eafreed@google.com                dst++;
4335fe9bc095cb1fa93cffb7d081fe4bc972bc80eafreed@google.com                count--;
4345fe9bc095cb1fa93cffb7d081fe4bc972bc80eafreed@google.com            }
4355fe9bc095cb1fa93cffb7d081fe4bc972bc80eafreed@google.com            __m128i *d = reinterpret_cast<__m128i*>(dst);
4365fe9bc095cb1fa93cffb7d081fe4bc972bc80eafreed@google.com            __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
4375fe9bc095cb1fa93cffb7d081fe4bc972bc80eafreed@google.com            __m128i c_256 = _mm_set1_epi16(256);
4385fe9bc095cb1fa93cffb7d081fe4bc972bc80eafreed@google.com            __m128i c_1 = _mm_set1_epi16(1);
4395fe9bc095cb1fa93cffb7d081fe4bc972bc80eafreed@google.com            __m128i src_pixel = _mm_set1_epi32(color);
4405fe9bc095cb1fa93cffb7d081fe4bc972bc80eafreed@google.com            while (count >= 4) {
4415fe9bc095cb1fa93cffb7d081fe4bc972bc80eafreed@google.com                // Load 4 pixels each of src and dest.
4425fe9bc095cb1fa93cffb7d081fe4bc972bc80eafreed@google.com                __m128i dst_pixel = _mm_load_si128(d);
4435fe9bc095cb1fa93cffb7d081fe4bc972bc80eafreed@google.com
4445fe9bc095cb1fa93cffb7d081fe4bc972bc80eafreed@google.com                //set the aphla value
4455fe9bc095cb1fa93cffb7d081fe4bc972bc80eafreed@google.com                __m128i src_scale_wide =  _mm_set_epi8(0, *(mask+3),\
4465fe9bc095cb1fa93cffb7d081fe4bc972bc80eafreed@google.com                                0, *(mask+3),0, \
4475fe9bc095cb1fa93cffb7d081fe4bc972bc80eafreed@google.com                                *(mask+2),0, *(mask+2),\
4485fe9bc095cb1fa93cffb7d081fe4bc972bc80eafreed@google.com                                0,*(mask+1), 0,*(mask+1),\
4495fe9bc095cb1fa93cffb7d081fe4bc972bc80eafreed@google.com                                0, *mask,0,*mask);
4505fe9bc095cb1fa93cffb7d081fe4bc972bc80eafreed@google.com
4515fe9bc095cb1fa93cffb7d081fe4bc972bc80eafreed@google.com                //call SkAlpha255To256()
4525fe9bc095cb1fa93cffb7d081fe4bc972bc80eafreed@google.com                src_scale_wide = _mm_add_epi16(src_scale_wide, c_1);
4535fe9bc095cb1fa93cffb7d081fe4bc972bc80eafreed@google.com
4545fe9bc095cb1fa93cffb7d081fe4bc972bc80eafreed@google.com                // Get red and blue pixels into lower byte of each word.
4555fe9bc095cb1fa93cffb7d081fe4bc972bc80eafreed@google.com                __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
4565fe9bc095cb1fa93cffb7d081fe4bc972bc80eafreed@google.com                __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
4575fe9bc095cb1fa93cffb7d081fe4bc972bc80eafreed@google.com
4585fe9bc095cb1fa93cffb7d081fe4bc972bc80eafreed@google.com                // Get alpha and green into lower byte of each word.
4595fe9bc095cb1fa93cffb7d081fe4bc972bc80eafreed@google.com                __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
4605fe9bc095cb1fa93cffb7d081fe4bc972bc80eafreed@google.com                __m128i src_ag = _mm_srli_epi16(src_pixel, 8);
4615fe9bc095cb1fa93cffb7d081fe4bc972bc80eafreed@google.com
4625fe9bc095cb1fa93cffb7d081fe4bc972bc80eafreed@google.com                // Put per-pixel alpha in low byte of each word.
4635fe9bc095cb1fa93cffb7d081fe4bc972bc80eafreed@google.com                __m128i dst_alpha = _mm_shufflehi_epi16(src_ag, 0xF5);
4645fe9bc095cb1fa93cffb7d081fe4bc972bc80eafreed@google.com                dst_alpha = _mm_shufflelo_epi16(dst_alpha, 0xF5);
4655fe9bc095cb1fa93cffb7d081fe4bc972bc80eafreed@google.com
4665fe9bc095cb1fa93cffb7d081fe4bc972bc80eafreed@google.com                // dst_alpha = dst_alpha * src_scale
4675fe9bc095cb1fa93cffb7d081fe4bc972bc80eafreed@google.com                dst_alpha = _mm_mullo_epi16(dst_alpha, src_scale_wide);
4685fe9bc095cb1fa93cffb7d081fe4bc972bc80eafreed@google.com
4695fe9bc095cb1fa93cffb7d081fe4bc972bc80eafreed@google.com                // Divide by 256.
4705fe9bc095cb1fa93cffb7d081fe4bc972bc80eafreed@google.com                dst_alpha = _mm_srli_epi16(dst_alpha, 8);
4715fe9bc095cb1fa93cffb7d081fe4bc972bc80eafreed@google.com
4725fe9bc095cb1fa93cffb7d081fe4bc972bc80eafreed@google.com                // Subtract alphas from 256, to get 1..256
4735fe9bc095cb1fa93cffb7d081fe4bc972bc80eafreed@google.com                dst_alpha = _mm_sub_epi16(c_256, dst_alpha);
4745fe9bc095cb1fa93cffb7d081fe4bc972bc80eafreed@google.com                // Multiply red and blue by dst pixel alpha.
4755fe9bc095cb1fa93cffb7d081fe4bc972bc80eafreed@google.com                dst_rb = _mm_mullo_epi16(dst_rb, dst_alpha);
4765fe9bc095cb1fa93cffb7d081fe4bc972bc80eafreed@google.com                // Multiply alpha and green by dst pixel alpha.
4775fe9bc095cb1fa93cffb7d081fe4bc972bc80eafreed@google.com                dst_ag = _mm_mullo_epi16(dst_ag, dst_alpha);
4785fe9bc095cb1fa93cffb7d081fe4bc972bc80eafreed@google.com
4795fe9bc095cb1fa93cffb7d081fe4bc972bc80eafreed@google.com                // Multiply red and blue by global alpha.
4805fe9bc095cb1fa93cffb7d081fe4bc972bc80eafreed@google.com                src_rb = _mm_mullo_epi16(src_rb, src_scale_wide);
4815fe9bc095cb1fa93cffb7d081fe4bc972bc80eafreed@google.com                // Multiply alpha and green by global alpha.
4825fe9bc095cb1fa93cffb7d081fe4bc972bc80eafreed@google.com                src_ag = _mm_mullo_epi16(src_ag, src_scale_wide);
4835fe9bc095cb1fa93cffb7d081fe4bc972bc80eafreed@google.com                // Divide by 256.
4845fe9bc095cb1fa93cffb7d081fe4bc972bc80eafreed@google.com                dst_rb = _mm_srli_epi16(dst_rb, 8);
4855fe9bc095cb1fa93cffb7d081fe4bc972bc80eafreed@google.com                src_rb = _mm_srli_epi16(src_rb, 8);
4865fe9bc095cb1fa93cffb7d081fe4bc972bc80eafreed@google.com
4875fe9bc095cb1fa93cffb7d081fe4bc972bc80eafreed@google.com                // Mask out low bits (goodies already in the right place; no need to divide)
4885fe9bc095cb1fa93cffb7d081fe4bc972bc80eafreed@google.com                dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
4895fe9bc095cb1fa93cffb7d081fe4bc972bc80eafreed@google.com                src_ag = _mm_andnot_si128(rb_mask, src_ag);
4905fe9bc095cb1fa93cffb7d081fe4bc972bc80eafreed@google.com
4915fe9bc095cb1fa93cffb7d081fe4bc972bc80eafreed@google.com                // Combine back into RGBA.
4925fe9bc095cb1fa93cffb7d081fe4bc972bc80eafreed@google.com                dst_pixel = _mm_or_si128(dst_rb, dst_ag);
4935fe9bc095cb1fa93cffb7d081fe4bc972bc80eafreed@google.com                __m128i tmp_src_pixel = _mm_or_si128(src_rb, src_ag);
4945fe9bc095cb1fa93cffb7d081fe4bc972bc80eafreed@google.com
4955fe9bc095cb1fa93cffb7d081fe4bc972bc80eafreed@google.com                // Add two pixels into result.
4965fe9bc095cb1fa93cffb7d081fe4bc972bc80eafreed@google.com                __m128i result = _mm_add_epi8(tmp_src_pixel, dst_pixel);
4975fe9bc095cb1fa93cffb7d081fe4bc972bc80eafreed@google.com                _mm_store_si128(d, result);
4985fe9bc095cb1fa93cffb7d081fe4bc972bc80eafreed@google.com                // load the next 4 pixel
4995fe9bc095cb1fa93cffb7d081fe4bc972bc80eafreed@google.com                mask = mask + 4;
5005fe9bc095cb1fa93cffb7d081fe4bc972bc80eafreed@google.com                d++;
5015fe9bc095cb1fa93cffb7d081fe4bc972bc80eafreed@google.com                count -= 4;
5025fe9bc095cb1fa93cffb7d081fe4bc972bc80eafreed@google.com            }
5035fe9bc095cb1fa93cffb7d081fe4bc972bc80eafreed@google.com            dst = reinterpret_cast<SkPMColor *>(d);
5045fe9bc095cb1fa93cffb7d081fe4bc972bc80eafreed@google.com        }
5055fe9bc095cb1fa93cffb7d081fe4bc972bc80eafreed@google.com        while(count > 0) {
5065fe9bc095cb1fa93cffb7d081fe4bc972bc80eafreed@google.com            *dst= SkBlendARGB32(color, *dst, *mask);
5075fe9bc095cb1fa93cffb7d081fe4bc972bc80eafreed@google.com            dst += 1;
5085fe9bc095cb1fa93cffb7d081fe4bc972bc80eafreed@google.com            mask++;
5095fe9bc095cb1fa93cffb7d081fe4bc972bc80eafreed@google.com            count --;
5105fe9bc095cb1fa93cffb7d081fe4bc972bc80eafreed@google.com        }
5115fe9bc095cb1fa93cffb7d081fe4bc972bc80eafreed@google.com        dst = (SkPMColor *)((char*)dst + dstOffset);
5125fe9bc095cb1fa93cffb7d081fe4bc972bc80eafreed@google.com        mask += maskOffset;
5135fe9bc095cb1fa93cffb7d081fe4bc972bc80eafreed@google.com    } while (--height != 0);
5145fe9bc095cb1fa93cffb7d081fe4bc972bc80eafreed@google.com}
51540d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com
// Shift amounts (positive = left, negative = right) that move the top 5 bits
// of each RGB16 mask component so that they begin at the bit position of the
// same component in an SkPMColor (SK_x32_SHIFT).
// The component order of the RGB16 mask may differ from the SkPMColor order,
// which is why each component gets its own shift.
#define SK_R16x5_R32x5_SHIFT (SK_R32_SHIFT - SK_R16_SHIFT - SK_R16_BITS + 5)
#define SK_G16x5_G32x5_SHIFT (SK_G32_SHIFT - SK_G16_SHIFT - SK_G16_BITS + 5)
#define SK_B16x5_B32x5_SHIFT (SK_B32_SHIFT - SK_B16_SHIFT - SK_B16_BITS + 5)

// Each macro below applies the corresponding shift to every 32-bit lane of x.
// The result is "unmasked": neighbouring components are shifted along with
// the wanted one and must be ANDed away by the caller.

// Red.
#if SK_R16x5_R32x5_SHIFT > 0
    #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (_mm_slli_epi32(x, SK_R16x5_R32x5_SHIFT))
#elif SK_R16x5_R32x5_SHIFT < 0
    #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (_mm_srli_epi32(x, -SK_R16x5_R32x5_SHIFT))
#else
    #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (x)
#endif

// Green.
#if SK_G16x5_G32x5_SHIFT > 0
    #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (_mm_slli_epi32(x, SK_G16x5_G32x5_SHIFT))
#elif SK_G16x5_G32x5_SHIFT < 0
    #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (_mm_srli_epi32(x, -SK_G16x5_G32x5_SHIFT))
#else
    #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (x)
#endif

// Blue.
#if SK_B16x5_B32x5_SHIFT > 0
    #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_slli_epi32(x, SK_B16x5_B32x5_SHIFT))
#elif SK_B16x5_B32x5_SHIFT < 0
    #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_srli_epi32(x, -SK_B16x5_B32x5_SHIFT))
#else
    #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (x)
#endif
546bfd42da5af2b038e1be834a59325636709089a1ebungeman@google.com
54743fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.orgstatic __m128i SkBlendLCD16_SSE2(__m128i &src, __m128i &dst,
54843fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org                                 __m128i &mask, __m128i &srcA) {
54943fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org    // In the following comments, the components of src, dst and mask are
55043fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org    // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked
55143fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org    // by an R, G, B, or A suffix. Components of one of the four pixels that
55243fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org    // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for
55343fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org    // example is the blue channel of the second destination pixel. Memory
55443fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org    // layout is shown for an ARGB byte order in a color value.
55543fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org
55643fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org    // src and srcA store 8-bit values interleaved with zeros.
55743fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org    // src  = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
55843fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org    // srcA = (srcA, 0, srcA, 0, srcA, 0, srcA, 0,
55943fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org    //         srcA, 0, srcA, 0, srcA, 0, srcA, 0)
56043fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org    // mask stores 16-bit values (compressed three channels) interleaved with zeros.
56143fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org    // Lo and Hi denote the low and high bytes of a 16-bit value, respectively.
56243fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org    // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
56343fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org    //         m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
56443fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org
56540d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com    // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.
56643fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org    // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0)
567bfd42da5af2b038e1be834a59325636709089a1ebungeman@google.com    __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask),
568bfd42da5af2b038e1be834a59325636709089a1ebungeman@google.com                              _mm_set1_epi32(0x1F << SK_R32_SHIFT));
569bfd42da5af2b038e1be834a59325636709089a1ebungeman@google.com
57043fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org    // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0)
571bfd42da5af2b038e1be834a59325636709089a1ebungeman@google.com    __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask),
572bfd42da5af2b038e1be834a59325636709089a1ebungeman@google.com                              _mm_set1_epi32(0x1F << SK_G32_SHIFT));
573935e9f4fafdfc64130e6be9ea2bb30e3bafd852armistry@google.com
57443fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org    // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B)
575bfd42da5af2b038e1be834a59325636709089a1ebungeman@google.com    __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask),
576bfd42da5af2b038e1be834a59325636709089a1ebungeman@google.com                              _mm_set1_epi32(0x1F << SK_B32_SHIFT));
577935e9f4fafdfc64130e6be9ea2bb30e3bafd852armistry@google.com
57840d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com    // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3)
57943fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org    // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an
58043fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org    // 8-bit position
58143fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org    // mask = (0, m0R, m0G, m0B, 0, m1R, m1G, m1B,
58243fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org    //         0, m2R, m2G, m2B, 0, m3R, m3G, m3B)
58340d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com    mask = _mm_or_si128(_mm_or_si128(r, g), b);
58440d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com
585935e9f4fafdfc64130e6be9ea2bb30e3bafd852armistry@google.com    // Interleave R,G,B into the lower byte of word.
58643fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org    // i.e. split the sixteen 8-bit values from mask into two sets of eight
58743fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org    // 16-bit values, padded by zero.
58840d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com    __m128i maskLo, maskHi;
58943fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org    // maskLo = (0, 0, m0R, 0, m0G, 0, m0B, 0, 0, 0, m1R, 0, m1G, 0, m1B, 0)
59040d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com    maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128());
59143fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org    // maskHi = (0, 0, m2R, 0, m2G, 0, m2B, 0, 0, 0, m3R, 0, m3G, 0, m3B, 0)
59240d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com    maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128());
59340d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com
59443fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org    // Upscale from 0..31 to 0..32
59543fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org    // (allows to replace division by left-shift further down)
59643fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org    // Left-shift each component by 4 and add the result back to that component,
59743fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org    // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32
59840d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com    maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4));
59940d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com    maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4));
60040d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com
60143fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org    // Multiply each component of maskLo and maskHi by srcA
60243fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org    maskLo = _mm_mullo_epi16(maskLo, srcA);
60343fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org    maskHi = _mm_mullo_epi16(maskHi, srcA);
60440d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com
60543fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org    // Left shift mask components by 8 (divide by 256)
60640d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com    maskLo = _mm_srli_epi16(maskLo, 8);
60740d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com    maskHi = _mm_srli_epi16(maskHi, 8);
60840d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com
60943fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org    // Interleave R,G,B into the lower byte of the word
61043fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org    // dstLo = (0, 0, d0R, 0, d0G, 0, d0B, 0, 0, 0, d1R, 0, d1G, 0, d1B, 0)
61140d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com    __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128());
61243fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org    // dstLo = (0, 0, d2R, 0, d2G, 0, d2B, 0, 0, 0, d3R, 0, d3G, 0, d3B, 0)
61340d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com    __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128());
61440d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com
61543fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org    // mask = (src - dst) * mask
61643fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org    maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo));
61743fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org    maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi));
61840d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com
61943fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org    // mask = (src - dst) * mask >> 5
62040d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com    maskLo = _mm_srai_epi16(maskLo, 5);
62140d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com    maskHi = _mm_srai_epi16(maskHi, 5);
62240d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com
62340d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com    // Add two pixels into result.
62443fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org    // result = dst + ((src - dst) * mask >> 5)
62540d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com    __m128i resultLo = _mm_add_epi16(dstLo, maskLo);
62640d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com    __m128i resultHi = _mm_add_epi16(dstHi, maskHi);
62740d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com
62843fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org    // Pack into 4 32bit dst pixels.
62943fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org    // resultLo and resultHi contain eight 16-bit components (two pixels) each.
63043fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org    // Merge into one SSE register with sixteen 8-bit values (four pixels),
63143fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org    // clamping to 255 if necessary.
63240d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com    return _mm_packus_epi16(resultLo, resultHi);
63340d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com}
63440d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com
63543fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.orgstatic __m128i SkBlendLCD16Opaque_SSE2(__m128i &src, __m128i &dst,
63640d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com                                       __m128i &mask) {
63743fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org    // In the following comments, the components of src, dst and mask are
63843fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org    // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked
63943fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org    // by an R, G, B, or A suffix. Components of one of the four pixels that
64043fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org    // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for
64143fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org    // example is the blue channel of the second destination pixel. Memory
64243fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org    // layout is shown for an ARGB byte order in a color value.
64343fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org
64443fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org    // src and srcA store 8-bit values interleaved with zeros.
64543fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org    // src  = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
64643fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org    // mask stores 16-bit values (shown as high and low bytes) interleaved with
64743fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org    // zeros
64843fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org    // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
64943fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org    //         m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
65043fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org
65140d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com    // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.
65243fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org    // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0)
653bfd42da5af2b038e1be834a59325636709089a1ebungeman@google.com    __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask),
654bfd42da5af2b038e1be834a59325636709089a1ebungeman@google.com                              _mm_set1_epi32(0x1F << SK_R32_SHIFT));
65540d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com
65643fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org    // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0)
657bfd42da5af2b038e1be834a59325636709089a1ebungeman@google.com    __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask),
658bfd42da5af2b038e1be834a59325636709089a1ebungeman@google.com                              _mm_set1_epi32(0x1F << SK_G32_SHIFT));
659935e9f4fafdfc64130e6be9ea2bb30e3bafd852armistry@google.com
66043fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org    // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B)
661bfd42da5af2b038e1be834a59325636709089a1ebungeman@google.com    __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask),
662bfd42da5af2b038e1be834a59325636709089a1ebungeman@google.com                              _mm_set1_epi32(0x1F << SK_B32_SHIFT));
66340d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com
66440d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com    // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3)
66543fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org    // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an
66643fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org    // 8-bit position
66743fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org    // mask = (0, m0R, m0G, m0B, 0, m1R, m1G, m1B,
66843fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org    //         0, m2R, m2G, m2B, 0, m3R, m3G, m3B)
66940d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com    mask = _mm_or_si128(_mm_or_si128(r, g), b);
67040d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com
671935e9f4fafdfc64130e6be9ea2bb30e3bafd852armistry@google.com    // Interleave R,G,B into the lower byte of word.
67243fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org    // i.e. split the sixteen 8-bit values from mask into two sets of eight
67343fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org    // 16-bit values, padded by zero.
67440d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com    __m128i maskLo, maskHi;
67543fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org    // maskLo = (0, 0, m0R, 0, m0G, 0, m0B, 0, 0, 0, m1R, 0, m1G, 0, m1B, 0)
67640d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com    maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128());
67743fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org    // maskHi = (0, 0, m2R, 0, m2G, 0, m2B, 0, 0, 0, m3R, 0, m3G, 0, m3B, 0)
67840d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com    maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128());
67940d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com
68043fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org    // Upscale from 0..31 to 0..32
68143fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org    // (allows to replace division by left-shift further down)
68243fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org    // Left-shift each component by 4 and add the result back to that component,
68343fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org    // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32
68440d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com    maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4));
68540d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com    maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4));
68640d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com
68743fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org    // Interleave R,G,B into the lower byte of the word
68843fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org    // dstLo = (0, 0, d0R, 0, d0G, 0, d0B, 0, 0, 0, d1R, 0, d1G, 0, d1B, 0)
68940d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com    __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128());
69043fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org    // dstLo = (0, 0, d2R, 0, d2G, 0, d2B, 0, 0, 0, d3R, 0, d3G, 0, d3B, 0)
69140d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com    __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128());
69240d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com
69343fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org    // mask = (src - dst) * mask
69443fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org    maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo));
69543fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org    maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi));
69640d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com
69743fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org    // mask = (src - dst) * mask >> 5
69840d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com    maskLo = _mm_srai_epi16(maskLo, 5);
69940d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com    maskHi = _mm_srai_epi16(maskHi, 5);
70040d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com
70140d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com    // Add two pixels into result.
70243fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org    // result = dst + ((src - dst) * mask >> 5)
70340d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com    __m128i resultLo = _mm_add_epi16(dstLo, maskLo);
70440d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com    __m128i resultHi = _mm_add_epi16(dstHi, maskHi);
70540d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com
706b2229f21294a4a185be6bba86395f088f3100aa0bungeman@google.com    // Pack into 4 32bit dst pixels and force opaque.
70743fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org    // resultLo and resultHi contain eight 16-bit components (two pixels) each.
70843fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org    // Merge into one SSE regsiter with sixteen 8-bit values (four pixels),
70943fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org    // clamping to 255 if necessary. Set alpha components to 0xFF.
710b2229f21294a4a185be6bba86395f088f3100aa0bungeman@google.com    return _mm_or_si128(_mm_packus_epi16(resultLo, resultHi),
711b2229f21294a4a185be6bba86395f088f3100aa0bungeman@google.com                        _mm_set1_epi32(SK_A32_MASK << SK_A32_SHIFT));
71240d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com}
71340d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com
71443fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.orgvoid SkBlitLCD16Row_SSE2(SkPMColor dst[], const uint16_t mask[],
71543fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org                         SkColor src, int width, SkPMColor) {
71640d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com    if (width <= 0) {
71740d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com        return;
71840d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com    }
71940d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com
72043fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org    int srcA = SkColorGetA(src);
72143fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org    int srcR = SkColorGetR(src);
72243fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org    int srcG = SkColorGetG(src);
72343fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org    int srcB = SkColorGetB(src);
724935e9f4fafdfc64130e6be9ea2bb30e3bafd852armistry@google.com
72540d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com    srcA = SkAlpha255To256(srcA);
72640d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com
72740d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com    if (width >= 4) {
72840d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com        SkASSERT(((size_t)dst & 0x03) == 0);
72940d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com        while (((size_t)dst & 0x0F) != 0) {
73043fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org            *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *mask);
73143fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org            mask++;
73240d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com            dst++;
73340d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com            width--;
73440d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com        }
73540d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com
73640d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com        __m128i *d = reinterpret_cast<__m128i*>(dst);
73743fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org        // Set alpha to 0xFF and replicate source four times in SSE register.
73843fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org        __m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB));
73943fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org        // Interleave with zeros to get two sets of four 16-bit values.
74043fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org        src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128());
74143fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org        // Set srcA_sse to contain eight copies of srcA, padded with zero.
74243fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org        // src_sse=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
74343fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org        __m128i srcA_sse = _mm_set1_epi16(srcA);
74440d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com        while (width >= 4) {
74543fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org            // Load four destination pixels into dst_sse.
74643fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org            __m128i dst_sse = _mm_load_si128(d);
74743fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org            // Load four 16-bit masks into lower half of mask_sse.
74843fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org            __m128i mask_sse = _mm_loadl_epi64(
74943fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org                                   reinterpret_cast<const __m128i*>(mask));
75043fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org
75143fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org            // Check whether masks are equal to 0 and get the highest bit
75243fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org            // of each byte of result, if masks are all zero, we will get
75340d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com            // pack_cmp to 0xFFFF
75443fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org            int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse,
75540d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com                                             _mm_setzero_si128()));
75640d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com
75740d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com            // if mask pixels are not all zero, we will blend the dst pixels
75840d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com            if (pack_cmp != 0xFFFF) {
759935e9f4fafdfc64130e6be9ea2bb30e3bafd852armistry@google.com                // Unpack 4 16bit mask pixels to
76043fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org                // mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
76143fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org                //             m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
76243fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org                mask_sse = _mm_unpacklo_epi16(mask_sse,
76343fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org                                              _mm_setzero_si128());
76440d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com
76540d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com                // Process 4 32bit dst pixels
76643fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org                __m128i result = SkBlendLCD16_SSE2(src_sse, dst_sse,
76743fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org                                                   mask_sse, srcA_sse);
76840d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com                _mm_store_si128(d, result);
76940d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com            }
77040d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com
77140d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com            d++;
77243fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org            mask += 4;
77340d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com            width -= 4;
77440d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com        }
77540d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com
77640d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com        dst = reinterpret_cast<SkPMColor*>(d);
77740d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com    }
77840d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com
77940d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com    while (width > 0) {
78043fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org        *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *mask);
78143fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org        mask++;
78240d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com        dst++;
783935e9f4fafdfc64130e6be9ea2bb30e3bafd852armistry@google.com        width--;
78440d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com    }
78540d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com}
78640d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com
78743fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.orgvoid SkBlitLCD16OpaqueRow_SSE2(SkPMColor dst[], const uint16_t mask[],
78843fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org                               SkColor src, int width, SkPMColor opaqueDst) {
78940d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com    if (width <= 0) {
79040d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com        return;
79140d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com    }
79240d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com
79343fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org    int srcR = SkColorGetR(src);
79443fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org    int srcG = SkColorGetG(src);
79543fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org    int srcB = SkColorGetB(src);
79640d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com
79740d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com    if (width >= 4) {
79840d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com        SkASSERT(((size_t)dst & 0x03) == 0);
79940d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com        while (((size_t)dst & 0x0F) != 0) {
80043fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org            *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst);
80143fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org            mask++;
80240d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com            dst++;
80340d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com            width--;
80440d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com        }
80540d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com
80640d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com        __m128i *d = reinterpret_cast<__m128i*>(dst);
80743fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org        // Set alpha to 0xFF and replicate source four times in SSE register.
80843fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org        __m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB));
80943fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org        // Set srcA_sse to contain eight copies of srcA, padded with zero.
81043fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org        // src_sse=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
81143fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org        src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128());
81240d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com        while (width >= 4) {
81343fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org            // Load four destination pixels into dst_sse.
81443fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org            __m128i dst_sse = _mm_load_si128(d);
81543fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org            // Load four 16-bit masks into lower half of mask_sse.
81643fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org            __m128i mask_sse = _mm_loadl_epi64(
81743fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org                                   reinterpret_cast<const __m128i*>(mask));
81843fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org
81943fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org            // Check whether masks are equal to 0 and get the highest bit
82043fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org            // of each byte of result, if masks are all zero, we will get
82140d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com            // pack_cmp to 0xFFFF
82243fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org            int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse,
82340d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com                                             _mm_setzero_si128()));
82440d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com
82540d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com            // if mask pixels are not all zero, we will blend the dst pixels
82640d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com            if (pack_cmp != 0xFFFF) {
827935e9f4fafdfc64130e6be9ea2bb30e3bafd852armistry@google.com                // Unpack 4 16bit mask pixels to
82843fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org                // mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
82943fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org                //             m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
83043fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org                mask_sse = _mm_unpacklo_epi16(mask_sse,
83143fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org                                              _mm_setzero_si128());
83240d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com
83340d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com                // Process 4 32bit dst pixels
83443fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org                __m128i result = SkBlendLCD16Opaque_SSE2(src_sse, dst_sse,
83543fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org                                                         mask_sse);
83640d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com                _mm_store_si128(d, result);
83740d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com            }
83840d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com
83940d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com            d++;
84043fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org            mask += 4;
84140d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com            width -= 4;
84240d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com        }
84340d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com
84440d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com        dst = reinterpret_cast<SkPMColor*>(d);
84540d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com    }
84640d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com
84740d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com    while (width > 0) {
84843fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org        *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst);
84943fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org        mask++;
85040d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com        dst++;
851935e9f4fafdfc64130e6be9ea2bb30e3bafd852armistry@google.com        width--;
85240d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com    }
85340d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com}
854