SkBlitRow_opts_SSE2.cpp revision edb606cb999887d54629f361bcbf57c5fede1bb0
1ec3ed6a5ebf6f2c406d7bcf94b6bc34fcaeb976eepoger@google.com
29272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org/*
3ec3ed6a5ebf6f2c406d7bcf94b6bc34fcaeb976eepoger@google.com * Copyright 2009 The Android Open Source Project
4ec3ed6a5ebf6f2c406d7bcf94b6bc34fcaeb976eepoger@google.com *
5ec3ed6a5ebf6f2c406d7bcf94b6bc34fcaeb976eepoger@google.com * Use of this source code is governed by a BSD-style license that can be
6ec3ed6a5ebf6f2c406d7bcf94b6bc34fcaeb976eepoger@google.com * found in the LICENSE file.
79272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org */
89272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org
9ec3ed6a5ebf6f2c406d7bcf94b6bc34fcaeb976eepoger@google.com
104e753558fc8cc2f77cbcd46fba80d8612e836a1esenorblanco@chromium.org#include "SkBlitRow_opts_SSE2.h"
119272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org#include "SkColorPriv.h"
12c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org#include "SkUtils.h"
139272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org
149272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org#include <emmintrin.h>
159272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org
169272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org/* SSE2 version of S32_Blend_BlitRow32()
179272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org * portable version is in core/SkBlitRow_D32.cpp
189272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org */
194e753558fc8cc2f77cbcd46fba80d8612e836a1esenorblanco@chromium.orgvoid S32_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
204e753558fc8cc2f77cbcd46fba80d8612e836a1esenorblanco@chromium.org                              const SkPMColor* SK_RESTRICT src,
214e753558fc8cc2f77cbcd46fba80d8612e836a1esenorblanco@chromium.org                              int count, U8CPU alpha) {
229272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org    SkASSERT(alpha <= 255);
239272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org    if (count <= 0) {
249272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org        return;
259272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org    }
269272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org
279272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org    uint32_t src_scale = SkAlpha255To256(alpha);
289272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org    uint32_t dst_scale = 256 - src_scale;
299272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org
30dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org    if (count >= 4) {
31dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        SkASSERT(((size_t)dst & 0x03) == 0);
32dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        while (((size_t)dst & 0x0F) != 0) {
33dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
34dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            src++;
35dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            dst++;
36dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            count--;
37dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        }
38dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
39dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        const __m128i *s = reinterpret_cast<const __m128i*>(src);
40dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        __m128i *d = reinterpret_cast<__m128i*>(dst);
41dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
42dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        __m128i src_scale_wide = _mm_set1_epi16(src_scale);
43dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        __m128i dst_scale_wide = _mm_set1_epi16(dst_scale);
44dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        while (count >= 4) {
45dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Load 4 pixels each of src and dest.
46dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            __m128i src_pixel = _mm_loadu_si128(s);
47dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            __m128i dst_pixel = _mm_load_si128(d);
48dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
49dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Get red and blue pixels into lower byte of each word.
50dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
51dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
52dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
53dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Get alpha and green into lower byte of each word.
54dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
55dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            __m128i src_ag = _mm_srli_epi16(src_pixel, 8);
56dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
57dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Multiply by scale.
58dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            src_rb = _mm_mullo_epi16(src_rb, src_scale_wide);
59dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            src_ag = _mm_mullo_epi16(src_ag, src_scale_wide);
60dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            dst_rb = _mm_mullo_epi16(dst_rb, dst_scale_wide);
61dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            dst_ag = _mm_mullo_epi16(dst_ag, dst_scale_wide);
62dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
63dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Divide by 256.
64dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            src_rb = _mm_srli_epi16(src_rb, 8);
65dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            dst_rb = _mm_srli_epi16(dst_rb, 8);
66dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            src_ag = _mm_andnot_si128(rb_mask, src_ag);
67dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
68dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
69dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Combine back into RGBA.
70dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            src_pixel = _mm_or_si128(src_rb, src_ag);
71dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            dst_pixel = _mm_or_si128(dst_rb, dst_ag);
72dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
73dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Add result
74dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
75dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            _mm_store_si128(d, result);
76dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            s++;
77dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            d++;
78dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            count -= 4;
79dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        }
80dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        src = reinterpret_cast<const SkPMColor*>(s);
81dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        dst = reinterpret_cast<SkPMColor*>(d);
829272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org    }
839272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org
844e753558fc8cc2f77cbcd46fba80d8612e836a1esenorblanco@chromium.org    while (count > 0) {
859272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org        *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
869272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org        src++;
879272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org        dst++;
889272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org        count--;
899272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org    }
909272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org}
919272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org
924e753558fc8cc2f77cbcd46fba80d8612e836a1esenorblanco@chromium.orgvoid S32A_Opaque_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
934e753558fc8cc2f77cbcd46fba80d8612e836a1esenorblanco@chromium.org                                const SkPMColor* SK_RESTRICT src,
944e753558fc8cc2f77cbcd46fba80d8612e836a1esenorblanco@chromium.org                                int count, U8CPU alpha) {
959272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org    SkASSERT(alpha == 255);
969272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org    if (count <= 0) {
979272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org        return;
989272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org    }
99dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
100dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org    if (count >= 4) {
101dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        SkASSERT(((size_t)dst & 0x03) == 0);
102dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        while (((size_t)dst & 0x0F) != 0) {
103dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            *dst = SkPMSrcOver(*src, *dst);
104dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            src++;
105dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            dst++;
106dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            count--;
107dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        }
108dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
109dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        const __m128i *s = reinterpret_cast<const __m128i*>(src);
110dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        __m128i *d = reinterpret_cast<__m128i*>(dst);
1119272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org#ifdef SK_USE_ACCURATE_BLENDING
112dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
113dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        __m128i c_128 = _mm_set1_epi16(128);  // 8 copies of 128 (16-bit)
114dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        __m128i c_255 = _mm_set1_epi16(255);  // 8 copies of 255 (16-bit)
115dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        while (count >= 4) {
116dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Load 4 pixels
117dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            __m128i src_pixel = _mm_loadu_si128(s);
118dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            __m128i dst_pixel = _mm_load_si128(d);
119dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
120dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
121f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org            __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
122dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Shift alphas down to lower 8 bits of each quad.
123dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            __m128i alpha = _mm_srli_epi32(src_pixel, 24);
124dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
125dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Copy alpha to upper 3rd byte of each quad
126dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            alpha = _mm_or_si128(alpha, _mm_slli_epi32(alpha, 16));
127dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
128dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Subtract alphas from 255, to get 0..255
129dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            alpha = _mm_sub_epi16(c_255, alpha);
130dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
131dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Multiply by red and blue by src alpha.
132dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            dst_rb = _mm_mullo_epi16(dst_rb, alpha);
133dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Multiply by alpha and green by src alpha.
134dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            dst_ag = _mm_mullo_epi16(dst_ag, alpha);
135dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
136dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // dst_rb_low = (dst_rb >> 8)
137dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            __m128i dst_rb_low = _mm_srli_epi16(dst_rb, 8);
138dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            __m128i dst_ag_low = _mm_srli_epi16(dst_ag, 8);
139dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
140dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // dst_rb = (dst_rb + dst_rb_low + 128) >> 8
141dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            dst_rb = _mm_add_epi16(dst_rb, dst_rb_low);
142dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            dst_rb = _mm_add_epi16(dst_rb, c_128);
143dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            dst_rb = _mm_srli_epi16(dst_rb, 8);
144dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
145dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // dst_ag = (dst_ag + dst_ag_low + 128) & ag_mask
146dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            dst_ag = _mm_add_epi16(dst_ag, dst_ag_low);
147dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            dst_ag = _mm_add_epi16(dst_ag, c_128);
148dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
149dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
150dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Combine back into RGBA.
151dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            dst_pixel = _mm_or_si128(dst_rb, dst_ag);
152dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
153dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Add result
154dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
155dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            _mm_store_si128(d, result);
156dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            s++;
157dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            d++;
158dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            count -= 4;
159dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        }
160dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org    #else
161dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
162dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        __m128i c_256 = _mm_set1_epi16(0x0100);  // 8 copies of 256 (16-bit)
163dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        while (count >= 4) {
164dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Load 4 pixels
165dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            __m128i src_pixel = _mm_loadu_si128(s);
166dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            __m128i dst_pixel = _mm_load_si128(d);
167dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
168dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
169f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org            __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
170dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
171f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org            // (a0, g0, a1, g1, a2, g2, a3, g3)  (low byte of each word)
172f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org            __m128i alpha = _mm_srli_epi16(src_pixel, 8);
173f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org
174f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org            // (a0, a0, a1, a1, a2, g2, a3, g3)
175f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org            alpha = _mm_shufflehi_epi16(alpha, 0xF5);
176f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org
177f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org            // (a0, a0, a1, a1, a2, a2, a3, a3)
178f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org            alpha = _mm_shufflelo_epi16(alpha, 0xF5);
179dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
180dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Subtract alphas from 256, to get 1..256
181dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            alpha = _mm_sub_epi16(c_256, alpha);
182dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
183dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Multiply by red and blue by src alpha.
184dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            dst_rb = _mm_mullo_epi16(dst_rb, alpha);
185dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Multiply by alpha and green by src alpha.
186dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            dst_ag = _mm_mullo_epi16(dst_ag, alpha);
187dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
188dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Divide by 256.
189dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            dst_rb = _mm_srli_epi16(dst_rb, 8);
190dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
191dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Mask out high bits (already in the right place)
192dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
193dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
194dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Combine back into RGBA.
195dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            dst_pixel = _mm_or_si128(dst_rb, dst_ag);
196dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
197dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Add result
198dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
199dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            _mm_store_si128(d, result);
200dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            s++;
201dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            d++;
202dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            count -= 4;
203dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        }
2049272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org#endif
205dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        src = reinterpret_cast<const SkPMColor*>(s);
206dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        dst = reinterpret_cast<SkPMColor*>(d);
207dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org    }
2089272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org
2099272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org    while (count > 0) {
2109272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org        *dst = SkPMSrcOver(*src, *dst);
2119272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org        src++;
2129272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org        dst++;
2139272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org        count--;
2149272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org    }
2159272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org}
2169272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org
2174e753558fc8cc2f77cbcd46fba80d8612e836a1esenorblanco@chromium.orgvoid S32A_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
2184e753558fc8cc2f77cbcd46fba80d8612e836a1esenorblanco@chromium.org                               const SkPMColor* SK_RESTRICT src,
2194e753558fc8cc2f77cbcd46fba80d8612e836a1esenorblanco@chromium.org                               int count, U8CPU alpha) {
2209272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org    SkASSERT(alpha <= 255);
2219272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org    if (count <= 0) {
2229272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org        return;
2239272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org    }
2249272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org
225dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org    if (count >= 4) {
226dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        while (((size_t)dst & 0x0F) != 0) {
227dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            *dst = SkBlendARGB32(*src, *dst, alpha);
228dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            src++;
229dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            dst++;
230dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            count--;
231dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        }
232dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
233dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        uint32_t src_scale = SkAlpha255To256(alpha);
234dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
235dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        const __m128i *s = reinterpret_cast<const __m128i*>(src);
236dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        __m128i *d = reinterpret_cast<__m128i*>(dst);
237dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        __m128i src_scale_wide = _mm_set1_epi16(src_scale);
238dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
239dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        __m128i c_256 = _mm_set1_epi16(256);  // 8 copies of 256 (16-bit)
240dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        while (count >= 4) {
241dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Load 4 pixels each of src and dest.
242dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            __m128i src_pixel = _mm_loadu_si128(s);
243dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            __m128i dst_pixel = _mm_load_si128(d);
244dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
245dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Get red and blue pixels into lower byte of each word.
246dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
247dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
248dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
249dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Get alpha and green into lower byte of each word.
250dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
251dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            __m128i src_ag = _mm_srli_epi16(src_pixel, 8);
252dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
253dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Put per-pixel alpha in low byte of each word.
254dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            __m128i dst_alpha = _mm_shufflehi_epi16(src_ag, 0xF5);
255dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            dst_alpha = _mm_shufflelo_epi16(dst_alpha, 0xF5);
256dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
257dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // dst_alpha = dst_alpha * src_scale
258dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            dst_alpha = _mm_mullo_epi16(dst_alpha, src_scale_wide);
259dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
260dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Divide by 256.
261dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            dst_alpha = _mm_srli_epi16(dst_alpha, 8);
262dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
263dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Subtract alphas from 256, to get 1..256
264dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            dst_alpha = _mm_sub_epi16(c_256, dst_alpha);
265dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
266dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Multiply red and blue by dst pixel alpha.
267dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            dst_rb = _mm_mullo_epi16(dst_rb, dst_alpha);
268dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Multiply alpha and green by dst pixel alpha.
269dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            dst_ag = _mm_mullo_epi16(dst_ag, dst_alpha);
270dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
271dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Multiply red and blue by global alpha.
272dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            src_rb = _mm_mullo_epi16(src_rb, src_scale_wide);
273dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Multiply alpha and green by global alpha.
274dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            src_ag = _mm_mullo_epi16(src_ag, src_scale_wide);
275dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
276dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Divide by 256.
277dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            dst_rb = _mm_srli_epi16(dst_rb, 8);
278dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            src_rb = _mm_srli_epi16(src_rb, 8);
279dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
280dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Mask out low bits (goodies already in the right place; no need to divide)
281dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
282dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            src_ag = _mm_andnot_si128(rb_mask, src_ag);
283dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
284dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Combine back into RGBA.
285dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            dst_pixel = _mm_or_si128(dst_rb, dst_ag);
286dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            src_pixel = _mm_or_si128(src_rb, src_ag);
287dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
288dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Add two pixels into result.
289dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
290dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            _mm_store_si128(d, result);
291dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            s++;
292dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            d++;
293dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            count -= 4;
294dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        }
295dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        src = reinterpret_cast<const SkPMColor*>(s);
296dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        dst = reinterpret_cast<SkPMColor*>(d);
2979272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org    }
298dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
2999272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org    while (count > 0) {
3009272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org        *dst = SkBlendARGB32(*src, *dst, alpha);
3019272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org        src++;
3029272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org        dst++;
3039272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org        count--;
3049272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org    }
3059272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org}
306c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org
307c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org/* SSE2 version of Color32()
308c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org * portable version is in core/SkBlitRow_D32.cpp
309c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org */
310c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.orgvoid Color32_SSE2(SkPMColor dst[], const SkPMColor src[], int count,
311c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org                  SkPMColor color) {
312c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org
313c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org    if (count <= 0) {
314c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org        return;
315c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org    }
316c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org
317c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org    if (0 == color) {
318c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org        if (src != dst) {
319c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org            memcpy(dst, src, count * sizeof(SkPMColor));
320c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org        }
321c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org    }
322c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org
323c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org    unsigned colorA = SkGetPackedA32(color);
324c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org    if (255 == colorA) {
325c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org        sk_memset32(dst, color, count);
326c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org    } else {
327c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org        unsigned scale = 256 - SkAlpha255To256(colorA);
328c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org
329c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org        if (count >= 4) {
330c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org            SkASSERT(((size_t)dst & 0x03) == 0);
331c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org            while (((size_t)dst & 0x0F) != 0) {
332c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org                *dst = color + SkAlphaMulQ(*src, scale);
333c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org                src++;
334c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org                dst++;
335c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org                count--;
336c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org            }
337c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org
338c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org            const __m128i *s = reinterpret_cast<const __m128i*>(src);
339c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org            __m128i *d = reinterpret_cast<__m128i*>(dst);
340c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org            __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
341c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org            __m128i src_scale_wide = _mm_set1_epi16(scale);
342c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org            __m128i color_wide = _mm_set1_epi32(color);
343c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org            while (count >= 4) {
344c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org                // Load 4 pixels each of src and dest.
345c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org                __m128i src_pixel = _mm_loadu_si128(s);
346c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org
347c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org                // Get red and blue pixels into lower byte of each word.
348c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org                __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
349981d4798007b91e2e19c13b171583927a56df63breed@google.com
350c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org                // Get alpha and green into lower byte of each word.
351c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org                __m128i src_ag = _mm_srli_epi16(src_pixel, 8);
352c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org
353c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org                // Multiply by scale.
354c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org                src_rb = _mm_mullo_epi16(src_rb, src_scale_wide);
355c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org                src_ag = _mm_mullo_epi16(src_ag, src_scale_wide);
356c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org
357c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org                // Divide by 256.
358c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org                src_rb = _mm_srli_epi16(src_rb, 8);
359c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org                src_ag = _mm_andnot_si128(rb_mask, src_ag);
360c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org
361c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org                // Combine back into RGBA.
362c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org                src_pixel = _mm_or_si128(src_rb, src_ag);
363c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org
364c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org                // Add color to result.
365c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org                __m128i result = _mm_add_epi8(color_wide, src_pixel);
366c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org
367c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org                // Store result.
368c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org                _mm_store_si128(d, result);
369c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org                s++;
370c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org                d++;
371c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org                count -= 4;
372c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org            }
373c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org            src = reinterpret_cast<const SkPMColor*>(s);
374c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org            dst = reinterpret_cast<SkPMColor*>(d);
375c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org         }
376c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org
377c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org        while (count > 0) {
378c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org            *dst = color + SkAlphaMulQ(*src, scale);
379c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org            src += 1;
380c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org            dst += 1;
381c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org            count--;
382981d4798007b91e2e19c13b171583927a56df63breed@google.com        }
383c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org    }
384c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org}
385981d4798007b91e2e19c13b171583927a56df63breed@google.com
386edb606cb999887d54629f361bcbf57c5fede1bb0reed@google.comvoid SkARGB32_A8_BlitMask_SSE2(void* device, size_t dstRB, const void* maskPtr,
387edb606cb999887d54629f361bcbf57c5fede1bb0reed@google.com                               size_t maskRB, SkColor origColor,
388edb606cb999887d54629f361bcbf57c5fede1bb0reed@google.com                               int width, int height)
389981d4798007b91e2e19c13b171583927a56df63breed@google.com{
390ee467ee79d449ebe6ae7f7946e613cc70a479c69reed@google.com    SkPMColor color = SkPreMultiplyColor(origColor);
391981d4798007b91e2e19c13b171583927a56df63breed@google.com    size_t dstOffset = dstRB - (width << 2);
392981d4798007b91e2e19c13b171583927a56df63breed@google.com    size_t maskOffset = maskRB - width;
393981d4798007b91e2e19c13b171583927a56df63breed@google.com    SkPMColor* dst = (SkPMColor *)device;
394edb606cb999887d54629f361bcbf57c5fede1bb0reed@google.com    const uint8_t* mask = (const uint8_t*)maskPtr;
395981d4798007b91e2e19c13b171583927a56df63breed@google.com    do {
396981d4798007b91e2e19c13b171583927a56df63breed@google.com        int count = width;
397981d4798007b91e2e19c13b171583927a56df63breed@google.com        if (count >= 4) {
398981d4798007b91e2e19c13b171583927a56df63breed@google.com            while (((size_t)dst & 0x0F) != 0 && (count > 0)) {
399981d4798007b91e2e19c13b171583927a56df63breed@google.com                *dst = SkBlendARGB32(color, *dst, *mask);
400981d4798007b91e2e19c13b171583927a56df63breed@google.com                mask++;
401981d4798007b91e2e19c13b171583927a56df63breed@google.com                dst++;
402981d4798007b91e2e19c13b171583927a56df63breed@google.com                count--;
403981d4798007b91e2e19c13b171583927a56df63breed@google.com            }
404981d4798007b91e2e19c13b171583927a56df63breed@google.com            __m128i *d = reinterpret_cast<__m128i*>(dst);
405981d4798007b91e2e19c13b171583927a56df63breed@google.com            __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
406981d4798007b91e2e19c13b171583927a56df63breed@google.com            __m128i c_256 = _mm_set1_epi16(256);
407981d4798007b91e2e19c13b171583927a56df63breed@google.com            __m128i c_1 = _mm_set1_epi16(1);
408981d4798007b91e2e19c13b171583927a56df63breed@google.com            __m128i src_pixel = _mm_set1_epi32(color);
409981d4798007b91e2e19c13b171583927a56df63breed@google.com            while (count >= 4) {
410981d4798007b91e2e19c13b171583927a56df63breed@google.com                // Load 4 pixels each of src and dest.
411981d4798007b91e2e19c13b171583927a56df63breed@google.com                __m128i dst_pixel = _mm_load_si128(d);
412981d4798007b91e2e19c13b171583927a56df63breed@google.com
413981d4798007b91e2e19c13b171583927a56df63breed@google.com                //set the aphla value
414981d4798007b91e2e19c13b171583927a56df63breed@google.com                __m128i src_scale_wide =  _mm_set_epi8(0, *(mask+3),\
415981d4798007b91e2e19c13b171583927a56df63breed@google.com                                0, *(mask+3),0, \
416981d4798007b91e2e19c13b171583927a56df63breed@google.com                                *(mask+2),0, *(mask+2),\
417981d4798007b91e2e19c13b171583927a56df63breed@google.com                                0,*(mask+1), 0,*(mask+1),\
418981d4798007b91e2e19c13b171583927a56df63breed@google.com                                0, *mask,0,*mask);
419981d4798007b91e2e19c13b171583927a56df63breed@google.com
420981d4798007b91e2e19c13b171583927a56df63breed@google.com                //call SkAlpha255To256()
421981d4798007b91e2e19c13b171583927a56df63breed@google.com                src_scale_wide = _mm_add_epi16(src_scale_wide, c_1);
422981d4798007b91e2e19c13b171583927a56df63breed@google.com
423981d4798007b91e2e19c13b171583927a56df63breed@google.com                // Get red and blue pixels into lower byte of each word.
424981d4798007b91e2e19c13b171583927a56df63breed@google.com                __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
425981d4798007b91e2e19c13b171583927a56df63breed@google.com                __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
426981d4798007b91e2e19c13b171583927a56df63breed@google.com
427981d4798007b91e2e19c13b171583927a56df63breed@google.com                // Get alpha and green into lower byte of each word.
428981d4798007b91e2e19c13b171583927a56df63breed@google.com                __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
429981d4798007b91e2e19c13b171583927a56df63breed@google.com                __m128i src_ag = _mm_srli_epi16(src_pixel, 8);
430981d4798007b91e2e19c13b171583927a56df63breed@google.com
431981d4798007b91e2e19c13b171583927a56df63breed@google.com                // Put per-pixel alpha in low byte of each word.
432981d4798007b91e2e19c13b171583927a56df63breed@google.com                __m128i dst_alpha = _mm_shufflehi_epi16(src_ag, 0xF5);
433981d4798007b91e2e19c13b171583927a56df63breed@google.com                dst_alpha = _mm_shufflelo_epi16(dst_alpha, 0xF5);
434981d4798007b91e2e19c13b171583927a56df63breed@google.com
435981d4798007b91e2e19c13b171583927a56df63breed@google.com                // dst_alpha = dst_alpha * src_scale
436981d4798007b91e2e19c13b171583927a56df63breed@google.com                dst_alpha = _mm_mullo_epi16(dst_alpha, src_scale_wide);
437981d4798007b91e2e19c13b171583927a56df63breed@google.com
438981d4798007b91e2e19c13b171583927a56df63breed@google.com                // Divide by 256.
439981d4798007b91e2e19c13b171583927a56df63breed@google.com                dst_alpha = _mm_srli_epi16(dst_alpha, 8);
440981d4798007b91e2e19c13b171583927a56df63breed@google.com
441981d4798007b91e2e19c13b171583927a56df63breed@google.com                // Subtract alphas from 256, to get 1..256
442981d4798007b91e2e19c13b171583927a56df63breed@google.com                dst_alpha = _mm_sub_epi16(c_256, dst_alpha);
443981d4798007b91e2e19c13b171583927a56df63breed@google.com                // Multiply red and blue by dst pixel alpha.
444981d4798007b91e2e19c13b171583927a56df63breed@google.com                dst_rb = _mm_mullo_epi16(dst_rb, dst_alpha);
445981d4798007b91e2e19c13b171583927a56df63breed@google.com                // Multiply alpha and green by dst pixel alpha.
446981d4798007b91e2e19c13b171583927a56df63breed@google.com                dst_ag = _mm_mullo_epi16(dst_ag, dst_alpha);
447981d4798007b91e2e19c13b171583927a56df63breed@google.com
448981d4798007b91e2e19c13b171583927a56df63breed@google.com                // Multiply red and blue by global alpha.
449981d4798007b91e2e19c13b171583927a56df63breed@google.com                src_rb = _mm_mullo_epi16(src_rb, src_scale_wide);
450981d4798007b91e2e19c13b171583927a56df63breed@google.com                // Multiply alpha and green by global alpha.
451981d4798007b91e2e19c13b171583927a56df63breed@google.com                src_ag = _mm_mullo_epi16(src_ag, src_scale_wide);
452981d4798007b91e2e19c13b171583927a56df63breed@google.com                // Divide by 256.
453981d4798007b91e2e19c13b171583927a56df63breed@google.com                dst_rb = _mm_srli_epi16(dst_rb, 8);
454981d4798007b91e2e19c13b171583927a56df63breed@google.com                src_rb = _mm_srli_epi16(src_rb, 8);
455981d4798007b91e2e19c13b171583927a56df63breed@google.com
456981d4798007b91e2e19c13b171583927a56df63breed@google.com                // Mask out low bits (goodies already in the right place; no need to divide)
457981d4798007b91e2e19c13b171583927a56df63breed@google.com                dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
458981d4798007b91e2e19c13b171583927a56df63breed@google.com                src_ag = _mm_andnot_si128(rb_mask, src_ag);
459981d4798007b91e2e19c13b171583927a56df63breed@google.com
460981d4798007b91e2e19c13b171583927a56df63breed@google.com                // Combine back into RGBA.
461981d4798007b91e2e19c13b171583927a56df63breed@google.com                dst_pixel = _mm_or_si128(dst_rb, dst_ag);
462981d4798007b91e2e19c13b171583927a56df63breed@google.com                __m128i tmp_src_pixel = _mm_or_si128(src_rb, src_ag);
463981d4798007b91e2e19c13b171583927a56df63breed@google.com
464981d4798007b91e2e19c13b171583927a56df63breed@google.com                // Add two pixels into result.
465981d4798007b91e2e19c13b171583927a56df63breed@google.com                __m128i result = _mm_add_epi8(tmp_src_pixel, dst_pixel);
466981d4798007b91e2e19c13b171583927a56df63breed@google.com                _mm_store_si128(d, result);
467981d4798007b91e2e19c13b171583927a56df63breed@google.com                // load the next 4 pixel
468981d4798007b91e2e19c13b171583927a56df63breed@google.com                mask = mask + 4;
469981d4798007b91e2e19c13b171583927a56df63breed@google.com                d++;
470981d4798007b91e2e19c13b171583927a56df63breed@google.com                count -= 4;
471981d4798007b91e2e19c13b171583927a56df63breed@google.com            }
472981d4798007b91e2e19c13b171583927a56df63breed@google.com            dst = reinterpret_cast<SkPMColor *>(d);
473981d4798007b91e2e19c13b171583927a56df63breed@google.com        }
474981d4798007b91e2e19c13b171583927a56df63breed@google.com        while(count > 0) {
475981d4798007b91e2e19c13b171583927a56df63breed@google.com            *dst= SkBlendARGB32(color, *dst, *mask);
476981d4798007b91e2e19c13b171583927a56df63breed@google.com            dst += 1;
477981d4798007b91e2e19c13b171583927a56df63breed@google.com            mask++;
478981d4798007b91e2e19c13b171583927a56df63breed@google.com            count --;
479981d4798007b91e2e19c13b171583927a56df63breed@google.com        }
480981d4798007b91e2e19c13b171583927a56df63breed@google.com        dst = (SkPMColor *)((char*)dst + dstOffset);
481981d4798007b91e2e19c13b171583927a56df63breed@google.com        mask += maskOffset;
482981d4798007b91e2e19c13b171583927a56df63breed@google.com    } while (--height != 0);
483981d4798007b91e2e19c13b171583927a56df63breed@google.com}
484