SkBlitRow_opts_SSE2.cpp revision dc7de745dd142cdc00ffed7963ebb030a0506f72
19272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org/*
29272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org **
39272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org ** Copyright 2009, The Android Open Source Project
49272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org **
59272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org ** Licensed under the Apache License, Version 2.0 (the "License");
69272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org ** you may not use this file except in compliance with the License.
79272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org ** You may obtain a copy of the License at
89272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org **
99272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org **     http://www.apache.org/licenses/LICENSE-2.0
109272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org **
119272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org ** Unless required by applicable law or agreed to in writing, software
129272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org ** distributed under the License is distributed on an "AS IS" BASIS,
139272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
149272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org ** See the License for the specific language governing permissions and
159272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org ** limitations under the License.
169272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org */
179272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org
184e753558fc8cc2f77cbcd46fba80d8612e836a1esenorblanco@chromium.org#include "SkBlitRow_opts_SSE2.h"
199272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org#include "SkColorPriv.h"
209272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org
219272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org#include <emmintrin.h>
229272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org
239272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org/* SSE2 version of S32_Blend_BlitRow32()
249272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org * portable version is in core/SkBlitRow_D32.cpp
259272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org */
264e753558fc8cc2f77cbcd46fba80d8612e836a1esenorblanco@chromium.orgvoid S32_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
274e753558fc8cc2f77cbcd46fba80d8612e836a1esenorblanco@chromium.org                              const SkPMColor* SK_RESTRICT src,
284e753558fc8cc2f77cbcd46fba80d8612e836a1esenorblanco@chromium.org                              int count, U8CPU alpha) {
299272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org    SkASSERT(alpha <= 255);
309272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org    if (count <= 0) {
319272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org        return;
329272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org    }
339272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org
349272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org    uint32_t src_scale = SkAlpha255To256(alpha);
359272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org    uint32_t dst_scale = 256 - src_scale;
369272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org
37dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org    if (count >= 4) {
38dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        SkASSERT(((size_t)dst & 0x03) == 0);
39dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        while (((size_t)dst & 0x0F) != 0) {
40dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
41dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            src++;
42dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            dst++;
43dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            count--;
44dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        }
45dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
46dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        const __m128i *s = reinterpret_cast<const __m128i*>(src);
47dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        __m128i *d = reinterpret_cast<__m128i*>(dst);
48dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
49dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        __m128i src_scale_wide = _mm_set1_epi16(src_scale);
50dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        __m128i dst_scale_wide = _mm_set1_epi16(dst_scale);
51dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        while (count >= 4) {
52dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Load 4 pixels each of src and dest.
53dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            __m128i src_pixel = _mm_loadu_si128(s);
54dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            __m128i dst_pixel = _mm_load_si128(d);
55dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
56dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Get red and blue pixels into lower byte of each word.
57dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
58dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
59dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
60dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Get alpha and green into lower byte of each word.
61dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
62dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            __m128i src_ag = _mm_srli_epi16(src_pixel, 8);
63dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
64dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Multiply by scale.
65dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            src_rb = _mm_mullo_epi16(src_rb, src_scale_wide);
66dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            src_ag = _mm_mullo_epi16(src_ag, src_scale_wide);
67dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            dst_rb = _mm_mullo_epi16(dst_rb, dst_scale_wide);
68dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            dst_ag = _mm_mullo_epi16(dst_ag, dst_scale_wide);
69dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
70dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Divide by 256.
71dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            src_rb = _mm_srli_epi16(src_rb, 8);
72dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            dst_rb = _mm_srli_epi16(dst_rb, 8);
73dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            src_ag = _mm_andnot_si128(rb_mask, src_ag);
74dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
75dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
76dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Combine back into RGBA.
77dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            src_pixel = _mm_or_si128(src_rb, src_ag);
78dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            dst_pixel = _mm_or_si128(dst_rb, dst_ag);
79dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
80dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Add result
81dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
82dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            _mm_store_si128(d, result);
83dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            s++;
84dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            d++;
85dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            count -= 4;
86dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        }
87dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        src = reinterpret_cast<const SkPMColor*>(s);
88dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        dst = reinterpret_cast<SkPMColor*>(d);
899272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org    }
909272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org
914e753558fc8cc2f77cbcd46fba80d8612e836a1esenorblanco@chromium.org    while (count > 0) {
929272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org        *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
939272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org        src++;
949272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org        dst++;
959272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org        count--;
969272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org    }
979272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org}
989272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org
994e753558fc8cc2f77cbcd46fba80d8612e836a1esenorblanco@chromium.orgvoid S32A_Opaque_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
1004e753558fc8cc2f77cbcd46fba80d8612e836a1esenorblanco@chromium.org                                const SkPMColor* SK_RESTRICT src,
1014e753558fc8cc2f77cbcd46fba80d8612e836a1esenorblanco@chromium.org                                int count, U8CPU alpha) {
1029272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org    SkASSERT(alpha == 255);
1039272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org    if (count <= 0) {
1049272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org        return;
1059272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org    }
106dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
107dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org    if (count >= 4) {
108dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        SkASSERT(((size_t)dst & 0x03) == 0);
109dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        while (((size_t)dst & 0x0F) != 0) {
110dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            *dst = SkPMSrcOver(*src, *dst);
111dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            src++;
112dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            dst++;
113dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            count--;
114dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        }
115dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
116dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        const __m128i *s = reinterpret_cast<const __m128i*>(src);
117dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        __m128i *d = reinterpret_cast<__m128i*>(dst);
1189272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org#ifdef SK_USE_ACCURATE_BLENDING
119dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
120dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        __m128i c_128 = _mm_set1_epi16(128);  // 8 copies of 128 (16-bit)
121dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        __m128i c_255 = _mm_set1_epi16(255);  // 8 copies of 255 (16-bit)
122dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        while (count >= 4) {
123dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Load 4 pixels
124dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            __m128i src_pixel = _mm_loadu_si128(s);
125dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            __m128i dst_pixel = _mm_load_si128(d);
126dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
127dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
128dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            __m128i dst_ag = _mm_andnot_si128(rb_mask, dst_pixel);
129dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            dst_ag = _mm_srli_epi16(dst_ag, 8);
130dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Shift alphas down to lower 8 bits of each quad.
131dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            __m128i alpha = _mm_srli_epi32(src_pixel, 24);
132dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
133dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Copy alpha to upper 3rd byte of each quad
134dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            alpha = _mm_or_si128(alpha, _mm_slli_epi32(alpha, 16));
135dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
136dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Subtract alphas from 255, to get 0..255
137dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            alpha = _mm_sub_epi16(c_255, alpha);
138dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
139dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Multiply by red and blue by src alpha.
140dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            dst_rb = _mm_mullo_epi16(dst_rb, alpha);
141dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Multiply by alpha and green by src alpha.
142dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            dst_ag = _mm_mullo_epi16(dst_ag, alpha);
143dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
144dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // dst_rb_low = (dst_rb >> 8)
145dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            __m128i dst_rb_low = _mm_srli_epi16(dst_rb, 8);
146dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            __m128i dst_ag_low = _mm_srli_epi16(dst_ag, 8);
147dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
148dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // dst_rb = (dst_rb + dst_rb_low + 128) >> 8
149dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            dst_rb = _mm_add_epi16(dst_rb, dst_rb_low);
150dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            dst_rb = _mm_add_epi16(dst_rb, c_128);
151dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            dst_rb = _mm_srli_epi16(dst_rb, 8);
152dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
153dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // dst_ag = (dst_ag + dst_ag_low + 128) & ag_mask
154dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            dst_ag = _mm_add_epi16(dst_ag, dst_ag_low);
155dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            dst_ag = _mm_add_epi16(dst_ag, c_128);
156dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
157dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
158dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Combine back into RGBA.
159dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            dst_pixel = _mm_or_si128(dst_rb, dst_ag);
160dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
161dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Add result
162dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
163dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            _mm_store_si128(d, result);
164dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            s++;
165dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            d++;
166dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            count -= 4;
167dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        }
168dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org    #else
169dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
170dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        __m128i c_256 = _mm_set1_epi16(0x0100);  // 8 copies of 256 (16-bit)
171dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        while (count >= 4) {
172dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Load 4 pixels
173dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            __m128i src_pixel = _mm_loadu_si128(s);
174dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            __m128i dst_pixel = _mm_load_si128(d);
175dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
176dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
177dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            __m128i dst_ag = _mm_andnot_si128(rb_mask, dst_pixel);
178dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            dst_ag = _mm_srli_epi16(dst_ag, 8);
179dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Shift alphas down to lower 8 bits of each quad.
180dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            __m128i alpha = _mm_srli_epi32(src_pixel, 24);
181dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
182dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Copy alpha to upper 3rd byte of each quad
183dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            alpha = _mm_or_si128(alpha, _mm_slli_epi32(alpha, 16));
184dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
185dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Subtract alphas from 256, to get 1..256
186dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            alpha = _mm_sub_epi16(c_256, alpha);
187dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
188dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Multiply by red and blue by src alpha.
189dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            dst_rb = _mm_mullo_epi16(dst_rb, alpha);
190dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Multiply by alpha and green by src alpha.
191dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            dst_ag = _mm_mullo_epi16(dst_ag, alpha);
192dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
193dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Divide by 256.
194dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            dst_rb = _mm_srli_epi16(dst_rb, 8);
195dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
196dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Mask out high bits (already in the right place)
197dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
198dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
199dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Combine back into RGBA.
200dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            dst_pixel = _mm_or_si128(dst_rb, dst_ag);
201dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
202dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Add result
203dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
204dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            _mm_store_si128(d, result);
205dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            s++;
206dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            d++;
207dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            count -= 4;
208dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        }
2099272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org#endif
210dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        src = reinterpret_cast<const SkPMColor*>(s);
211dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        dst = reinterpret_cast<SkPMColor*>(d);
212dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org    }
2139272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org
2149272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org    while (count > 0) {
2159272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org        *dst = SkPMSrcOver(*src, *dst);
2169272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org        src++;
2179272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org        dst++;
2189272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org        count--;
2199272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org    }
2209272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org}
2219272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org
2224e753558fc8cc2f77cbcd46fba80d8612e836a1esenorblanco@chromium.orgvoid S32A_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
2234e753558fc8cc2f77cbcd46fba80d8612e836a1esenorblanco@chromium.org                               const SkPMColor* SK_RESTRICT src,
2244e753558fc8cc2f77cbcd46fba80d8612e836a1esenorblanco@chromium.org                               int count, U8CPU alpha) {
2259272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org    SkASSERT(alpha <= 255);
2269272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org    if (count <= 0) {
2279272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org        return;
2289272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org    }
2299272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org
230dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org    if (count >= 4) {
231dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        while (((size_t)dst & 0x0F) != 0) {
232dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            *dst = SkBlendARGB32(*src, *dst, alpha);
233dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            src++;
234dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            dst++;
235dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            count--;
236dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        }
237dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
238dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        uint32_t src_scale = SkAlpha255To256(alpha);
239dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
240dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        const __m128i *s = reinterpret_cast<const __m128i*>(src);
241dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        __m128i *d = reinterpret_cast<__m128i*>(dst);
242dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        __m128i src_scale_wide = _mm_set1_epi16(src_scale);
243dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
244dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        __m128i c_256 = _mm_set1_epi16(256);  // 8 copies of 256 (16-bit)
245dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        while (count >= 4) {
246dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Load 4 pixels each of src and dest.
247dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            __m128i src_pixel = _mm_loadu_si128(s);
248dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            __m128i dst_pixel = _mm_load_si128(d);
249dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
250dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Get red and blue pixels into lower byte of each word.
251dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
252dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
253dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
254dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Get alpha and green into lower byte of each word.
255dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
256dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            __m128i src_ag = _mm_srli_epi16(src_pixel, 8);
257dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
258dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Put per-pixel alpha in low byte of each word.
259dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            __m128i dst_alpha = _mm_shufflehi_epi16(src_ag, 0xF5);
260dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            dst_alpha = _mm_shufflelo_epi16(dst_alpha, 0xF5);
261dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
262dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // dst_alpha = dst_alpha * src_scale
263dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            dst_alpha = _mm_mullo_epi16(dst_alpha, src_scale_wide);
264dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
265dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Divide by 256.
266dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            dst_alpha = _mm_srli_epi16(dst_alpha, 8);
267dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
268dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Subtract alphas from 256, to get 1..256
269dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            dst_alpha = _mm_sub_epi16(c_256, dst_alpha);
270dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
271dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Multiply red and blue by dst pixel alpha.
272dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            dst_rb = _mm_mullo_epi16(dst_rb, dst_alpha);
273dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Multiply alpha and green by dst pixel alpha.
274dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            dst_ag = _mm_mullo_epi16(dst_ag, dst_alpha);
275dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
276dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Multiply red and blue by global alpha.
277dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            src_rb = _mm_mullo_epi16(src_rb, src_scale_wide);
278dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Multiply alpha and green by global alpha.
279dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            src_ag = _mm_mullo_epi16(src_ag, src_scale_wide);
280dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
281dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Divide by 256.
282dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            dst_rb = _mm_srli_epi16(dst_rb, 8);
283dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            src_rb = _mm_srli_epi16(src_rb, 8);
284dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
285dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Mask out low bits (goodies already in the right place; no need to divide)
286dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
287dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            src_ag = _mm_andnot_si128(rb_mask, src_ag);
288dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
289dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Combine back into RGBA.
290dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            dst_pixel = _mm_or_si128(dst_rb, dst_ag);
291dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            src_pixel = _mm_or_si128(src_rb, src_ag);
292dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
293dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Add two pixels into result.
294dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
295dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            _mm_store_si128(d, result);
296dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            s++;
297dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            d++;
298dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            count -= 4;
299dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        }
300dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        src = reinterpret_cast<const SkPMColor*>(s);
301dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        dst = reinterpret_cast<SkPMColor*>(d);
3029272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org    }
303dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
3049272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org    while (count > 0) {
3059272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org        *dst = SkBlendARGB32(*src, *dst, alpha);
3069272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org        src++;
3079272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org        dst++;
3089272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org        count--;
3099272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org    }
3109272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org}
311