SkBlitRow_opts_SSE2.cpp revision 40528743dbb9ce7f39f093e0cdc47849ac8887cf
1/*
2 **
3 ** Copyright 2009, The Android Open Source Project
4 **
5 ** Licensed under the Apache License, Version 2.0 (the "License");
6 ** you may not use this file except in compliance with the License.
7 ** You may obtain a copy of the License at
8 **
9 **     http://www.apache.org/licenses/LICENSE-2.0
10 **
11 ** Unless required by applicable law or agreed to in writing, software
12 ** distributed under the License is distributed on an "AS IS" BASIS,
13 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 ** See the License for the specific language governing permissions and
15 ** limitations under the License.
16 */
17
18#include "SkBlitRow_opts_SSE2.h"
19#include "SkColorPriv.h"
20#include "SkUtils.h"
21
22#include <emmintrin.h>
23
24/* SSE2 version of S32_Blend_BlitRow32()
25 * portable version is in core/SkBlitRow_D32.cpp
26 */
27void S32_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
28                              const SkPMColor* SK_RESTRICT src,
29                              int count, U8CPU alpha) {
30    SkASSERT(alpha <= 255);
31    if (count <= 0) {
32        return;
33    }
34
35    uint32_t src_scale = SkAlpha255To256(alpha);
36    uint32_t dst_scale = 256 - src_scale;
37
38    if (count >= 4) {
39        SkASSERT(((size_t)dst & 0x03) == 0);
40        while (((size_t)dst & 0x0F) != 0) {
41            *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
42            src++;
43            dst++;
44            count--;
45        }
46
47        const __m128i *s = reinterpret_cast<const __m128i*>(src);
48        __m128i *d = reinterpret_cast<__m128i*>(dst);
49        __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
50        __m128i src_scale_wide = _mm_set1_epi16(src_scale);
51        __m128i dst_scale_wide = _mm_set1_epi16(dst_scale);
52        while (count >= 4) {
53            // Load 4 pixels each of src and dest.
54            __m128i src_pixel = _mm_loadu_si128(s);
55            __m128i dst_pixel = _mm_load_si128(d);
56
57            // Get red and blue pixels into lower byte of each word.
58            __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
59            __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
60
61            // Get alpha and green into lower byte of each word.
62            __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
63            __m128i src_ag = _mm_srli_epi16(src_pixel, 8);
64
65            // Multiply by scale.
66            src_rb = _mm_mullo_epi16(src_rb, src_scale_wide);
67            src_ag = _mm_mullo_epi16(src_ag, src_scale_wide);
68            dst_rb = _mm_mullo_epi16(dst_rb, dst_scale_wide);
69            dst_ag = _mm_mullo_epi16(dst_ag, dst_scale_wide);
70
71            // Divide by 256.
72            src_rb = _mm_srli_epi16(src_rb, 8);
73            dst_rb = _mm_srli_epi16(dst_rb, 8);
74            src_ag = _mm_andnot_si128(rb_mask, src_ag);
75            dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
76
77            // Combine back into RGBA.
78            src_pixel = _mm_or_si128(src_rb, src_ag);
79            dst_pixel = _mm_or_si128(dst_rb, dst_ag);
80
81            // Add result
82            __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
83            _mm_store_si128(d, result);
84            s++;
85            d++;
86            count -= 4;
87        }
88        src = reinterpret_cast<const SkPMColor*>(s);
89        dst = reinterpret_cast<SkPMColor*>(d);
90    }
91
92    while (count > 0) {
93        *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
94        src++;
95        dst++;
96        count--;
97    }
98}
99
100void S32A_Opaque_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
101                                const SkPMColor* SK_RESTRICT src,
102                                int count, U8CPU alpha) {
103    SkASSERT(alpha == 255);
104    if (count <= 0) {
105        return;
106    }
107
108    if (count >= 4) {
109        SkASSERT(((size_t)dst & 0x03) == 0);
110        while (((size_t)dst & 0x0F) != 0) {
111            *dst = SkPMSrcOver(*src, *dst);
112            src++;
113            dst++;
114            count--;
115        }
116
117        const __m128i *s = reinterpret_cast<const __m128i*>(src);
118        __m128i *d = reinterpret_cast<__m128i*>(dst);
119#ifdef SK_USE_ACCURATE_BLENDING
120        __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
121        __m128i c_128 = _mm_set1_epi16(128);  // 8 copies of 128 (16-bit)
122        __m128i c_255 = _mm_set1_epi16(255);  // 8 copies of 255 (16-bit)
123        while (count >= 4) {
124            // Load 4 pixels
125            __m128i src_pixel = _mm_loadu_si128(s);
126            __m128i dst_pixel = _mm_load_si128(d);
127
128            __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
129            __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
130            // Shift alphas down to lower 8 bits of each quad.
131            __m128i alpha = _mm_srli_epi32(src_pixel, 24);
132
133            // Copy alpha to upper 3rd byte of each quad
134            alpha = _mm_or_si128(alpha, _mm_slli_epi32(alpha, 16));
135
136            // Subtract alphas from 255, to get 0..255
137            alpha = _mm_sub_epi16(c_255, alpha);
138
139            // Multiply by red and blue by src alpha.
140            dst_rb = _mm_mullo_epi16(dst_rb, alpha);
141            // Multiply by alpha and green by src alpha.
142            dst_ag = _mm_mullo_epi16(dst_ag, alpha);
143
144            // dst_rb_low = (dst_rb >> 8)
145            __m128i dst_rb_low = _mm_srli_epi16(dst_rb, 8);
146            __m128i dst_ag_low = _mm_srli_epi16(dst_ag, 8);
147
148            // dst_rb = (dst_rb + dst_rb_low + 128) >> 8
149            dst_rb = _mm_add_epi16(dst_rb, dst_rb_low);
150            dst_rb = _mm_add_epi16(dst_rb, c_128);
151            dst_rb = _mm_srli_epi16(dst_rb, 8);
152
153            // dst_ag = (dst_ag + dst_ag_low + 128) & ag_mask
154            dst_ag = _mm_add_epi16(dst_ag, dst_ag_low);
155            dst_ag = _mm_add_epi16(dst_ag, c_128);
156            dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
157
158            // Combine back into RGBA.
159            dst_pixel = _mm_or_si128(dst_rb, dst_ag);
160
161            // Add result
162            __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
163            _mm_store_si128(d, result);
164            s++;
165            d++;
166            count -= 4;
167        }
168    #else
169        __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
170        __m128i c_256 = _mm_set1_epi16(0x0100);  // 8 copies of 256 (16-bit)
171        while (count >= 4) {
172            // Load 4 pixels
173            __m128i src_pixel = _mm_loadu_si128(s);
174            __m128i dst_pixel = _mm_load_si128(d);
175
176            __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
177            __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
178
179            // (a0, g0, a1, g1, a2, g2, a3, g3)  (low byte of each word)
180            __m128i alpha = _mm_srli_epi16(src_pixel, 8);
181
182            // (a0, a0, a1, a1, a2, g2, a3, g3)
183            alpha = _mm_shufflehi_epi16(alpha, 0xF5);
184
185            // (a0, a0, a1, a1, a2, a2, a3, a3)
186            alpha = _mm_shufflelo_epi16(alpha, 0xF5);
187
188            // Subtract alphas from 256, to get 1..256
189            alpha = _mm_sub_epi16(c_256, alpha);
190
191            // Multiply by red and blue by src alpha.
192            dst_rb = _mm_mullo_epi16(dst_rb, alpha);
193            // Multiply by alpha and green by src alpha.
194            dst_ag = _mm_mullo_epi16(dst_ag, alpha);
195
196            // Divide by 256.
197            dst_rb = _mm_srli_epi16(dst_rb, 8);
198
199            // Mask out high bits (already in the right place)
200            dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
201
202            // Combine back into RGBA.
203            dst_pixel = _mm_or_si128(dst_rb, dst_ag);
204
205            // Add result
206            __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
207            _mm_store_si128(d, result);
208            s++;
209            d++;
210            count -= 4;
211        }
212#endif
213        src = reinterpret_cast<const SkPMColor*>(s);
214        dst = reinterpret_cast<SkPMColor*>(d);
215    }
216
217    while (count > 0) {
218        *dst = SkPMSrcOver(*src, *dst);
219        src++;
220        dst++;
221        count--;
222    }
223}
224
225void S32A_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
226                               const SkPMColor* SK_RESTRICT src,
227                               int count, U8CPU alpha) {
228    SkASSERT(alpha <= 255);
229    if (count <= 0) {
230        return;
231    }
232
233    if (count >= 4) {
234        while (((size_t)dst & 0x0F) != 0) {
235            *dst = SkBlendARGB32(*src, *dst, alpha);
236            src++;
237            dst++;
238            count--;
239        }
240
241        uint32_t src_scale = SkAlpha255To256(alpha);
242
243        const __m128i *s = reinterpret_cast<const __m128i*>(src);
244        __m128i *d = reinterpret_cast<__m128i*>(dst);
245        __m128i src_scale_wide = _mm_set1_epi16(src_scale);
246        __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
247        __m128i c_256 = _mm_set1_epi16(256);  // 8 copies of 256 (16-bit)
248        while (count >= 4) {
249            // Load 4 pixels each of src and dest.
250            __m128i src_pixel = _mm_loadu_si128(s);
251            __m128i dst_pixel = _mm_load_si128(d);
252
253            // Get red and blue pixels into lower byte of each word.
254            __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
255            __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
256
257            // Get alpha and green into lower byte of each word.
258            __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
259            __m128i src_ag = _mm_srli_epi16(src_pixel, 8);
260
261            // Put per-pixel alpha in low byte of each word.
262            __m128i dst_alpha = _mm_shufflehi_epi16(src_ag, 0xF5);
263            dst_alpha = _mm_shufflelo_epi16(dst_alpha, 0xF5);
264
265            // dst_alpha = dst_alpha * src_scale
266            dst_alpha = _mm_mullo_epi16(dst_alpha, src_scale_wide);
267
268            // Divide by 256.
269            dst_alpha = _mm_srli_epi16(dst_alpha, 8);
270
271            // Subtract alphas from 256, to get 1..256
272            dst_alpha = _mm_sub_epi16(c_256, dst_alpha);
273
274            // Multiply red and blue by dst pixel alpha.
275            dst_rb = _mm_mullo_epi16(dst_rb, dst_alpha);
276            // Multiply alpha and green by dst pixel alpha.
277            dst_ag = _mm_mullo_epi16(dst_ag, dst_alpha);
278
279            // Multiply red and blue by global alpha.
280            src_rb = _mm_mullo_epi16(src_rb, src_scale_wide);
281            // Multiply alpha and green by global alpha.
282            src_ag = _mm_mullo_epi16(src_ag, src_scale_wide);
283
284            // Divide by 256.
285            dst_rb = _mm_srli_epi16(dst_rb, 8);
286            src_rb = _mm_srli_epi16(src_rb, 8);
287
288            // Mask out low bits (goodies already in the right place; no need to divide)
289            dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
290            src_ag = _mm_andnot_si128(rb_mask, src_ag);
291
292            // Combine back into RGBA.
293            dst_pixel = _mm_or_si128(dst_rb, dst_ag);
294            src_pixel = _mm_or_si128(src_rb, src_ag);
295
296            // Add two pixels into result.
297            __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
298            _mm_store_si128(d, result);
299            s++;
300            d++;
301            count -= 4;
302        }
303        src = reinterpret_cast<const SkPMColor*>(s);
304        dst = reinterpret_cast<SkPMColor*>(d);
305    }
306
307    while (count > 0) {
308        *dst = SkBlendARGB32(*src, *dst, alpha);
309        src++;
310        dst++;
311        count--;
312    }
313}
314
315/* SSE2 version of Color32()
316 * portable version is in core/SkBlitRow_D32.cpp
317 */
318void Color32_SSE2(SkPMColor dst[], const SkPMColor src[], int count,
319                  SkPMColor color) {
320
321    if (count <= 0) {
322        return;
323    }
324
325    if (0 == color) {
326        if (src != dst) {
327            memcpy(dst, src, count * sizeof(SkPMColor));
328        }
329    }
330
331    unsigned colorA = SkGetPackedA32(color);
332    if (255 == colorA) {
333        sk_memset32(dst, color, count);
334    } else {
335        unsigned scale = 256 - SkAlpha255To256(colorA);
336
337        if (count >= 4) {
338            SkASSERT(((size_t)dst & 0x03) == 0);
339            while (((size_t)dst & 0x0F) != 0) {
340                *dst = color + SkAlphaMulQ(*src, scale);
341                src++;
342                dst++;
343                count--;
344            }
345
346            const __m128i *s = reinterpret_cast<const __m128i*>(src);
347            __m128i *d = reinterpret_cast<__m128i*>(dst);
348            __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
349            __m128i src_scale_wide = _mm_set1_epi16(scale);
350            __m128i color_wide = _mm_set1_epi32(color);
351            while (count >= 4) {
352                // Load 4 pixels each of src and dest.
353                __m128i src_pixel = _mm_loadu_si128(s);
354
355                // Get red and blue pixels into lower byte of each word.
356                __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
357
358                // Get alpha and green into lower byte of each word.
359                __m128i src_ag = _mm_srli_epi16(src_pixel, 8);
360
361                // Multiply by scale.
362                src_rb = _mm_mullo_epi16(src_rb, src_scale_wide);
363                src_ag = _mm_mullo_epi16(src_ag, src_scale_wide);
364
365                // Divide by 256.
366                src_rb = _mm_srli_epi16(src_rb, 8);
367                src_ag = _mm_andnot_si128(rb_mask, src_ag);
368
369                // Combine back into RGBA.
370                src_pixel = _mm_or_si128(src_rb, src_ag);
371
372                // Add color to result.
373                __m128i result = _mm_add_epi8(color_wide, src_pixel);
374
375                // Store result.
376                _mm_store_si128(d, result);
377                s++;
378                d++;
379                count -= 4;
380            }
381            src = reinterpret_cast<const SkPMColor*>(s);
382            dst = reinterpret_cast<SkPMColor*>(d);
383         }
384
385        while (count > 0) {
386            *dst = color + SkAlphaMulQ(*src, scale);
387            src += 1;
388            dst += 1;
389            count--;
390        }
391    }
392}
393