1/*
2 * Copyright © 2008 Rodrigo Kumpera
3 * Copyright © 2008 André Tupinambá
4 *
5 * Permission to use, copy, modify, distribute, and sell this software and its
6 * documentation for any purpose is hereby granted without fee, provided that
7 * the above copyright notice appear in all copies and that both that
8 * copyright notice and this permission notice appear in supporting
9 * documentation, and that the name of Red Hat not be used in advertising or
10 * publicity pertaining to distribution of the software without specific,
11 * written prior permission.  Red Hat makes no representations about the
12 * suitability of this software for any purpose.  It is provided "as is"
13 * without express or implied warranty.
14 *
15 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
16 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
17 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
18 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
20 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
21 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
22 * SOFTWARE.
23 *
24 * Author:  Rodrigo Kumpera (kumpera@gmail.com)
25 *          André Tupinambá (andrelrt@gmail.com)
26 *
27 * Based on work by Owen Taylor and Søren Sandmann
28 */
29#ifdef HAVE_CONFIG_H
30#include <config.h>
31#endif
32
33#include <xmmintrin.h> /* for _mm_shuffle_pi16 and _MM_SHUFFLE */
34#include <emmintrin.h> /* for SSE2 intrinsics */
35#include "pixman-private.h"
36#include "pixman-combine32.h"
37#include "pixman-inlines.h"
38
/* Constant SSE2 lane masks used throughout this file.  They are only
 * declared here; presumably they are initialized by the SSE2
 * implementation's setup code elsewhere in the file — TODO confirm
 * the actual values there. */
static __m128i mask_0080;
static __m128i mask_00ff;
static __m128i mask_0101;
static __m128i mask_ffff;
static __m128i mask_ff000000;
static __m128i mask_alpha;

/* Masks used for r5g6b5 <-> a8r8g8b8 conversion (values set elsewhere). */
static __m128i mask_565_r;
static __m128i mask_565_g1, mask_565_g2;
static __m128i mask_565_b;
static __m128i mask_red;
static __m128i mask_green;
static __m128i mask_blue;

/* Masks used by unpack_565_to_8888 () to replicate high channel bits. */
static __m128i mask_565_fix_rb;
static __m128i mask_565_fix_g;

/* Masks used by pack_565_2packedx128_128 (). */
static __m128i mask_565_rb;
static __m128i mask_565_pack_multiplier;
58
59static force_inline __m128i
60unpack_32_1x128 (uint32_t data)
61{
62    return _mm_unpacklo_epi8 (_mm_cvtsi32_si128 (data), _mm_setzero_si128 ());
63}
64
65static force_inline void
66unpack_128_2x128 (__m128i data, __m128i* data_lo, __m128i* data_hi)
67{
68    *data_lo = _mm_unpacklo_epi8 (data, _mm_setzero_si128 ());
69    *data_hi = _mm_unpackhi_epi8 (data, _mm_setzero_si128 ());
70}
71
/* Convert r5g6b5 pixels (one per 32-bit lane, upper 16 bits zero) to
 * x8r8g8b8.  Each channel is shifted into position and then its top
 * bits are replicated into the low bits (via mask_565_fix_rb /
 * mask_565_fix_g) so that e.g. 5-bit 0x1f expands to 8-bit 0xff. */
static force_inline __m128i
unpack_565_to_8888 (__m128i lo)
{
    __m128i r, g, b, rb, t;

    r = _mm_and_si128 (_mm_slli_epi32 (lo, 8), mask_red);
    g = _mm_and_si128 (_mm_slli_epi32 (lo, 5), mask_green);
    b = _mm_and_si128 (_mm_slli_epi32 (lo, 3), mask_blue);

    /* Replicate the high bits of red and blue into their low bits. */
    rb = _mm_or_si128 (r, b);
    t  = _mm_and_si128 (rb, mask_565_fix_rb);
    t  = _mm_srli_epi32 (t, 5);
    rb = _mm_or_si128 (rb, t);

    /* Same for green, which is 6 bits wide. */
    t  = _mm_and_si128 (g, mask_565_fix_g);
    t  = _mm_srli_epi32 (t, 6);
    g  = _mm_or_si128 (g, t);

    return _mm_or_si128 (rb, g);
}
92
93static force_inline void
94unpack_565_128_4x128 (__m128i  data,
95                      __m128i* data0,
96                      __m128i* data1,
97                      __m128i* data2,
98                      __m128i* data3)
99{
100    __m128i lo, hi;
101
102    lo = _mm_unpacklo_epi16 (data, _mm_setzero_si128 ());
103    hi = _mm_unpackhi_epi16 (data, _mm_setzero_si128 ());
104
105    lo = unpack_565_to_8888 (lo);
106    hi = unpack_565_to_8888 (hi);
107
108    unpack_128_2x128 (lo, data0, data1);
109    unpack_128_2x128 (hi, data2, data3);
110}
111
112static force_inline uint16_t
113pack_565_32_16 (uint32_t pixel)
114{
115    return (uint16_t) (((pixel >> 8) & 0xf800) |
116		       ((pixel >> 5) & 0x07e0) |
117		       ((pixel >> 3) & 0x001f));
118}
119
/* Pack two registers of eight 16-bit components each back into 16
 * bytes, with unsigned saturation. */
static force_inline __m128i
pack_2x128_128 (__m128i lo, __m128i hi)
{
    return _mm_packus_epi16 (lo, hi);
}
125
/* Pack eight still-packed a8r8g8b8 pixels (four per input register)
 * into eight r5g6b5 pixels.  The madd with mask_565_pack_multiplier
 * presumably shifts and sums the red and blue fields in a single
 * multiply-add per pixel — values of the masks are set elsewhere. */
static force_inline __m128i
pack_565_2packedx128_128 (__m128i lo, __m128i hi)
{
    __m128i rb0 = _mm_and_si128 (lo, mask_565_rb);
    __m128i rb1 = _mm_and_si128 (hi, mask_565_rb);

    __m128i t0 = _mm_madd_epi16 (rb0, mask_565_pack_multiplier);
    __m128i t1 = _mm_madd_epi16 (rb1, mask_565_pack_multiplier);

    __m128i g0 = _mm_and_si128 (lo, mask_green);
    __m128i g1 = _mm_and_si128 (hi, mask_green);

    t0 = _mm_or_si128 (t0, g0);
    t1 = _mm_or_si128 (t1, g1);

    /* Simulates _mm_packus_epi32 */
    t0 = _mm_slli_epi32 (t0, 16 - 5);
    t1 = _mm_slli_epi32 (t1, 16 - 5);
    t0 = _mm_srai_epi32 (t0, 16);
    t1 = _mm_srai_epi32 (t1, 16);
    return _mm_packs_epi32 (t0, t1);
}
148
149static force_inline __m128i
150pack_565_2x128_128 (__m128i lo, __m128i hi)
151{
152    __m128i data;
153    __m128i r, g1, g2, b;
154
155    data = pack_2x128_128 (lo, hi);
156
157    r  = _mm_and_si128 (data, mask_565_r);
158    g1 = _mm_and_si128 (_mm_slli_epi32 (data, 3), mask_565_g1);
159    g2 = _mm_and_si128 (_mm_srli_epi32 (data, 5), mask_565_g2);
160    b  = _mm_and_si128 (_mm_srli_epi32 (data, 3), mask_565_b);
161
162    return _mm_or_si128 (_mm_or_si128 (_mm_or_si128 (r, g1), g2), b);
163}
164
165static force_inline __m128i
166pack_565_4x128_128 (__m128i* xmm0, __m128i* xmm1, __m128i* xmm2, __m128i* xmm3)
167{
168    return _mm_packus_epi16 (pack_565_2x128_128 (*xmm0, *xmm1),
169			     pack_565_2x128_128 (*xmm2, *xmm3));
170}
171
/* Return non-zero if all four a8r8g8b8 pixels in x have alpha 0xff.
 * Bit mask 0x8888 selects byte 3 (the alpha byte) of each 32-bit
 * pixel in the movemask result. */
static force_inline int
is_opaque (__m128i x)
{
    __m128i ffs = _mm_cmpeq_epi8 (x, x); /* all-ones constant */

    return (_mm_movemask_epi8 (_mm_cmpeq_epi8 (x, ffs)) & 0x8888) == 0x8888;
}
179
180static force_inline int
181is_zero (__m128i x)
182{
183    return _mm_movemask_epi8 (
184	_mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) == 0xffff;
185}
186
187static force_inline int
188is_transparent (__m128i x)
189{
190    return (_mm_movemask_epi8 (
191		_mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) & 0x8888) == 0x8888;
192}
193
/* Unpack one 32-bit pixel and duplicate it into both 64-bit halves
 * of the register, so the same pixel occupies the "lo" and "hi"
 * positions. */
static force_inline __m128i
expand_pixel_32_1x128 (uint32_t data)
{
    return _mm_shuffle_epi32 (unpack_32_1x128 (data), _MM_SHUFFLE (1, 0, 1, 0));
}
199
/* Broadcast the alpha component (16-bit lane 3 of each 64-bit half)
 * to all four lanes of that half. */
static force_inline __m128i
expand_alpha_1x128 (__m128i data)
{
    return _mm_shufflehi_epi16 (_mm_shufflelo_epi16 (data,
						     _MM_SHUFFLE (3, 3, 3, 3)),
				_MM_SHUFFLE (3, 3, 3, 3));
}
207
208static force_inline void
209expand_alpha_2x128 (__m128i  data_lo,
210                    __m128i  data_hi,
211                    __m128i* alpha_lo,
212                    __m128i* alpha_hi)
213{
214    __m128i lo, hi;
215
216    lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 3, 3, 3));
217    hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 3, 3, 3));
218
219    *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 3, 3, 3));
220    *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 3, 3, 3));
221}
222
223static force_inline void
224expand_alpha_rev_2x128 (__m128i  data_lo,
225                        __m128i  data_hi,
226                        __m128i* alpha_lo,
227                        __m128i* alpha_hi)
228{
229    __m128i lo, hi;
230
231    lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (0, 0, 0, 0));
232    hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (0, 0, 0, 0));
233    *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (0, 0, 0, 0));
234    *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (0, 0, 0, 0));
235}
236
/* Component-wise (data * alpha) / 255 on two registers of 16-bit
 * components.  Uses the standard approximation
 * ((t + 0x80) * 0x101) >> 16 — mask_0080 and mask_0101 presumably
 * hold 0x0080 / 0x0101 per lane (initialized elsewhere). */
static force_inline void
pix_multiply_2x128 (__m128i* data_lo,
                    __m128i* data_hi,
                    __m128i* alpha_lo,
                    __m128i* alpha_hi,
                    __m128i* ret_lo,
                    __m128i* ret_hi)
{
    __m128i lo, hi;

    lo = _mm_mullo_epi16 (*data_lo, *alpha_lo);
    hi = _mm_mullo_epi16 (*data_hi, *alpha_hi);
    lo = _mm_adds_epu16 (lo, mask_0080);
    hi = _mm_adds_epu16 (hi, mask_0080);
    *ret_lo = _mm_mulhi_epu16 (lo, mask_0101);
    *ret_hi = _mm_mulhi_epu16 (hi, mask_0101);
}
254
255static force_inline void
256pix_add_multiply_2x128 (__m128i* src_lo,
257                        __m128i* src_hi,
258                        __m128i* alpha_dst_lo,
259                        __m128i* alpha_dst_hi,
260                        __m128i* dst_lo,
261                        __m128i* dst_hi,
262                        __m128i* alpha_src_lo,
263                        __m128i* alpha_src_hi,
264                        __m128i* ret_lo,
265                        __m128i* ret_hi)
266{
267    __m128i t1_lo, t1_hi;
268    __m128i t2_lo, t2_hi;
269
270    pix_multiply_2x128 (src_lo, src_hi, alpha_dst_lo, alpha_dst_hi, &t1_lo, &t1_hi);
271    pix_multiply_2x128 (dst_lo, dst_hi, alpha_src_lo, alpha_src_hi, &t2_lo, &t2_hi);
272
273    *ret_lo = _mm_adds_epu8 (t1_lo, t2_lo);
274    *ret_hi = _mm_adds_epu8 (t1_hi, t2_hi);
275}
276
277static force_inline void
278negate_2x128 (__m128i  data_lo,
279              __m128i  data_hi,
280              __m128i* neg_lo,
281              __m128i* neg_hi)
282{
283    *neg_lo = _mm_xor_si128 (data_lo, mask_00ff);
284    *neg_hi = _mm_xor_si128 (data_hi, mask_00ff);
285}
286
287static force_inline void
288invert_colors_2x128 (__m128i  data_lo,
289                     __m128i  data_hi,
290                     __m128i* inv_lo,
291                     __m128i* inv_hi)
292{
293    __m128i lo, hi;
294
295    lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 0, 1, 2));
296    hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 0, 1, 2));
297    *inv_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 0, 1, 2));
298    *inv_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 0, 1, 2));
299}
300
/* Premultiplied OVER operator on two registers:
 * dst = src + dst * (1 - alpha).  alpha is the (already expanded)
 * alpha of src; dst is updated in place. */
static force_inline void
over_2x128 (__m128i* src_lo,
            __m128i* src_hi,
            __m128i* alpha_lo,
            __m128i* alpha_hi,
            __m128i* dst_lo,
            __m128i* dst_hi)
{
    __m128i t1, t2;

    negate_2x128 (*alpha_lo, *alpha_hi, &t1, &t2);

    pix_multiply_2x128 (dst_lo, dst_hi, &t1, &t2, dst_lo, dst_hi);

    /* Saturating add keeps components from wrapping past 0xff. */
    *dst_lo = _mm_adds_epu8 (*src_lo, *dst_lo);
    *dst_hi = _mm_adds_epu8 (*src_hi, *dst_hi);
}
318
/* OVER for non-premultiplied, component-reversed source pixels:
 * the source colors are swapped back to normal order, premultiplied
 * by their (forced-opaque-alpha) value, then composited OVER dst. */
static force_inline void
over_rev_non_pre_2x128 (__m128i  src_lo,
                        __m128i  src_hi,
                        __m128i* dst_lo,
                        __m128i* dst_hi)
{
    __m128i lo, hi;
    __m128i alpha_lo, alpha_hi;

    expand_alpha_2x128 (src_lo, src_hi, &alpha_lo, &alpha_hi);

    /* Force the alpha lane itself to full so it is preserved by the
     * premultiplication below (mask_alpha is set elsewhere). */
    lo = _mm_or_si128 (alpha_lo, mask_alpha);
    hi = _mm_or_si128 (alpha_hi, mask_alpha);

    invert_colors_2x128 (src_lo, src_hi, &src_lo, &src_hi);

    pix_multiply_2x128 (&src_lo, &src_hi, &lo, &hi, &lo, &hi);

    over_2x128 (&lo, &hi, &alpha_lo, &alpha_hi, dst_lo, dst_hi);
}
339
340static force_inline void
341in_over_2x128 (__m128i* src_lo,
342               __m128i* src_hi,
343               __m128i* alpha_lo,
344               __m128i* alpha_hi,
345               __m128i* mask_lo,
346               __m128i* mask_hi,
347               __m128i* dst_lo,
348               __m128i* dst_hi)
349{
350    __m128i s_lo, s_hi;
351    __m128i a_lo, a_hi;
352
353    pix_multiply_2x128 (src_lo,   src_hi, mask_lo, mask_hi, &s_lo, &s_hi);
354    pix_multiply_2x128 (alpha_lo, alpha_hi, mask_lo, mask_hi, &a_lo, &a_hi);
355
356    over_2x128 (&s_lo, &s_hi, &a_lo, &a_hi, dst_lo, dst_hi);
357}
358
/* load 4 pixels from a 16-byte boundary aligned address */
static force_inline __m128i
load_128_aligned (__m128i* src)
{
    /* movdqa — src must really be 16-byte aligned or this faults. */
    return _mm_load_si128 (src);
}
365
/* load 4 pixels from a unaligned address */
static force_inline __m128i
load_128_unaligned (const __m128i* src)
{
    /* movdqu — works for any alignment. */
    return _mm_loadu_si128 (src);
}
372
/* save 4 pixels using Write Combining memory on a 16-byte
 * boundary aligned address
 */
static force_inline void
save_128_write_combining (__m128i* dst,
                          __m128i  data)
{
    /* Non-temporal store: bypasses the cache. dst must be aligned. */
    _mm_stream_si128 (dst, data);
}
382
/* save 4 pixels on a 16-byte boundary aligned address */
static force_inline void
save_128_aligned (__m128i* dst,
                  __m128i  data)
{
    /* movdqa — dst must really be 16-byte aligned or this faults. */
    _mm_store_si128 (dst, data);
}
390
/* save 4 pixels on a unaligned address */
static force_inline void
save_128_unaligned (__m128i* dst,
                    __m128i  data)
{
    /* movdqu — works for any alignment. */
    _mm_storeu_si128 (dst, data);
}
398
/* Load one 32-bit value into the low dword of a register, zeroing
 * the upper 96 bits. */
static force_inline __m128i
load_32_1x128 (uint32_t data)
{
    return _mm_cvtsi32_si128 (data);
}
404
/* Broadcast 16-bit lane 0 (the alpha lane of a reversed-component
 * pixel) across the low four lanes; the high half is left unchanged. */
static force_inline __m128i
expand_alpha_rev_1x128 (__m128i data)
{
    return _mm_shufflelo_epi16 (data, _MM_SHUFFLE (0, 0, 0, 0));
}
410
/* Replicate one 8-bit value across the low four 16-bit lanes
 * (one unpacked pixel's worth of components). */
static force_inline __m128i
expand_pixel_8_1x128 (uint8_t data)
{
    return _mm_shufflelo_epi16 (
	unpack_32_1x128 ((uint32_t)data), _MM_SHUFFLE (0, 0, 0, 0));
}
417
418static force_inline __m128i
419pix_multiply_1x128 (__m128i data,
420		    __m128i alpha)
421{
422    return _mm_mulhi_epu16 (_mm_adds_epu16 (_mm_mullo_epi16 (data, alpha),
423					    mask_0080),
424			    mask_0101);
425}
426
427static force_inline __m128i
428pix_add_multiply_1x128 (__m128i* src,
429			__m128i* alpha_dst,
430			__m128i* dst,
431			__m128i* alpha_src)
432{
433    __m128i t1 = pix_multiply_1x128 (*src, *alpha_dst);
434    __m128i t2 = pix_multiply_1x128 (*dst, *alpha_src);
435
436    return _mm_adds_epu8 (t1, t2);
437}
438
/* Complement each 8-bit component held in a 16-bit lane:
 * 0xff - x, implemented as XOR with 0x00ff per lane. */
static force_inline __m128i
negate_1x128 (__m128i data)
{
    return _mm_xor_si128 (data, mask_00ff);
}
444
/* Swap the red and blue lanes of one unpacked pixel (lane order
 * 0<->2, alpha lane 3 left in place). */
static force_inline __m128i
invert_colors_1x128 (__m128i data)
{
    return _mm_shufflelo_epi16 (data, _MM_SHUFFLE (3, 0, 1, 2));
}
450
/* Premultiplied OVER for one pixel: src + dst * (1 - alpha). */
static force_inline __m128i
over_1x128 (__m128i src, __m128i alpha, __m128i dst)
{
    return _mm_adds_epu8 (src, pix_multiply_1x128 (dst, negate_1x128 (alpha)));
}
456
/* (src IN mask) OVER dst for one pixel: source and its alpha are
 * both scaled by the mask before the OVER step. */
static force_inline __m128i
in_over_1x128 (__m128i* src, __m128i* alpha, __m128i* mask, __m128i* dst)
{
    return over_1x128 (pix_multiply_1x128 (*src, *mask),
		       pix_multiply_1x128 (*alpha, *mask),
		       *dst);
}
464
/* OVER for one non-premultiplied, component-reversed source pixel:
 * swap colors back, premultiply by alpha (with the alpha lane forced
 * opaque via mask_alpha), then composite OVER dst. */
static force_inline __m128i
over_rev_non_pre_1x128 (__m128i src, __m128i dst)
{
    __m128i alpha = expand_alpha_1x128 (src);

    return over_1x128 (pix_multiply_1x128 (invert_colors_1x128 (src),
					   _mm_or_si128 (alpha, mask_alpha)),
		       alpha,
		       dst);
}
475
/* Pack one unpacked pixel (16-bit lanes in the low half) back into a
 * 32-bit value, with unsigned saturation. */
static force_inline uint32_t
pack_1x128_32 (__m128i data)
{
    return _mm_cvtsi128_si32 (_mm_packus_epi16 (data, _mm_setzero_si128 ()));
}
481
482static force_inline __m128i
483expand565_16_1x128 (uint16_t pixel)
484{
485    __m128i m = _mm_cvtsi32_si128 (pixel);
486
487    m = unpack_565_to_8888 (m);
488
489    return _mm_unpacklo_epi8 (m, _mm_setzero_si128 ());
490}
491
/* Composite one a8r8g8b8 pixel: src OVER dst.  Shortcuts the fully
 * opaque and fully transparent source cases. */
static force_inline uint32_t
core_combine_over_u_pixel_sse2 (uint32_t src, uint32_t dst)
{
    uint8_t a;
    __m128i xmms;

    a = src >> 24;

    if (a == 0xff)
    {
	/* Opaque source completely covers dst. */
	return src;
    }
    else if (src)
    {
	xmms = unpack_32_1x128 (src);
	return pack_1x128_32 (
	    over_1x128 (xmms, expand_alpha_1x128 (xmms),
			unpack_32_1x128 (dst)));
    }

    /* src == 0: dst is unchanged. */
    return dst;
}
514
/* Fetch one source pixel; if a mask pointer is given, multiply the
 * source by the mask pixel's expanded alpha first. */
static force_inline uint32_t
combine1 (const uint32_t *ps, const uint32_t *pm)
{
    uint32_t s = *ps;

    if (pm)
    {
	__m128i ms, mm;

	mm = unpack_32_1x128 (*pm);
	mm = expand_alpha_1x128 (mm);

	ms = unpack_32_1x128 (s);
	ms = pix_multiply_1x128 (ms, mm);

	s = pack_1x128_32 (ms);
    }

    return s;
}
535
/* Fetch four source pixels (unaligned), optionally multiplied by the
 * corresponding mask pixels' alpha.  Short-circuits to zero when the
 * mask is fully transparent. */
static force_inline __m128i
combine4 (const __m128i *ps, const __m128i *pm)
{
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_msk_lo, xmm_msk_hi;
    __m128i s;

    if (pm)
    {
	xmm_msk_lo = load_128_unaligned (pm);

	/* All mask alphas zero: the combined source is zero, and we
	 * can skip loading the source entirely. */
	if (is_transparent (xmm_msk_lo))
	    return _mm_setzero_si128 ();
    }

    s = load_128_unaligned (ps);

    if (pm)
    {
	unpack_128_2x128 (s, &xmm_src_lo, &xmm_src_hi);
	unpack_128_2x128 (xmm_msk_lo, &xmm_msk_lo, &xmm_msk_hi);

	expand_alpha_2x128 (xmm_msk_lo, xmm_msk_hi, &xmm_msk_lo, &xmm_msk_hi);

	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
			    &xmm_msk_lo, &xmm_msk_hi,
			    &xmm_src_lo, &xmm_src_hi);

	s = pack_2x128_128 (xmm_src_lo, xmm_src_hi);
    }

    return s;
}
569
/* OVER combiner for a scanline with a per-pixel mask: each source
 * pixel is multiplied by the mask's expanded alpha and composited
 * OVER dst.  Scalar prologue until pd is 16-byte aligned, then four
 * pixels per iteration, then a scalar epilogue. */
static force_inline void
core_combine_over_u_sse2_mask (uint32_t *	  pd,
			       const uint32_t*    ps,
			       const uint32_t*    pm,
			       int                w)
{
    uint32_t s, d;

    /* Align dst on a 16-byte boundary */
    while (w && ((uintptr_t)pd & 15))
    {
	d = *pd;
	s = combine1 (ps, pm);

	if (s)
	    *pd = core_combine_over_u_pixel_sse2 (s, d);
	pd++;
	ps++;
	pm++;
	w--;
    }

    while (w >= 4)
    {
	__m128i mask = load_128_unaligned ((__m128i *)pm);

	/* Fully transparent mask: dst is unchanged for these 4 pixels. */
	if (!is_zero (mask))
	{
	    __m128i src;
	    __m128i src_hi, src_lo;
	    __m128i mask_hi, mask_lo;
	    __m128i alpha_hi, alpha_lo;

	    src = load_128_unaligned ((__m128i *)ps);

	    /* If every masked source pixel is opaque, the result is
	     * just the source. */
	    if (is_opaque (_mm_and_si128 (src, mask)))
	    {
		save_128_aligned ((__m128i *)pd, src);
	    }
	    else
	    {
		__m128i dst = load_128_aligned ((__m128i *)pd);
		__m128i dst_hi, dst_lo;

		unpack_128_2x128 (mask, &mask_lo, &mask_hi);
		unpack_128_2x128 (src, &src_lo, &src_hi);

		expand_alpha_2x128 (mask_lo, mask_hi, &mask_lo, &mask_hi);
		pix_multiply_2x128 (&src_lo, &src_hi,
				    &mask_lo, &mask_hi,
				    &src_lo, &src_hi);

		unpack_128_2x128 (dst, &dst_lo, &dst_hi);

		expand_alpha_2x128 (src_lo, src_hi,
				    &alpha_lo, &alpha_hi);

		over_2x128 (&src_lo, &src_hi, &alpha_lo, &alpha_hi,
			    &dst_lo, &dst_hi);

		save_128_aligned (
		    (__m128i *)pd,
		    pack_2x128_128 (dst_lo, dst_hi));
	    }
	}

	pm += 4;
	ps += 4;
	pd += 4;
	w -= 4;
    }
    while (w)
    {
	d = *pd;
	s = combine1 (ps, pm);

	if (s)
	    *pd = core_combine_over_u_pixel_sse2 (s, d);
	pd++;
	ps++;
	pm++;

	w--;
    }
}
655
/* OVER combiner for a scanline without a mask.  Same structure as
 * the masked variant: scalar prologue to 16-byte alignment of pd,
 * 4-pixel SIMD loop with zero/opaque shortcuts, scalar epilogue. */
static force_inline void
core_combine_over_u_sse2_no_mask (uint32_t *	  pd,
				  const uint32_t*    ps,
				  int                w)
{
    uint32_t s, d;

    /* Align dst on a 16-byte boundary */
    while (w && ((uintptr_t)pd & 15))
    {
	d = *pd;
	s = *ps;

	if (s)
	    *pd = core_combine_over_u_pixel_sse2 (s, d);
	pd++;
	ps++;
	w--;
    }

    while (w >= 4)
    {
	__m128i src;
	__m128i src_hi, src_lo, dst_hi, dst_lo;
	__m128i alpha_hi, alpha_lo;

	src = load_128_unaligned ((__m128i *)ps);

	if (!is_zero (src))
	{
	    /* Fully opaque source: overwrite dst directly. */
	    if (is_opaque (src))
	    {
		save_128_aligned ((__m128i *)pd, src);
	    }
	    else
	    {
		__m128i dst = load_128_aligned ((__m128i *)pd);

		unpack_128_2x128 (src, &src_lo, &src_hi);
		unpack_128_2x128 (dst, &dst_lo, &dst_hi);

		expand_alpha_2x128 (src_lo, src_hi,
				    &alpha_lo, &alpha_hi);
		over_2x128 (&src_lo, &src_hi, &alpha_lo, &alpha_hi,
			    &dst_lo, &dst_hi);

		save_128_aligned (
		    (__m128i *)pd,
		    pack_2x128_128 (dst_lo, dst_hi));
	    }
	}

	ps += 4;
	pd += 4;
	w -= 4;
    }
    while (w)
    {
	d = *pd;
	s = *ps;

	if (s)
	    *pd = core_combine_over_u_pixel_sse2 (s, d);
	pd++;
	ps++;

	w--;
    }
}
725
726static force_inline void
727sse2_combine_over_u (pixman_implementation_t *imp,
728                     pixman_op_t              op,
729                     uint32_t *               pd,
730                     const uint32_t *         ps,
731                     const uint32_t *         pm,
732                     int                      w)
733{
734    if (pm)
735	core_combine_over_u_sse2_mask (pd, ps, pm, w);
736    else
737	core_combine_over_u_sse2_no_mask (pd, ps, w);
738}
739
/* OVER_REVERSE combiner: composites dst OVER src.  Note the swapped
 * argument order (d, s) in the scalar calls, and that src and dst
 * swap roles in the SIMD over_2x128 step. */
static void
sse2_combine_over_reverse_u (pixman_implementation_t *imp,
                             pixman_op_t              op,
                             uint32_t *               pd,
                             const uint32_t *         ps,
                             const uint32_t *         pm,
                             int                      w)
{
    uint32_t s, d;

    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_alpha_lo, xmm_alpha_hi;

    /* Align dst on a 16-byte boundary */
    while (w &&
           ((uintptr_t)pd & 15))
    {
	d = *pd;
	s = combine1 (ps, pm);

	*pd++ = core_combine_over_u_pixel_sse2 (d, s);
	w--;
	ps++;
	if (pm)
	    pm++;
    }

    while (w >= 4)
    {
	/* I'm loading unaligned because I'm not sure
	 * about the address alignment.
	 */
	xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
	xmm_dst_hi = load_128_aligned ((__m128i*) pd);

	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

	/* dst OVER src: dst's alpha drives the blend, result lands
	 * in the src registers. */
	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
			    &xmm_alpha_lo, &xmm_alpha_hi);

	over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
		    &xmm_alpha_lo, &xmm_alpha_hi,
		    &xmm_src_lo, &xmm_src_hi);

	/* rebuid the 4 pixel data and save*/
	save_128_aligned ((__m128i*)pd,
			  pack_2x128_128 (xmm_src_lo, xmm_src_hi));

	w -= 4;
	ps += 4;
	pd += 4;

	if (pm)
	    pm += 4;
    }

    while (w)
    {
	d = *pd;
	s = combine1 (ps, pm);

	*pd++ = core_combine_over_u_pixel_sse2 (d, s);
	ps++;
	w--;
	if (pm)
	    pm++;
    }
}
810
811static force_inline uint32_t
812core_combine_in_u_pixel_sse2 (uint32_t src, uint32_t dst)
813{
814    uint32_t maska = src >> 24;
815
816    if (maska == 0)
817    {
818	return 0;
819    }
820    else if (maska != 0xff)
821    {
822	return pack_1x128_32 (
823	    pix_multiply_1x128 (unpack_32_1x128 (dst),
824				expand_alpha_1x128 (unpack_32_1x128 (src))));
825    }
826
827    return dst;
828}
829
/* IN combiner: dst = src * alpha(dst).  Note the scalar helper is
 * called as (d, s), so the source pixel is scaled by dst's alpha. */
static void
sse2_combine_in_u (pixman_implementation_t *imp,
                   pixman_op_t              op,
                   uint32_t *               pd,
                   const uint32_t *         ps,
                   const uint32_t *         pm,
                   int                      w)
{
    uint32_t s, d;

    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;

    /* Scalar prologue until pd is 16-byte aligned. */
    while (w && ((uintptr_t)pd & 15))
    {
	s = combine1 (ps, pm);
	d = *pd;

	*pd++ = core_combine_in_u_pixel_sse2 (d, s);
	w--;
	ps++;
	if (pm)
	    pm++;
    }

    while (w >= 4)
    {
	xmm_dst_hi = load_128_aligned ((__m128i*) pd);
	xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*) pm);

	/* dst's expanded alpha multiplies the source. */
	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
			    &xmm_dst_lo, &xmm_dst_hi,
			    &xmm_dst_lo, &xmm_dst_hi);

	save_128_aligned ((__m128i*)pd,
			  pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

	ps += 4;
	pd += 4;
	w -= 4;
	if (pm)
	    pm += 4;
    }

    while (w)
    {
	s = combine1 (ps, pm);
	d = *pd;

	*pd++ = core_combine_in_u_pixel_sse2 (d, s);
	w--;
	ps++;
	if (pm)
	    pm++;
    }
}
890
/* IN_REVERSE combiner: dst = dst * alpha(src).  The scalar helper is
 * called as (s, d), i.e. dst scaled by src's alpha. */
static void
sse2_combine_in_reverse_u (pixman_implementation_t *imp,
                           pixman_op_t              op,
                           uint32_t *               pd,
                           const uint32_t *         ps,
                           const uint32_t *         pm,
                           int                      w)
{
    uint32_t s, d;

    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;

    /* Scalar prologue until pd is 16-byte aligned. */
    while (w && ((uintptr_t)pd & 15))
    {
	s = combine1 (ps, pm);
	d = *pd;

	*pd++ = core_combine_in_u_pixel_sse2 (s, d);
	ps++;
	w--;
	if (pm)
	    pm++;
    }

    while (w >= 4)
    {
	xmm_dst_hi = load_128_aligned ((__m128i*) pd);
	xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);

	/* src's expanded alpha multiplies the destination. */
	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);

	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
	pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
			    &xmm_src_lo, &xmm_src_hi,
			    &xmm_dst_lo, &xmm_dst_hi);

	save_128_aligned (
	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

	ps += 4;
	pd += 4;
	w -= 4;
	if (pm)
	    pm += 4;
    }

    while (w)
    {
	s = combine1 (ps, pm);
	d = *pd;

	*pd++ = core_combine_in_u_pixel_sse2 (s, d);
	w--;
	ps++;
	if (pm)
	    pm++;
    }
}
951
/* OUT_REVERSE combiner: dst = dst * (1 - alpha(src)). */
static void
sse2_combine_out_reverse_u (pixman_implementation_t *imp,
                            pixman_op_t              op,
                            uint32_t *               pd,
                            const uint32_t *         ps,
                            const uint32_t *         pm,
                            int                      w)
{
    /* Scalar prologue until pd is 16-byte aligned. */
    while (w && ((uintptr_t)pd & 15))
    {
	uint32_t s = combine1 (ps, pm);
	uint32_t d = *pd;

	*pd++ = pack_1x128_32 (
	    pix_multiply_1x128 (
		unpack_32_1x128 (d), negate_1x128 (
		    expand_alpha_1x128 (unpack_32_1x128 (s)))));

	if (pm)
	    pm++;
	ps++;
	w--;
    }

    while (w >= 4)
    {
	__m128i xmm_src_lo, xmm_src_hi;
	__m128i xmm_dst_lo, xmm_dst_hi;

	xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
	xmm_dst_hi = load_128_aligned ((__m128i*) pd);

	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

	/* dst multiplied by the complement of src's alpha. */
	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
	negate_2x128       (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);

	pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
			    &xmm_src_lo, &xmm_src_hi,
			    &xmm_dst_lo, &xmm_dst_hi);

	save_128_aligned (
	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

	ps += 4;
	pd += 4;
	if (pm)
	    pm += 4;

	w -= 4;
    }

    while (w)
    {
	uint32_t s = combine1 (ps, pm);
	uint32_t d = *pd;

	*pd++ = pack_1x128_32 (
	    pix_multiply_1x128 (
		unpack_32_1x128 (d), negate_1x128 (
		    expand_alpha_1x128 (unpack_32_1x128 (s)))));
	ps++;
	if (pm)
	    pm++;
	w--;
    }
}
1020
/* OUT combiner: dst = src * (1 - alpha(dst)). */
static void
sse2_combine_out_u (pixman_implementation_t *imp,
                    pixman_op_t              op,
                    uint32_t *               pd,
                    const uint32_t *         ps,
                    const uint32_t *         pm,
                    int                      w)
{
    /* Scalar prologue until pd is 16-byte aligned. */
    while (w && ((uintptr_t)pd & 15))
    {
	uint32_t s = combine1 (ps, pm);
	uint32_t d = *pd;

	*pd++ = pack_1x128_32 (
	    pix_multiply_1x128 (
		unpack_32_1x128 (s), negate_1x128 (
		    expand_alpha_1x128 (unpack_32_1x128 (d)))));
	w--;
	ps++;
	if (pm)
	    pm++;
    }

    while (w >= 4)
    {
	__m128i xmm_src_lo, xmm_src_hi;
	__m128i xmm_dst_lo, xmm_dst_hi;

	xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
	xmm_dst_hi = load_128_aligned ((__m128i*) pd);

	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

	/* src multiplied by the complement of dst's alpha. */
	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
	negate_2x128       (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
			    &xmm_dst_lo, &xmm_dst_hi,
			    &xmm_dst_lo, &xmm_dst_hi);

	save_128_aligned (
	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

	ps += 4;
	pd += 4;
	w -= 4;
	if (pm)
	    pm += 4;
    }

    while (w)
    {
	uint32_t s = combine1 (ps, pm);
	uint32_t d = *pd;

	*pd++ = pack_1x128_32 (
	    pix_multiply_1x128 (
		unpack_32_1x128 (s), negate_1x128 (
		    expand_alpha_1x128 (unpack_32_1x128 (d)))));
	w--;
	ps++;
	if (pm)
	    pm++;
    }
}
1087
1088static force_inline uint32_t
1089core_combine_atop_u_pixel_sse2 (uint32_t src,
1090                                uint32_t dst)
1091{
1092    __m128i s = unpack_32_1x128 (src);
1093    __m128i d = unpack_32_1x128 (dst);
1094
1095    __m128i sa = negate_1x128 (expand_alpha_1x128 (s));
1096    __m128i da = expand_alpha_1x128 (d);
1097
1098    return pack_1x128_32 (pix_add_multiply_1x128 (&s, &da, &d, &sa));
1099}
1100
/* ATOP combiner: dst = src*alpha(dst) + dst*(1 - alpha(src)). */
static void
sse2_combine_atop_u (pixman_implementation_t *imp,
                     pixman_op_t              op,
                     uint32_t *               pd,
                     const uint32_t *         ps,
                     const uint32_t *         pm,
                     int                      w)
{
    uint32_t s, d;

    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
    __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;

    /* Scalar prologue until pd is 16-byte aligned. */
    while (w && ((uintptr_t)pd & 15))
    {
	s = combine1 (ps, pm);
	d = *pd;

	*pd++ = core_combine_atop_u_pixel_sse2 (s, d);
	w--;
	ps++;
	if (pm)
	    pm++;
    }

    while (w >= 4)
    {
	xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
	xmm_dst_hi = load_128_aligned ((__m128i*) pd);

	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
			    &xmm_alpha_src_lo, &xmm_alpha_src_hi);
	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
			    &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

	/* Complement src's alpha: dst keeps its uncovered fraction. */
	negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
		      &xmm_alpha_src_lo, &xmm_alpha_src_hi);

	pix_add_multiply_2x128 (
	    &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
	    &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
	    &xmm_dst_lo, &xmm_dst_hi);

	save_128_aligned (
	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

	ps += 4;
	pd += 4;
	w -= 4;
	if (pm)
	    pm += 4;
    }

    while (w)
    {
	s = combine1 (ps, pm);
	d = *pd;

	*pd++ = core_combine_atop_u_pixel_sse2 (s, d);
	w--;
	ps++;
	if (pm)
	    pm++;
    }
}
1171
1172static force_inline uint32_t
1173core_combine_reverse_atop_u_pixel_sse2 (uint32_t src,
1174                                        uint32_t dst)
1175{
1176    __m128i s = unpack_32_1x128 (src);
1177    __m128i d = unpack_32_1x128 (dst);
1178
1179    __m128i sa = expand_alpha_1x128 (s);
1180    __m128i da = negate_1x128 (expand_alpha_1x128 (d));
1181
1182    return pack_1x128_32 (pix_add_multiply_1x128 (&s, &da, &d, &sa));
1183}
1184
/*
 * Unified-alpha ATOP_REVERSE combiner:
 *
 *   dst = src * (1 - dst.a) + dst * src.a
 *
 * pd: destination pixels, ps: source pixels, pm: optional per-pixel
 * mask (may be NULL), w: pixel count.
 */
static void
sse2_combine_atop_reverse_u (pixman_implementation_t *imp,
                             pixman_op_t              op,
                             uint32_t *               pd,
                             const uint32_t *         ps,
                             const uint32_t *         pm,
                             int                      w)
{
    uint32_t s, d;

    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
    __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;

    /* Head: single pixels until pd is 16-byte aligned. */
    while (w && ((uintptr_t)pd & 15))
    {
	s = combine1 (ps, pm);
	d = *pd;

	*pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
	ps++;
	w--;
	if (pm)
	    pm++;
    }

    /* Main loop: four pixels per iteration, aligned stores to pd. */
    while (w >= 4)
    {
	xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
	xmm_dst_hi = load_128_aligned ((__m128i*) pd);

	/* Widen 8-bit channels to 16 bits (lo/hi pixel pairs). */
	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
			    &xmm_alpha_src_lo, &xmm_alpha_src_hi);
	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
			    &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

	/* ATOP_REVERSE uses the complement of the destination alpha. */
	negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
		      &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

	/* dst = src * ~dst.a + dst * src.a */
	pix_add_multiply_2x128 (
	    &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
	    &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
	    &xmm_dst_lo, &xmm_dst_hi);

	save_128_aligned (
	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

	ps += 4;
	pd += 4;
	w -= 4;
	if (pm)
	    pm += 4;
    }

    /* Tail: at most three remaining pixels. */
    while (w)
    {
	s = combine1 (ps, pm);
	d = *pd;

	*pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
	ps++;
	w--;
	if (pm)
	    pm++;
    }
}
1255
1256static force_inline uint32_t
1257core_combine_xor_u_pixel_sse2 (uint32_t src,
1258                               uint32_t dst)
1259{
1260    __m128i s = unpack_32_1x128 (src);
1261    __m128i d = unpack_32_1x128 (dst);
1262
1263    __m128i neg_d = negate_1x128 (expand_alpha_1x128 (d));
1264    __m128i neg_s = negate_1x128 (expand_alpha_1x128 (s));
1265
1266    return pack_1x128_32 (pix_add_multiply_1x128 (&s, &neg_d, &d, &neg_s));
1267}
1268
/*
 * Unified-alpha XOR combiner:
 *
 *   dst = src * (1 - dst.a) + dst * (1 - src.a)
 *
 * mask may be NULL; when present it is applied to src via
 * combine1/combine4.
 */
static void
sse2_combine_xor_u (pixman_implementation_t *imp,
                    pixman_op_t              op,
                    uint32_t *               dst,
                    const uint32_t *         src,
                    const uint32_t *         mask,
                    int                      width)
{
    int w = width;
    uint32_t s, d;
    uint32_t* pd = dst;
    const uint32_t* ps = src;
    const uint32_t* pm = mask;

    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
    __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;

    /* Head: single pixels until pd is 16-byte aligned. */
    while (w && ((uintptr_t)pd & 15))
    {
	s = combine1 (ps, pm);
	d = *pd;

	*pd++ = core_combine_xor_u_pixel_sse2 (s, d);
	w--;
	ps++;
	if (pm)
	    pm++;
    }

    /* Main loop: four pixels per iteration, aligned stores to pd. */
    while (w >= 4)
    {
	xmm_src = combine4 ((__m128i*) ps, (__m128i*) pm);
	xmm_dst = load_128_aligned ((__m128i*) pd);

	unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
	unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);

	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
			    &xmm_alpha_src_lo, &xmm_alpha_src_hi);
	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
			    &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

	/* XOR needs the complement of both alphas. */
	negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
		      &xmm_alpha_src_lo, &xmm_alpha_src_hi);
	negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
		      &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

	/* dst = src * ~dst.a + dst * ~src.a */
	pix_add_multiply_2x128 (
	    &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
	    &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
	    &xmm_dst_lo, &xmm_dst_hi);

	save_128_aligned (
	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

	ps += 4;
	pd += 4;
	w -= 4;
	if (pm)
	    pm += 4;
    }

    /* Tail: at most three remaining pixels. */
    while (w)
    {
	s = combine1 (ps, pm);
	d = *pd;

	*pd++ = core_combine_xor_u_pixel_sse2 (s, d);
	w--;
	ps++;
	if (pm)
	    pm++;
    }
}
1345
/*
 * Unified-alpha ADD combiner: per-channel saturated add,
 *
 *   dst = clamp (src + dst)
 *
 * implemented with _mm_adds_epu8 both for the scalar head/tail
 * (one pixel in the low lane) and the 4-pixel main loop.
 */
static force_inline void
sse2_combine_add_u (pixman_implementation_t *imp,
                    pixman_op_t              op,
                    uint32_t *               dst,
                    const uint32_t *         src,
                    const uint32_t *         mask,
                    int                      width)
{
    int w = width;
    uint32_t s, d;
    uint32_t* pd = dst;
    const uint32_t* ps = src;
    const uint32_t* pm = mask;

    /* Head: single pixels until pd is 16-byte aligned. */
    while (w && (uintptr_t)pd & 15)
    {
	s = combine1 (ps, pm);
	d = *pd;

	ps++;
	if (pm)
	    pm++;
	*pd++ = _mm_cvtsi128_si32 (
	    _mm_adds_epu8 (_mm_cvtsi32_si128 (s), _mm_cvtsi32_si128 (d)));
	w--;
    }

    /* Main loop: four pixels at a time with a single saturated add. */
    while (w >= 4)
    {
	__m128i s;

	s = combine4 ((__m128i*)ps, (__m128i*)pm);

	save_128_aligned (
	    (__m128i*)pd, _mm_adds_epu8 (s, load_128_aligned  ((__m128i*)pd)));

	pd += 4;
	ps += 4;
	if (pm)
	    pm += 4;
	w -= 4;
    }

    /* Tail: remaining pixels. */
    while (w--)
    {
	s = combine1 (ps, pm);
	d = *pd;

	ps++;
	*pd++ = _mm_cvtsi128_si32 (
	    _mm_adds_epu8 (_mm_cvtsi32_si128 (s), _mm_cvtsi32_si128 (d)));
	if (pm)
	    pm++;
    }
}
1401
1402static force_inline uint32_t
1403core_combine_saturate_u_pixel_sse2 (uint32_t src,
1404                                    uint32_t dst)
1405{
1406    __m128i ms = unpack_32_1x128 (src);
1407    __m128i md = unpack_32_1x128 (dst);
1408    uint32_t sa = src >> 24;
1409    uint32_t da = ~dst >> 24;
1410
1411    if (sa > da)
1412    {
1413	ms = pix_multiply_1x128 (
1414	    ms, expand_alpha_1x128 (unpack_32_1x128 (DIV_UN8 (da, sa) << 24)));
1415    }
1416
1417    return pack_1x128_32 (_mm_adds_epu16 (md, ms));
1418}
1419
/*
 * Unified-alpha SATURATE combiner.  The 4-pixel loop first tests
 * whether any of the four source alphas exceeds the corresponding
 * ~dst alpha; only then does it fall back to the slow per-pixel
 * path, otherwise a plain saturated add suffices.
 */
static void
sse2_combine_saturate_u (pixman_implementation_t *imp,
                         pixman_op_t              op,
                         uint32_t *               pd,
                         const uint32_t *         ps,
                         const uint32_t *         pm,
                         int                      w)
{
    uint32_t s, d;

    uint32_t pack_cmp;
    __m128i xmm_src, xmm_dst;

    /* Head: single pixels until pd is 16-byte aligned. */
    while (w && (uintptr_t)pd & 15)
    {
	s = combine1 (ps, pm);
	d = *pd;

	*pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
	w--;
	ps++;
	if (pm)
	    pm++;
    }

    while (w >= 4)
    {
	xmm_dst = load_128_aligned  ((__m128i*)pd);
	xmm_src = combine4 ((__m128i*)ps, (__m128i*)pm);

	/* Compare src.a against ~dst.a in each of the four lanes.
	 * The shifts bring both alphas into 0..255, so the signed
	 * 32-bit compare is safe. */
	pack_cmp = _mm_movemask_epi8 (
	    _mm_cmpgt_epi32 (
		_mm_srli_epi32 (xmm_src, 24),
		_mm_srli_epi32 (_mm_xor_si128 (xmm_dst, mask_ff000000), 24)));

	/* if some alpha src is greater than respective ~alpha dst */
	if (pack_cmp)
	{
	    /* Slow path: handle all four pixels individually. */
	    s = combine1 (ps++, pm);
	    d = *pd;
	    *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
	    if (pm)
		pm++;

	    s = combine1 (ps++, pm);
	    d = *pd;
	    *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
	    if (pm)
		pm++;

	    s = combine1 (ps++, pm);
	    d = *pd;
	    *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
	    if (pm)
		pm++;

	    s = combine1 (ps++, pm);
	    d = *pd;
	    *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
	    if (pm)
		pm++;
	}
	else
	{
	    /* Fast path: no overflow possible, saturated add is exact. */
	    save_128_aligned ((__m128i*)pd, _mm_adds_epu8 (xmm_dst, xmm_src));

	    pd += 4;
	    ps += 4;
	    if (pm)
		pm += 4;
	}

	w -= 4;
    }

    /* Tail: remaining pixels. */
    while (w--)
    {
	s = combine1 (ps, pm);
	d = *pd;

	*pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
	ps++;
	if (pm)
	    pm++;
    }
}
1506
/*
 * Component-alpha SRC combiner:
 *
 *   dst = src * mask   (per channel)
 *
 * pm is dereferenced unconditionally, so the mask must be non-NULL
 * for all the _ca combiners.
 */
static void
sse2_combine_src_ca (pixman_implementation_t *imp,
                     pixman_op_t              op,
                     uint32_t *               pd,
                     const uint32_t *         ps,
                     const uint32_t *         pm,
                     int                      w)
{
    uint32_t s, m;

    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_mask_lo, xmm_mask_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;

    /* Head: single pixels until pd is 16-byte aligned. */
    while (w && (uintptr_t)pd & 15)
    {
	s = *ps++;
	m = *pm++;
	*pd++ = pack_1x128_32 (
	    pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)));
	w--;
    }

    /* Main loop: four pixels per iteration. */
    while (w >= 4)
    {
	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);

	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
			    &xmm_mask_lo, &xmm_mask_hi,
			    &xmm_dst_lo, &xmm_dst_hi);

	save_128_aligned (
	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

	ps += 4;
	pd += 4;
	pm += 4;
	w -= 4;
    }

    /* Tail: remaining pixels. */
    while (w)
    {
	s = *ps++;
	m = *pm++;
	*pd++ = pack_1x128_32 (
	    pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)));
	w--;
    }
}
1560
1561static force_inline uint32_t
1562core_combine_over_ca_pixel_sse2 (uint32_t src,
1563                                 uint32_t mask,
1564                                 uint32_t dst)
1565{
1566    __m128i s = unpack_32_1x128 (src);
1567    __m128i expAlpha = expand_alpha_1x128 (s);
1568    __m128i unpk_mask = unpack_32_1x128 (mask);
1569    __m128i unpk_dst  = unpack_32_1x128 (dst);
1570
1571    return pack_1x128_32 (in_over_1x128 (&s, &expAlpha, &unpk_mask, &unpk_dst));
1572}
1573
/*
 * Component-alpha OVER combiner: composes (src IN mask) over dst
 * using in_over_2x128 for the 4-pixel main loop.
 */
static void
sse2_combine_over_ca (pixman_implementation_t *imp,
                      pixman_op_t              op,
                      uint32_t *               pd,
                      const uint32_t *         ps,
                      const uint32_t *         pm,
                      int                      w)
{
    uint32_t s, m, d;

    __m128i xmm_alpha_lo, xmm_alpha_hi;
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_mask_lo, xmm_mask_hi;

    /* Head: single pixels until pd is 16-byte aligned. */
    while (w && (uintptr_t)pd & 15)
    {
	s = *ps++;
	m = *pm++;
	d = *pd;

	*pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
	w--;
    }

    /* Main loop: four pixels per iteration. */
    while (w >= 4)
    {
	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);

	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
			    &xmm_alpha_lo, &xmm_alpha_hi);

	/* (src IN mask) OVER dst, result left in xmm_dst_{lo,hi}. */
	in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
		       &xmm_alpha_lo, &xmm_alpha_hi,
		       &xmm_mask_lo, &xmm_mask_hi,
		       &xmm_dst_lo, &xmm_dst_hi);

	save_128_aligned (
	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

	ps += 4;
	pd += 4;
	pm += 4;
	w -= 4;
    }

    /* Tail: remaining pixels. */
    while (w)
    {
	s = *ps++;
	m = *pm++;
	d = *pd;

	*pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
	w--;
    }
}
1636
1637static force_inline uint32_t
1638core_combine_over_reverse_ca_pixel_sse2 (uint32_t src,
1639                                         uint32_t mask,
1640                                         uint32_t dst)
1641{
1642    __m128i d = unpack_32_1x128 (dst);
1643
1644    return pack_1x128_32 (
1645	over_1x128 (d, expand_alpha_1x128 (d),
1646		    pix_multiply_1x128 (unpack_32_1x128 (src),
1647					unpack_32_1x128 (mask))));
1648}
1649
/*
 * Component-alpha OVER_REVERSE combiner: dst over (src IN mask).
 * The masked source is built in xmm_mask_{lo,hi} and the final
 * result is stored from there.
 */
static void
sse2_combine_over_reverse_ca (pixman_implementation_t *imp,
                              pixman_op_t              op,
                              uint32_t *               pd,
                              const uint32_t *         ps,
                              const uint32_t *         pm,
                              int                      w)
{
    uint32_t s, m, d;

    __m128i xmm_alpha_lo, xmm_alpha_hi;
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_mask_lo, xmm_mask_hi;

    /* Head: single pixels until pd is 16-byte aligned. */
    while (w && (uintptr_t)pd & 15)
    {
	s = *ps++;
	m = *pm++;
	d = *pd;

	*pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);
	w--;
    }

    /* Main loop: four pixels per iteration. */
    while (w >= 4)
    {
	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);

	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
			    &xmm_alpha_lo, &xmm_alpha_hi);
	/* mask := src * mask (the composited source). */
	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
			    &xmm_mask_lo, &xmm_mask_hi,
			    &xmm_mask_lo, &xmm_mask_hi);

	/* dst OVER (src IN mask), result in xmm_mask_{lo,hi}. */
	over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
		    &xmm_alpha_lo, &xmm_alpha_hi,
		    &xmm_mask_lo, &xmm_mask_hi);

	save_128_aligned (
	    (__m128i*)pd, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));

	ps += 4;
	pd += 4;
	pm += 4;
	w -= 4;
    }

    /* Tail: remaining pixels. */
    while (w)
    {
	s = *ps++;
	m = *pm++;
	d = *pd;

	*pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);
	w--;
    }
}
1714
/*
 * Component-alpha IN combiner:
 *
 *   dst = (src * mask) * dst.a
 */
static void
sse2_combine_in_ca (pixman_implementation_t *imp,
                    pixman_op_t              op,
                    uint32_t *               pd,
                    const uint32_t *         ps,
                    const uint32_t *         pm,
                    int                      w)
{
    uint32_t s, m, d;

    __m128i xmm_alpha_lo, xmm_alpha_hi;
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_mask_lo, xmm_mask_hi;

    /* Head: single pixels until pd is 16-byte aligned. */
    while (w && (uintptr_t)pd & 15)
    {
	s = *ps++;
	m = *pm++;
	d = *pd;

	*pd++ = pack_1x128_32 (
	    pix_multiply_1x128 (
		pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)),
		expand_alpha_1x128 (unpack_32_1x128 (d))));

	w--;
    }

    /* Main loop: four pixels per iteration. */
    while (w >= 4)
    {
	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);

	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
			    &xmm_alpha_lo, &xmm_alpha_hi);

	/* dst := src * mask */
	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
			    &xmm_mask_lo, &xmm_mask_hi,
			    &xmm_dst_lo, &xmm_dst_hi);

	/* dst := (src * mask) * dst.a */
	pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
			    &xmm_alpha_lo, &xmm_alpha_hi,
			    &xmm_dst_lo, &xmm_dst_hi);

	save_128_aligned (
	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

	ps += 4;
	pd += 4;
	pm += 4;
	w -= 4;
    }

    /* Tail: remaining pixels. */
    while (w)
    {
	s = *ps++;
	m = *pm++;
	d = *pd;

	*pd++ = pack_1x128_32 (
	    pix_multiply_1x128 (
		pix_multiply_1x128 (
		    unpack_32_1x128 (s), unpack_32_1x128 (m)),
		expand_alpha_1x128 (unpack_32_1x128 (d))));

	w--;
    }
}
1789
/*
 * Component-alpha IN_REVERSE combiner:
 *
 *   dst = dst * (mask * src.a)
 */
static void
sse2_combine_in_reverse_ca (pixman_implementation_t *imp,
                            pixman_op_t              op,
                            uint32_t *               pd,
                            const uint32_t *         ps,
                            const uint32_t *         pm,
                            int                      w)
{
    uint32_t s, m, d;

    __m128i xmm_alpha_lo, xmm_alpha_hi;
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_mask_lo, xmm_mask_hi;

    /* Head: single pixels until pd is 16-byte aligned. */
    while (w && (uintptr_t)pd & 15)
    {
	s = *ps++;
	m = *pm++;
	d = *pd;

	*pd++ = pack_1x128_32 (
	    pix_multiply_1x128 (
		unpack_32_1x128 (d),
		pix_multiply_1x128 (unpack_32_1x128 (m),
				   expand_alpha_1x128 (unpack_32_1x128 (s)))));
	w--;
    }

    /* Main loop: four pixels per iteration. */
    while (w >= 4)
    {
	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);

	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

	/* alpha := mask * src.a */
	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
			    &xmm_alpha_lo, &xmm_alpha_hi);
	pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
			    &xmm_alpha_lo, &xmm_alpha_hi,
			    &xmm_alpha_lo, &xmm_alpha_hi);

	/* dst := dst * (mask * src.a) */
	pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
			    &xmm_alpha_lo, &xmm_alpha_hi,
			    &xmm_dst_lo, &xmm_dst_hi);

	save_128_aligned (
	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

	ps += 4;
	pd += 4;
	pm += 4;
	w -= 4;
    }

    /* Tail: remaining pixels. */
    while (w)
    {
	s = *ps++;
	m = *pm++;
	d = *pd;

	*pd++ = pack_1x128_32 (
	    pix_multiply_1x128 (
		unpack_32_1x128 (d),
		pix_multiply_1x128 (unpack_32_1x128 (m),
				   expand_alpha_1x128 (unpack_32_1x128 (s)))));
	w--;
    }
}
1862
/*
 * Component-alpha OUT combiner:
 *
 *   dst = (src * mask) * (1 - dst.a)
 */
static void
sse2_combine_out_ca (pixman_implementation_t *imp,
                     pixman_op_t              op,
                     uint32_t *               pd,
                     const uint32_t *         ps,
                     const uint32_t *         pm,
                     int                      w)
{
    uint32_t s, m, d;

    __m128i xmm_alpha_lo, xmm_alpha_hi;
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_mask_lo, xmm_mask_hi;

    /* Head: single pixels until pd is 16-byte aligned. */
    while (w && (uintptr_t)pd & 15)
    {
	s = *ps++;
	m = *pm++;
	d = *pd;

	*pd++ = pack_1x128_32 (
	    pix_multiply_1x128 (
		pix_multiply_1x128 (
		    unpack_32_1x128 (s), unpack_32_1x128 (m)),
		negate_1x128 (expand_alpha_1x128 (unpack_32_1x128 (d)))));
	w--;
    }

    /* Main loop: four pixels per iteration. */
    while (w >= 4)
    {
	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);

	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

	/* alpha := 1 - dst.a */
	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
			    &xmm_alpha_lo, &xmm_alpha_hi);
	negate_2x128 (xmm_alpha_lo, xmm_alpha_hi,
		      &xmm_alpha_lo, &xmm_alpha_hi);

	/* dst := (src * mask) * (1 - dst.a) */
	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
			    &xmm_mask_lo, &xmm_mask_hi,
			    &xmm_dst_lo, &xmm_dst_hi);
	pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
			    &xmm_alpha_lo, &xmm_alpha_hi,
			    &xmm_dst_lo, &xmm_dst_hi);

	save_128_aligned (
	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

	ps += 4;
	pd += 4;
	pm += 4;
	w -= 4;
    }

    /* Tail: remaining pixels. */
    while (w)
    {
	s = *ps++;
	m = *pm++;
	d = *pd;

	*pd++ = pack_1x128_32 (
	    pix_multiply_1x128 (
		pix_multiply_1x128 (
		    unpack_32_1x128 (s), unpack_32_1x128 (m)),
		negate_1x128 (expand_alpha_1x128 (unpack_32_1x128 (d)))));

	w--;
    }
}
1938
/*
 * Component-alpha OUT_REVERSE combiner:
 *
 *   dst = dst * (1 - mask * src.a)
 */
static void
sse2_combine_out_reverse_ca (pixman_implementation_t *imp,
                             pixman_op_t              op,
                             uint32_t *               pd,
                             const uint32_t *         ps,
                             const uint32_t *         pm,
                             int                      w)
{
    uint32_t s, m, d;

    __m128i xmm_alpha_lo, xmm_alpha_hi;
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_mask_lo, xmm_mask_hi;

    /* Head: single pixels until pd is 16-byte aligned. */
    while (w && (uintptr_t)pd & 15)
    {
	s = *ps++;
	m = *pm++;
	d = *pd;

	*pd++ = pack_1x128_32 (
	    pix_multiply_1x128 (
		unpack_32_1x128 (d),
		negate_1x128 (pix_multiply_1x128 (
				 unpack_32_1x128 (m),
				 expand_alpha_1x128 (unpack_32_1x128 (s))))));
	w--;
    }

    /* Main loop: four pixels per iteration. */
    while (w >= 4)
    {
	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);

	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
			    &xmm_alpha_lo, &xmm_alpha_hi);

	/* mask := 1 - mask * src.a */
	pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
			    &xmm_alpha_lo, &xmm_alpha_hi,
			    &xmm_mask_lo, &xmm_mask_hi);

	negate_2x128 (xmm_mask_lo, xmm_mask_hi,
		      &xmm_mask_lo, &xmm_mask_hi);

	/* dst := dst * (1 - mask * src.a) */
	pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
			    &xmm_mask_lo, &xmm_mask_hi,
			    &xmm_dst_lo, &xmm_dst_hi);

	save_128_aligned (
	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

	ps += 4;
	pd += 4;
	pm += 4;
	w -= 4;
    }

    /* Tail: remaining pixels. */
    while (w)
    {
	s = *ps++;
	m = *pm++;
	d = *pd;

	*pd++ = pack_1x128_32 (
	    pix_multiply_1x128 (
		unpack_32_1x128 (d),
		negate_1x128 (pix_multiply_1x128 (
				 unpack_32_1x128 (m),
				 expand_alpha_1x128 (unpack_32_1x128 (s))))));
	w--;
    }
}
2017
2018static force_inline uint32_t
2019core_combine_atop_ca_pixel_sse2 (uint32_t src,
2020                                 uint32_t mask,
2021                                 uint32_t dst)
2022{
2023    __m128i m = unpack_32_1x128 (mask);
2024    __m128i s = unpack_32_1x128 (src);
2025    __m128i d = unpack_32_1x128 (dst);
2026    __m128i sa = expand_alpha_1x128 (s);
2027    __m128i da = expand_alpha_1x128 (d);
2028
2029    s = pix_multiply_1x128 (s, m);
2030    m = negate_1x128 (pix_multiply_1x128 (m, sa));
2031
2032    return pack_1x128_32 (pix_add_multiply_1x128 (&d, &m, &s, &da));
2033}
2034
/*
 * Component-alpha ATOP combiner:
 *
 *   dst = (src * mask) * dst.a + dst * (1 - mask * src.a)
 */
static void
sse2_combine_atop_ca (pixman_implementation_t *imp,
                      pixman_op_t              op,
                      uint32_t *               pd,
                      const uint32_t *         ps,
                      const uint32_t *         pm,
                      int                      w)
{
    uint32_t s, m, d;

    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
    __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
    __m128i xmm_mask_lo, xmm_mask_hi;

    /* Head: single pixels until pd is 16-byte aligned. */
    while (w && (uintptr_t)pd & 15)
    {
	s = *ps++;
	m = *pm++;
	d = *pd;

	*pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d);
	w--;
    }

    /* Main loop: four pixels per iteration. */
    while (w >= 4)
    {
	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);

	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
			    &xmm_alpha_src_lo, &xmm_alpha_src_hi);
	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
			    &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

	/* src := src * mask;  mask := 1 - mask * src.a */
	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
			    &xmm_mask_lo, &xmm_mask_hi,
			    &xmm_src_lo, &xmm_src_hi);
	pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
			    &xmm_alpha_src_lo, &xmm_alpha_src_hi,
			    &xmm_mask_lo, &xmm_mask_hi);

	negate_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

	/* dst = dst * (1 - mask * src.a) + (src * mask) * dst.a */
	pix_add_multiply_2x128 (
	    &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
	    &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
	    &xmm_dst_lo, &xmm_dst_hi);

	save_128_aligned (
	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

	ps += 4;
	pd += 4;
	pm += 4;
	w -= 4;
    }

    /* Tail: remaining pixels. */
    while (w)
    {
	s = *ps++;
	m = *pm++;
	d = *pd;

	*pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d);
	w--;
    }
}
2109
2110static force_inline uint32_t
2111core_combine_reverse_atop_ca_pixel_sse2 (uint32_t src,
2112                                         uint32_t mask,
2113                                         uint32_t dst)
2114{
2115    __m128i m = unpack_32_1x128 (mask);
2116    __m128i s = unpack_32_1x128 (src);
2117    __m128i d = unpack_32_1x128 (dst);
2118
2119    __m128i da = negate_1x128 (expand_alpha_1x128 (d));
2120    __m128i sa = expand_alpha_1x128 (s);
2121
2122    s = pix_multiply_1x128 (s, m);
2123    m = pix_multiply_1x128 (m, sa);
2124
2125    return pack_1x128_32 (pix_add_multiply_1x128 (&d, &m, &s, &da));
2126}
2127
/*
 * Component-alpha ATOP_REVERSE combiner:
 *
 *   dst = (src * mask) * (1 - dst.a) + dst * (mask * src.a)
 */
static void
sse2_combine_atop_reverse_ca (pixman_implementation_t *imp,
                              pixman_op_t              op,
                              uint32_t *               pd,
                              const uint32_t *         ps,
                              const uint32_t *         pm,
                              int                      w)
{
    uint32_t s, m, d;

    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
    __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
    __m128i xmm_mask_lo, xmm_mask_hi;

    /* Head: single pixels until pd is 16-byte aligned. */
    while (w && (uintptr_t)pd & 15)
    {
	s = *ps++;
	m = *pm++;
	d = *pd;

	*pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d);
	w--;
    }

    /* Main loop: four pixels per iteration. */
    while (w >= 4)
    {
	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);

	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
			    &xmm_alpha_src_lo, &xmm_alpha_src_hi);
	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
			    &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

	/* src := src * mask;  mask := mask * src.a */
	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
			    &xmm_mask_lo, &xmm_mask_hi,
			    &xmm_src_lo, &xmm_src_hi);
	pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
			    &xmm_alpha_src_lo, &xmm_alpha_src_hi,
			    &xmm_mask_lo, &xmm_mask_hi);

	/* alpha_dst := 1 - dst.a */
	negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
		      &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

	/* dst = dst * (mask * src.a) + (src * mask) * (1 - dst.a) */
	pix_add_multiply_2x128 (
	    &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
	    &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
	    &xmm_dst_lo, &xmm_dst_hi);

	save_128_aligned (
	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

	ps += 4;
	pd += 4;
	pm += 4;
	w -= 4;
    }

    /* Tail: remaining pixels. */
    while (w)
    {
	s = *ps++;
	m = *pm++;
	d = *pd;

	*pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d);
	w--;
    }
}
2203
2204static force_inline uint32_t
2205core_combine_xor_ca_pixel_sse2 (uint32_t src,
2206                                uint32_t mask,
2207                                uint32_t dst)
2208{
2209    __m128i a = unpack_32_1x128 (mask);
2210    __m128i s = unpack_32_1x128 (src);
2211    __m128i d = unpack_32_1x128 (dst);
2212
2213    __m128i alpha_dst = negate_1x128 (pix_multiply_1x128 (
2214				       a, expand_alpha_1x128 (s)));
2215    __m128i dest      = pix_multiply_1x128 (s, a);
2216    __m128i alpha_src = negate_1x128 (expand_alpha_1x128 (d));
2217
2218    return pack_1x128_32 (pix_add_multiply_1x128 (&d,
2219                                                &alpha_dst,
2220                                                &dest,
2221                                                &alpha_src));
2222}
2223
/*
 * Component-alpha XOR combiner:
 *
 *   dst = (src * mask) * (1 - dst.a) + dst * (1 - mask * src.a)
 */
static void
sse2_combine_xor_ca (pixman_implementation_t *imp,
                     pixman_op_t              op,
                     uint32_t *               pd,
                     const uint32_t *         ps,
                     const uint32_t *         pm,
                     int                      w)
{
    uint32_t s, m, d;

    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
    __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
    __m128i xmm_mask_lo, xmm_mask_hi;

    /* Head: single pixels until pd is 16-byte aligned. */
    while (w && (uintptr_t)pd & 15)
    {
	s = *ps++;
	m = *pm++;
	d = *pd;

	*pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d);
	w--;
    }

    /* Main loop: four pixels per iteration. */
    while (w >= 4)
    {
	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);

	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
			    &xmm_alpha_src_lo, &xmm_alpha_src_hi);
	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
			    &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

	/* src := src * mask;  mask := mask * src.a */
	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
			    &xmm_mask_lo, &xmm_mask_hi,
			    &xmm_src_lo, &xmm_src_hi);
	pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
			    &xmm_alpha_src_lo, &xmm_alpha_src_hi,
			    &xmm_mask_lo, &xmm_mask_hi);

	/* Complement both factors for XOR. */
	negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
		      &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
	negate_2x128 (xmm_mask_lo, xmm_mask_hi,
		      &xmm_mask_lo, &xmm_mask_hi);

	/* dst = dst * (1 - mask * src.a) + (src * mask) * (1 - dst.a) */
	pix_add_multiply_2x128 (
	    &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
	    &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
	    &xmm_dst_lo, &xmm_dst_hi);

	save_128_aligned (
	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

	ps += 4;
	pd += 4;
	pm += 4;
	w -= 4;
    }

    /* Tail: remaining pixels. */
    while (w)
    {
	s = *ps++;
	m = *pm++;
	d = *pd;

	*pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d);
	w--;
    }
}
2301
2302static void
2303sse2_combine_add_ca (pixman_implementation_t *imp,
2304                     pixman_op_t              op,
2305                     uint32_t *               pd,
2306                     const uint32_t *         ps,
2307                     const uint32_t *         pm,
2308                     int                      w)
2309{
2310    uint32_t s, m, d;
2311
2312    __m128i xmm_src_lo, xmm_src_hi;
2313    __m128i xmm_dst_lo, xmm_dst_hi;
2314    __m128i xmm_mask_lo, xmm_mask_hi;
2315
2316    while (w && (uintptr_t)pd & 15)
2317    {
2318	s = *ps++;
2319	m = *pm++;
2320	d = *pd;
2321
2322	*pd++ = pack_1x128_32 (
2323	    _mm_adds_epu8 (pix_multiply_1x128 (unpack_32_1x128 (s),
2324					       unpack_32_1x128 (m)),
2325			   unpack_32_1x128 (d)));
2326	w--;
2327    }
2328
2329    while (w >= 4)
2330    {
2331	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2332	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2333	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2334
2335	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2336	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2337	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2338
2339	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2340			    &xmm_mask_lo, &xmm_mask_hi,
2341			    &xmm_src_lo, &xmm_src_hi);
2342
2343	save_128_aligned (
2344	    (__m128i*)pd, pack_2x128_128 (
2345		_mm_adds_epu8 (xmm_src_lo, xmm_dst_lo),
2346		_mm_adds_epu8 (xmm_src_hi, xmm_dst_hi)));
2347
2348	ps += 4;
2349	pd += 4;
2350	pm += 4;
2351	w -= 4;
2352    }
2353
2354    while (w)
2355    {
2356	s = *ps++;
2357	m = *pm++;
2358	d = *pd;
2359
2360	*pd++ = pack_1x128_32 (
2361	    _mm_adds_epu8 (pix_multiply_1x128 (unpack_32_1x128 (s),
2362					       unpack_32_1x128 (m)),
2363			   unpack_32_1x128 (d)));
2364	w--;
2365    }
2366}
2367
2368static force_inline __m128i
2369create_mask_16_128 (uint16_t mask)
2370{
2371    return _mm_set1_epi16 (mask);
2372}
2373
/* Build the 128-bit constant { mask0, mask1, mask0, mask1 } (listed from
 * the most significant 32-bit element down, matching _mm_set_epi32's
 * argument order).
 *
 * Work around a code generation bug in Sun Studio 12. */
#if defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590)
# define create_mask_2x32_128(mask0, mask1)				\
    (_mm_set_epi32 ((mask0), (mask1), (mask0), (mask1)))
#else
static force_inline __m128i
create_mask_2x32_128 (uint32_t mask0,
                      uint32_t mask1)
{
    return _mm_set_epi32 (mask0, mask1, mask0, mask1);
}
#endif
2386
/*
 * OVER with a solid a8r8g8b8 source onto a 32-bit destination.  The
 * source pixel and its expanded alpha are hoisted out of the loops; a
 * fully transparent source (src == 0) is a no-op for OVER.
 */
static void
sse2_composite_over_n_8888 (pixman_implementation_t *imp,
                            pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t src;
    uint32_t    *dst_line, *dst, d;
    int32_t w;
    int dst_stride;
    __m128i xmm_src, xmm_alpha;
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

    /* OVER with a zero source leaves the destination unchanged */
    if (src == 0)
	return;

    PIXMAN_IMAGE_GET_LINE (
	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);

    xmm_src = expand_pixel_32_1x128 (src);
    xmm_alpha = expand_alpha_1x128 (xmm_src);

    while (height--)
    {
	dst = dst_line;

	dst_line += dst_stride;
	w = width;

	/* Head: single pixels until dst is 16-byte aligned */
	while (w && (uintptr_t)dst & 15)
	{
	    d = *dst;
	    *dst++ = pack_1x128_32 (over_1x128 (xmm_src,
						xmm_alpha,
						unpack_32_1x128 (d)));
	    w--;
	}

	/* Body: four pixels per iteration with aligned accesses */
	while (w >= 4)
	{
	    xmm_dst = load_128_aligned ((__m128i*)dst);

	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);

	    over_2x128 (&xmm_src, &xmm_src,
			&xmm_alpha, &xmm_alpha,
			&xmm_dst_lo, &xmm_dst_hi);

	    /* rebuild the 4 pixel data and save */
	    save_128_aligned (
		(__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

	    w -= 4;
	    dst += 4;
	}

	/* Tail: remaining 1-3 pixels */
	while (w)
	{
	    d = *dst;
	    *dst++ = pack_1x128_32 (over_1x128 (xmm_src,
						xmm_alpha,
						unpack_32_1x128 (d)));
	    w--;
	}

    }
}
2455
/*
 * OVER with a solid source onto an r5g6b5 destination.  Destination
 * pixels are expanded from 565 to 16-bit-per-channel form, blended,
 * then packed back to 565.  Eight 16-bit pixels fit in one XMM store.
 */
static void
sse2_composite_over_n_0565 (pixman_implementation_t *imp,
                            pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t src;
    uint16_t    *dst_line, *dst, d;
    int32_t w;
    int dst_stride;
    __m128i xmm_src, xmm_alpha;
    __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

    /* OVER with a zero source leaves the destination unchanged */
    if (src == 0)
	return;

    PIXMAN_IMAGE_GET_LINE (
	dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);

    xmm_src = expand_pixel_32_1x128 (src);
    xmm_alpha = expand_alpha_1x128 (xmm_src);

    while (height--)
    {
	dst = dst_line;

	dst_line += dst_stride;
	w = width;

	/* Head: single pixels until dst is 16-byte aligned */
	while (w && (uintptr_t)dst & 15)
	{
	    d = *dst;

	    *dst++ = pack_565_32_16 (
		pack_1x128_32 (over_1x128 (xmm_src,
					   xmm_alpha,
					   expand565_16_1x128 (d))));
	    w--;
	}

	/* Body: eight 565 pixels per iteration (one aligned load/store) */
	while (w >= 8)
	{
	    xmm_dst = load_128_aligned ((__m128i*)dst);

	    unpack_565_128_4x128 (xmm_dst,
				  &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);

	    over_2x128 (&xmm_src, &xmm_src,
			&xmm_alpha, &xmm_alpha,
			&xmm_dst0, &xmm_dst1);
	    over_2x128 (&xmm_src, &xmm_src,
			&xmm_alpha, &xmm_alpha,
			&xmm_dst2, &xmm_dst3);

	    xmm_dst = pack_565_4x128_128 (
		&xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);

	    save_128_aligned ((__m128i*)dst, xmm_dst);

	    dst += 8;
	    w -= 8;
	}

	/* Tail: remaining 1-7 pixels */
	while (w--)
	{
	    d = *dst;
	    *dst++ = pack_565_32_16 (
		pack_1x128_32 (over_1x128 (xmm_src, xmm_alpha,
					   expand565_16_1x128 (d))));
	}
    }

}
2530
/*
 * Component-alpha ADD of a solid source through an a8r8g8b8 mask:
 * dest = saturating_add (src * mask, dest) per channel.  Pixels whose
 * mask is entirely zero are skipped.
 */
static void
sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp,
				   pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t src;
    uint32_t    *dst_line, d;
    uint32_t    *mask_line, m;
    uint32_t pack_cmp;
    int dst_stride, mask_stride;

    __m128i xmm_src;
    __m128i xmm_dst;
    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;

    __m128i mmx_src, mmx_mask, mmx_dest;

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

    /* ADD with a zero source contributes nothing */
    if (src == 0)
	return;

    PIXMAN_IMAGE_GET_LINE (
	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (
	mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);

    /* Expand the solid pixel into 16-bit-per-channel form */
    xmm_src = _mm_unpacklo_epi8 (
	create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
    mmx_src   = xmm_src;

    while (height--)
    {
	int w = width;
	const uint32_t *pm = (uint32_t *)mask_line;
	uint32_t *pd = (uint32_t *)dst_line;

	dst_line += dst_stride;
	mask_line += mask_stride;

	/* Head: single pixels until pd is 16-byte aligned */
	while (w && (uintptr_t)pd & 15)
	{
	    m = *pm++;

	    if (m)
	    {
		d = *pd;

		mmx_mask = unpack_32_1x128 (m);
		mmx_dest = unpack_32_1x128 (d);

		*pd = pack_1x128_32 (
		    _mm_adds_epu8 (pix_multiply_1x128 (mmx_mask, mmx_src),
				   mmx_dest));
	    }

	    pd++;
	    w--;
	}

	/* Body: four pixels per iteration */
	while (w >= 4)
	{
	    xmm_mask = load_128_unaligned ((__m128i*)pm);

	    pack_cmp =
		_mm_movemask_epi8 (
		    _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));

	    /* if all four mask pixels are zero, pack_cmp is 0xffff and
	     * the destination can be left untouched */
	    if (pack_cmp != 0xffff)
	    {
		xmm_dst = load_128_aligned ((__m128i*)pd);

		unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);

		pix_multiply_2x128 (&xmm_src, &xmm_src,
				    &xmm_mask_lo, &xmm_mask_hi,
				    &xmm_mask_lo, &xmm_mask_hi);
		xmm_mask_hi = pack_2x128_128 (xmm_mask_lo, xmm_mask_hi);

		save_128_aligned (
		    (__m128i*)pd, _mm_adds_epu8 (xmm_mask_hi, xmm_dst));
	    }

	    pd += 4;
	    pm += 4;
	    w -= 4;
	}

	/* Tail: remaining 1-3 pixels */
	while (w)
	{
	    m = *pm++;

	    if (m)
	    {
		d = *pd;

		mmx_mask = unpack_32_1x128 (m);
		mmx_dest = unpack_32_1x128 (d);

		*pd = pack_1x128_32 (
		    _mm_adds_epu8 (pix_multiply_1x128 (mmx_mask, mmx_src),
				   mmx_dest));
	    }

	    pd++;
	    w--;
	}
    }

}
2642
/*
 * Component-alpha OVER of a solid source through an a8r8g8b8 mask onto
 * a 32-bit destination.  Pixels whose mask is entirely zero are skipped;
 * the solid pixel and its alpha are expanded once up front.
 */
static void
sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
                                    pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t src;
    uint32_t    *dst_line, d;
    uint32_t    *mask_line, m;
    uint32_t pack_cmp;
    int dst_stride, mask_stride;

    __m128i xmm_src, xmm_alpha;
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;

    __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

    /* OVER with a zero source leaves the destination unchanged */
    if (src == 0)
	return;

    PIXMAN_IMAGE_GET_LINE (
	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (
	mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);

    /* Expand the solid pixel into 16-bit-per-channel form */
    xmm_src = _mm_unpacklo_epi8 (
	create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
    xmm_alpha = expand_alpha_1x128 (xmm_src);
    mmx_src   = xmm_src;
    mmx_alpha = xmm_alpha;

    while (height--)
    {
	int w = width;
	const uint32_t *pm = (uint32_t *)mask_line;
	uint32_t *pd = (uint32_t *)dst_line;

	dst_line += dst_stride;
	mask_line += mask_stride;

	/* Head: single pixels until pd is 16-byte aligned */
	while (w && (uintptr_t)pd & 15)
	{
	    m = *pm++;

	    if (m)
	    {
		d = *pd;
		mmx_mask = unpack_32_1x128 (m);
		mmx_dest = unpack_32_1x128 (d);

		*pd = pack_1x128_32 (in_over_1x128 (&mmx_src,
		                                  &mmx_alpha,
		                                  &mmx_mask,
		                                  &mmx_dest));
	    }

	    pd++;
	    w--;
	}

	/* Body: four pixels per iteration */
	while (w >= 4)
	{
	    xmm_mask = load_128_unaligned ((__m128i*)pm);

	    pack_cmp =
		_mm_movemask_epi8 (
		    _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));

	    /* if all four mask pixels are zero, pack_cmp is 0xffff and
	     * the destination can be left untouched */
	    if (pack_cmp != 0xffff)
	    {
		xmm_dst = load_128_aligned ((__m128i*)pd);

		unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
		unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);

		in_over_2x128 (&xmm_src, &xmm_src,
			       &xmm_alpha, &xmm_alpha,
			       &xmm_mask_lo, &xmm_mask_hi,
			       &xmm_dst_lo, &xmm_dst_hi);

		save_128_aligned (
		    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
	    }

	    pd += 4;
	    pm += 4;
	    w -= 4;
	}

	/* Tail: remaining 1-3 pixels */
	while (w)
	{
	    m = *pm++;

	    if (m)
	    {
		d = *pd;
		mmx_mask = unpack_32_1x128 (m);
		mmx_dest = unpack_32_1x128 (d);

		*pd = pack_1x128_32 (
		    in_over_1x128 (&mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest));
	    }

	    pd++;
	    w--;
	}
    }

}
2755
2756static void
2757sse2_composite_over_8888_n_8888 (pixman_implementation_t *imp,
2758                                 pixman_composite_info_t *info)
2759{
2760    PIXMAN_COMPOSITE_ARGS (info);
2761    uint32_t    *dst_line, *dst;
2762    uint32_t    *src_line, *src;
2763    uint32_t mask;
2764    int32_t w;
2765    int dst_stride, src_stride;
2766
2767    __m128i xmm_mask;
2768    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
2769    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
2770    __m128i xmm_alpha_lo, xmm_alpha_hi;
2771
2772    PIXMAN_IMAGE_GET_LINE (
2773	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2774    PIXMAN_IMAGE_GET_LINE (
2775	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2776
2777    mask = _pixman_image_get_solid (imp, mask_image, PIXMAN_a8r8g8b8);
2778
2779    xmm_mask = create_mask_16_128 (mask >> 24);
2780
2781    while (height--)
2782    {
2783	dst = dst_line;
2784	dst_line += dst_stride;
2785	src = src_line;
2786	src_line += src_stride;
2787	w = width;
2788
2789	while (w && (uintptr_t)dst & 15)
2790	{
2791	    uint32_t s = *src++;
2792
2793	    if (s)
2794	    {
2795		uint32_t d = *dst;
2796
2797		__m128i ms = unpack_32_1x128 (s);
2798		__m128i alpha    = expand_alpha_1x128 (ms);
2799		__m128i dest     = xmm_mask;
2800		__m128i alpha_dst = unpack_32_1x128 (d);
2801
2802		*dst = pack_1x128_32 (
2803		    in_over_1x128 (&ms, &alpha, &dest, &alpha_dst));
2804	    }
2805	    dst++;
2806	    w--;
2807	}
2808
2809	while (w >= 4)
2810	{
2811	    xmm_src = load_128_unaligned ((__m128i*)src);
2812
2813	    if (!is_zero (xmm_src))
2814	    {
2815		xmm_dst = load_128_aligned ((__m128i*)dst);
2816
2817		unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
2818		unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
2819		expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2820				    &xmm_alpha_lo, &xmm_alpha_hi);
2821
2822		in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
2823			       &xmm_alpha_lo, &xmm_alpha_hi,
2824			       &xmm_mask, &xmm_mask,
2825			       &xmm_dst_lo, &xmm_dst_hi);
2826
2827		save_128_aligned (
2828		    (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2829	    }
2830
2831	    dst += 4;
2832	    src += 4;
2833	    w -= 4;
2834	}
2835
2836	while (w)
2837	{
2838	    uint32_t s = *src++;
2839
2840	    if (s)
2841	    {
2842		uint32_t d = *dst;
2843
2844		__m128i ms = unpack_32_1x128 (s);
2845		__m128i alpha = expand_alpha_1x128 (ms);
2846		__m128i mask  = xmm_mask;
2847		__m128i dest  = unpack_32_1x128 (d);
2848
2849		*dst = pack_1x128_32 (
2850		    in_over_1x128 (&ms, &alpha, &mask, &dest));
2851	    }
2852
2853	    dst++;
2854	    w--;
2855	}
2856    }
2857
2858}
2859
2860static void
2861sse2_composite_src_x888_0565 (pixman_implementation_t *imp,
2862                              pixman_composite_info_t *info)
2863{
2864    PIXMAN_COMPOSITE_ARGS (info);
2865    uint16_t    *dst_line, *dst;
2866    uint32_t    *src_line, *src, s;
2867    int dst_stride, src_stride;
2868    int32_t w;
2869
2870    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2871    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
2872
2873    while (height--)
2874    {
2875	dst = dst_line;
2876	dst_line += dst_stride;
2877	src = src_line;
2878	src_line += src_stride;
2879	w = width;
2880
2881	while (w && (uintptr_t)dst & 15)
2882	{
2883	    s = *src++;
2884	    *dst = convert_8888_to_0565 (s);
2885	    dst++;
2886	    w--;
2887	}
2888
2889	while (w >= 8)
2890	{
2891	    __m128i xmm_src0 = load_128_unaligned ((__m128i *)src + 0);
2892	    __m128i xmm_src1 = load_128_unaligned ((__m128i *)src + 1);
2893
2894	    save_128_aligned ((__m128i*)dst, pack_565_2packedx128_128 (xmm_src0, xmm_src1));
2895
2896	    w -= 8;
2897	    src += 8;
2898	    dst += 8;
2899	}
2900
2901	while (w)
2902	{
2903	    s = *src++;
2904	    *dst = convert_8888_to_0565 (s);
2905	    dst++;
2906	    w--;
2907	}
2908    }
2909}
2910
2911static void
2912sse2_composite_src_x888_8888 (pixman_implementation_t *imp,
2913			      pixman_composite_info_t *info)
2914{
2915    PIXMAN_COMPOSITE_ARGS (info);
2916    uint32_t    *dst_line, *dst;
2917    uint32_t    *src_line, *src;
2918    int32_t w;
2919    int dst_stride, src_stride;
2920
2921
2922    PIXMAN_IMAGE_GET_LINE (
2923	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2924    PIXMAN_IMAGE_GET_LINE (
2925	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2926
2927    while (height--)
2928    {
2929	dst = dst_line;
2930	dst_line += dst_stride;
2931	src = src_line;
2932	src_line += src_stride;
2933	w = width;
2934
2935	while (w && (uintptr_t)dst & 15)
2936	{
2937	    *dst++ = *src++ | 0xff000000;
2938	    w--;
2939	}
2940
2941	while (w >= 16)
2942	{
2943	    __m128i xmm_src1, xmm_src2, xmm_src3, xmm_src4;
2944
2945	    xmm_src1 = load_128_unaligned ((__m128i*)src + 0);
2946	    xmm_src2 = load_128_unaligned ((__m128i*)src + 1);
2947	    xmm_src3 = load_128_unaligned ((__m128i*)src + 2);
2948	    xmm_src4 = load_128_unaligned ((__m128i*)src + 3);
2949
2950	    save_128_aligned ((__m128i*)dst + 0, _mm_or_si128 (xmm_src1, mask_ff000000));
2951	    save_128_aligned ((__m128i*)dst + 1, _mm_or_si128 (xmm_src2, mask_ff000000));
2952	    save_128_aligned ((__m128i*)dst + 2, _mm_or_si128 (xmm_src3, mask_ff000000));
2953	    save_128_aligned ((__m128i*)dst + 3, _mm_or_si128 (xmm_src4, mask_ff000000));
2954
2955	    dst += 16;
2956	    src += 16;
2957	    w -= 16;
2958	}
2959
2960	while (w)
2961	{
2962	    *dst++ = *src++ | 0xff000000;
2963	    w--;
2964	}
2965    }
2966
2967}
2968
/*
 * OVER of an x8r8g8b8 source through a solid mask onto a 32-bit
 * destination.  The source's undefined alpha byte is forced to 0xff, so
 * the per-pixel alpha expansion reduces to the constant mask_00ff; only
 * the alpha byte of the solid mask is used.
 */
static void
sse2_composite_over_x888_n_8888 (pixman_implementation_t *imp,
                                 pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t    *dst_line, *dst;
    uint32_t    *src_line, *src;
    uint32_t mask;
    int dst_stride, src_stride;
    int32_t w;

    __m128i xmm_mask, xmm_alpha;
    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;

    PIXMAN_IMAGE_GET_LINE (
	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (
	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

    mask = _pixman_image_get_solid (imp, mask_image, PIXMAN_a8r8g8b8);

    xmm_mask = create_mask_16_128 (mask >> 24);
    /* source alpha is always opaque once the 0xff byte is OR-ed in */
    xmm_alpha = mask_00ff;

    while (height--)
    {
	dst = dst_line;
	dst_line += dst_stride;
	src = src_line;
	src_line += src_stride;
	w = width;

	/* Head: single pixels until dst is 16-byte aligned */
	while (w && (uintptr_t)dst & 15)
	{
	    uint32_t s = (*src++) | 0xff000000;
	    uint32_t d = *dst;

	    __m128i src   = unpack_32_1x128 (s);
	    __m128i alpha = xmm_alpha;
	    __m128i mask  = xmm_mask;
	    __m128i dest  = unpack_32_1x128 (d);

	    *dst++ = pack_1x128_32 (
		in_over_1x128 (&src, &alpha, &mask, &dest));

	    w--;
	}

	/* Body: four pixels per iteration */
	while (w >= 4)
	{
	    xmm_src = _mm_or_si128 (
		load_128_unaligned ((__m128i*)src), mask_ff000000);
	    xmm_dst = load_128_aligned ((__m128i*)dst);

	    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);

	    in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
			   &xmm_alpha, &xmm_alpha,
			   &xmm_mask, &xmm_mask,
			   &xmm_dst_lo, &xmm_dst_hi);

	    save_128_aligned (
		(__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

	    dst += 4;
	    src += 4;
	    w -= 4;

	}

	/* Tail: remaining 1-3 pixels */
	while (w)
	{
	    uint32_t s = (*src++) | 0xff000000;
	    uint32_t d = *dst;

	    __m128i src  = unpack_32_1x128 (s);
	    __m128i alpha = xmm_alpha;
	    __m128i mask  = xmm_mask;
	    __m128i dest  = unpack_32_1x128 (d);

	    *dst++ = pack_1x128_32 (
		in_over_1x128 (&src, &alpha, &mask, &dest));

	    w--;
	}
    }

}
3059
3060static void
3061sse2_composite_over_8888_8888 (pixman_implementation_t *imp,
3062                               pixman_composite_info_t *info)
3063{
3064    PIXMAN_COMPOSITE_ARGS (info);
3065    int dst_stride, src_stride;
3066    uint32_t    *dst_line, *dst;
3067    uint32_t    *src_line, *src;
3068
3069    PIXMAN_IMAGE_GET_LINE (
3070	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3071    PIXMAN_IMAGE_GET_LINE (
3072	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3073
3074    dst = dst_line;
3075    src = src_line;
3076
3077    while (height--)
3078    {
3079	sse2_combine_over_u (imp, op, dst, src, NULL, width);
3080
3081	dst += dst_stride;
3082	src += src_stride;
3083    }
3084}
3085
3086static force_inline uint16_t
3087composite_over_8888_0565pixel (uint32_t src, uint16_t dst)
3088{
3089    __m128i ms;
3090
3091    ms = unpack_32_1x128 (src);
3092    return pack_565_32_16 (
3093	pack_1x128_32 (
3094	    over_1x128 (
3095		ms, expand_alpha_1x128 (ms), expand565_16_1x128 (dst))));
3096}
3097
/*
 * OVER of an a8r8g8b8 source onto an r5g6b5 destination.  The wide loop
 * blends eight pixels per iteration: one aligned 565 load/store paired
 * with two unaligned 8888 source loads.
 */
static void
sse2_composite_over_8888_0565 (pixman_implementation_t *imp,
                               pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint16_t    *dst_line, *dst, d;
    uint32_t    *src_line, *src, s;
    int dst_stride, src_stride;
    int32_t w;

    __m128i xmm_alpha_lo, xmm_alpha_hi;
    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;

    PIXMAN_IMAGE_GET_LINE (
	dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (
	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

    while (height--)
    {
	dst = dst_line;
	src = src_line;

	dst_line += dst_stride;
	src_line += src_stride;
	w = width;

	/* Align dst on a 16-byte boundary */
	while (w &&
	       ((uintptr_t)dst & 15))
	{
	    s = *src++;
	    d = *dst;

	    *dst++ = composite_over_8888_0565pixel (s, d);
	    w--;
	}

	/* It's a 8 pixel loop */
	while (w >= 8)
	{
	    /* The source pointer may not be 16-byte aligned, so load
	     * it unaligned.
	     */
	    xmm_src = load_128_unaligned ((__m128i*) src);
	    xmm_dst = load_128_aligned ((__m128i*) dst);

	    /* Unpacking */
	    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
	    unpack_565_128_4x128 (xmm_dst,
				  &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
	    expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
				&xmm_alpha_lo, &xmm_alpha_hi);

	    /* Load the next 4 source pixels early to overlap the
	     * memory read with the blend below.
	     */
	    xmm_src = load_128_unaligned ((__m128i*) (src + 4));

	    over_2x128 (&xmm_src_lo, &xmm_src_hi,
			&xmm_alpha_lo, &xmm_alpha_hi,
			&xmm_dst0, &xmm_dst1);

	    /* Unpacking */
	    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
	    expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
				&xmm_alpha_lo, &xmm_alpha_hi);

	    over_2x128 (&xmm_src_lo, &xmm_src_hi,
			&xmm_alpha_lo, &xmm_alpha_hi,
			&xmm_dst2, &xmm_dst3);

	    save_128_aligned (
		(__m128i*)dst, pack_565_4x128_128 (
		    &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));

	    w -= 8;
	    dst += 8;
	    src += 8;
	}

	/* Tail: remaining 1-7 pixels */
	while (w--)
	{
	    s = *src++;
	    d = *dst;

	    *dst++ = composite_over_8888_0565pixel (s, d);
	}
    }

}
3190
/*
 * OVER of a solid source through an a8 mask onto a 32-bit destination.
 * Four mask bytes are fetched at a time in the wide loop; a fully
 * opaque source with a fully opaque mask degenerates to a plain store
 * of the solid pixel.
 */
static void
sse2_composite_over_n_8_8888 (pixman_implementation_t *imp,
                              pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t src, srca;
    uint32_t *dst_line, *dst;
    uint8_t *mask_line, *mask;
    int dst_stride, mask_stride;
    int32_t w;
    /* NOTE(review): this function-scope m is shadowed by the uint8_t m
     * declared in the head and tail loops below; only the wide loop
     * uses this one. */
    uint32_t m, d;

    __m128i xmm_src, xmm_alpha, xmm_def;
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;

    __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

    srca = src >> 24;
    /* OVER with a zero source leaves the destination unchanged */
    if (src == 0)
	return;

    PIXMAN_IMAGE_GET_LINE (
	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (
	mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);

    xmm_def = create_mask_2x32_128 (src, src);
    xmm_src = expand_pixel_32_1x128 (src);
    xmm_alpha = expand_alpha_1x128 (xmm_src);
    mmx_src   = xmm_src;
    mmx_alpha = xmm_alpha;

    while (height--)
    {
	dst = dst_line;
	dst_line += dst_stride;
	mask = mask_line;
	mask_line += mask_stride;
	w = width;

	/* Head: single pixels until dst is 16-byte aligned */
	while (w && (uintptr_t)dst & 15)
	{
	    uint8_t m = *mask++;

	    if (m)
	    {
		d = *dst;
		mmx_mask = expand_pixel_8_1x128 (m);
		mmx_dest = unpack_32_1x128 (d);

		*dst = pack_1x128_32 (in_over_1x128 (&mmx_src,
		                                   &mmx_alpha,
		                                   &mmx_mask,
		                                   &mmx_dest));
	    }

	    w--;
	    dst++;
	}

	/* Body: four pixels / four mask bytes per iteration */
	while (w >= 4)
	{
	    /* NOTE(review): reads 4 mask bytes through a uint32_t cast;
	     * mask is only byte-aligned here.  Fine on x86, but relies
	     * on unaligned-load tolerance -- confirm for other targets. */
	    m = *((uint32_t*)mask);

	    /* fully opaque source + fully opaque mask: just store src */
	    if (srca == 0xff && m == 0xffffffff)
	    {
		save_128_aligned ((__m128i*)dst, xmm_def);
	    }
	    else if (m)
	    {
		xmm_dst = load_128_aligned ((__m128i*) dst);
		xmm_mask = unpack_32_1x128 (m);
		xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());

		/* Unpacking */
		unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
		unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);

		expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
					&xmm_mask_lo, &xmm_mask_hi);

		in_over_2x128 (&xmm_src, &xmm_src,
			       &xmm_alpha, &xmm_alpha,
			       &xmm_mask_lo, &xmm_mask_hi,
			       &xmm_dst_lo, &xmm_dst_hi);

		save_128_aligned (
		    (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
	    }

	    w -= 4;
	    dst += 4;
	    mask += 4;
	}

	/* Tail: remaining 1-3 pixels */
	while (w)
	{
	    uint8_t m = *mask++;

	    if (m)
	    {
		d = *dst;
		mmx_mask = expand_pixel_8_1x128 (m);
		mmx_dest = unpack_32_1x128 (d);

		*dst = pack_1x128_32 (in_over_1x128 (&mmx_src,
		                                   &mmx_alpha,
		                                   &mmx_mask,
		                                   &mmx_dest));
	    }

	    w--;
	    dst++;
	}
    }

}
3311
/* On 32-bit GCC targets the incoming stack may only be 4-byte aligned;
 * force realignment so the 16-byte XMM stores below are safe. */
#if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
__attribute__((__force_align_arg_pointer__))
#endif
/*
 * Fill a width x height rectangle of an 8, 16 or 32 bpp image with a
 * constant value.  The filler is replicated to a full 32-bit pattern,
 * broadcast into an XMM register, and written with progressively
 * narrower stores to handle alignment head/tail.  Returns FALSE for
 * unsupported bpp so the caller can fall back to a generic path.
 */
static pixman_bool_t
sse2_fill (pixman_implementation_t *imp,
           uint32_t *               bits,
           int                      stride,
           int                      bpp,
           int                      x,
           int                      y,
           int                      width,
           int                      height,
           uint32_t		    filler)
{
    uint32_t byte_width;
    uint8_t *byte_line;

    __m128i xmm_def;

    if (bpp == 8)
    {
	uint8_t b;
	uint16_t w;

	/* convert the uint32_t stride to a byte stride for this bpp */
	stride = stride * (int) sizeof (uint32_t) / 1;
	byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x);
	byte_width = width;
	stride *= 1;

	/* replicate the low byte across all four bytes of filler */
	b = filler & 0xff;
	w = (b << 8) | b;
	filler = (w << 16) | w;
    }
    else if (bpp == 16)
    {
	stride = stride * (int) sizeof (uint32_t) / 2;
	byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
	byte_width = 2 * width;
	stride *= 2;

	/* replicate the low 16 bits into both halves of filler */
        filler = (filler & 0xffff) * 0x00010001;
    }
    else if (bpp == 32)
    {
	stride = stride * (int) sizeof (uint32_t) / 4;
	byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x);
	byte_width = 4 * width;
	stride *= 4;
    }
    else
    {
	return FALSE;
    }

    xmm_def = create_mask_2x32_128 (filler, filler);

    while (height--)
    {
	int w;
	uint8_t *d = byte_line;
	byte_line += stride;
	w = byte_width;

	/* Head: byte, then halfword, then word stores until d is
	 * 16-byte aligned */
	if (w >= 1 && ((uintptr_t)d & 1))
	{
	    *(uint8_t *)d = filler;
	    w -= 1;
	    d += 1;
	}

	while (w >= 2 && ((uintptr_t)d & 3))
	{
	    *(uint16_t *)d = filler;
	    w -= 2;
	    d += 2;
	}

	while (w >= 4 && ((uintptr_t)d & 15))
	{
	    *(uint32_t *)d = filler;

	    w -= 4;
	    d += 4;
	}

	/* Body: unrolled aligned 128-bit stores, widest run first */
	while (w >= 128)
	{
	    save_128_aligned ((__m128i*)(d),     xmm_def);
	    save_128_aligned ((__m128i*)(d + 16),  xmm_def);
	    save_128_aligned ((__m128i*)(d + 32),  xmm_def);
	    save_128_aligned ((__m128i*)(d + 48),  xmm_def);
	    save_128_aligned ((__m128i*)(d + 64),  xmm_def);
	    save_128_aligned ((__m128i*)(d + 80),  xmm_def);
	    save_128_aligned ((__m128i*)(d + 96),  xmm_def);
	    save_128_aligned ((__m128i*)(d + 112), xmm_def);

	    d += 128;
	    w -= 128;
	}

	if (w >= 64)
	{
	    save_128_aligned ((__m128i*)(d),     xmm_def);
	    save_128_aligned ((__m128i*)(d + 16),  xmm_def);
	    save_128_aligned ((__m128i*)(d + 32),  xmm_def);
	    save_128_aligned ((__m128i*)(d + 48),  xmm_def);

	    d += 64;
	    w -= 64;
	}

	if (w >= 32)
	{
	    save_128_aligned ((__m128i*)(d),     xmm_def);
	    save_128_aligned ((__m128i*)(d + 16),  xmm_def);

	    d += 32;
	    w -= 32;
	}

	if (w >= 16)
	{
	    save_128_aligned ((__m128i*)(d),     xmm_def);

	    d += 16;
	    w -= 16;
	}

	/* Tail: word, halfword, then byte stores for the remainder */
	while (w >= 4)
	{
	    *(uint32_t *)d = filler;

	    w -= 4;
	    d += 4;
	}

	if (w >= 2)
	{
	    *(uint16_t *)d = filler;
	    w -= 2;
	    d += 2;
	}

	if (w >= 1)
	{
	    *(uint8_t *)d = filler;
	    w -= 1;
	    d += 1;
	}
    }

    return TRUE;
}
3465
/*
 * SRC of a solid source through an a8 mask onto a 32-bit destination:
 * dest = src * mask (the destination is overwritten, and a zero mask
 * writes zero).  A zero source degenerates to filling the rectangle
 * with zeros via sse2_fill.
 */
static void
sse2_composite_src_n_8_8888 (pixman_implementation_t *imp,
                             pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t src, srca;
    uint32_t    *dst_line, *dst;
    uint8_t     *mask_line, *mask;
    int dst_stride, mask_stride;
    int32_t w;
    /* NOTE(review): shadowed by the uint8_t m in the head/tail loops;
     * only the 4-wide loop uses this one. */
    uint32_t m;

    __m128i xmm_src, xmm_def;
    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

    srca = src >> 24;
    if (src == 0)
    {
	/* SRC with zero source == clear the destination rectangle */
	sse2_fill (imp, dest_image->bits.bits, dest_image->bits.rowstride,
		   PIXMAN_FORMAT_BPP (dest_image->bits.format),
		   dest_x, dest_y, width, height, 0);
	return;
    }

    PIXMAN_IMAGE_GET_LINE (
	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (
	mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);

    xmm_def = create_mask_2x32_128 (src, src);
    xmm_src = expand_pixel_32_1x128 (src);

    while (height--)
    {
	dst = dst_line;
	dst_line += dst_stride;
	mask = mask_line;
	mask_line += mask_stride;
	w = width;

	/* Head: single pixels until dst is 16-byte aligned */
	while (w && (uintptr_t)dst & 15)
	{
	    uint8_t m = *mask++;

	    if (m)
	    {
		*dst = pack_1x128_32 (
		    pix_multiply_1x128 (xmm_src, expand_pixel_8_1x128 (m)));
	    }
	    else
	    {
		*dst = 0;
	    }

	    w--;
	    dst++;
	}

	/* Body: four pixels / four mask bytes per iteration */
	while (w >= 4)
	{
	    /* NOTE(review): 4 mask bytes read through a uint32_t cast;
	     * mask is only byte-aligned here -- relies on x86 tolerating
	     * unaligned loads. */
	    m = *((uint32_t*)mask);

	    /* opaque source + opaque mask: store the solid pixel */
	    if (srca == 0xff && m == 0xffffffff)
	    {
		save_128_aligned ((__m128i*)dst, xmm_def);
	    }
	    else if (m)
	    {
		xmm_mask = unpack_32_1x128 (m);
		xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());

		/* Unpacking */
		unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);

		expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
					&xmm_mask_lo, &xmm_mask_hi);

		pix_multiply_2x128 (&xmm_src, &xmm_src,
				    &xmm_mask_lo, &xmm_mask_hi,
				    &xmm_mask_lo, &xmm_mask_hi);

		save_128_aligned (
		    (__m128i*)dst, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));
	    }
	    else
	    {
		/* all four mask bytes zero: SRC writes zeros */
		save_128_aligned ((__m128i*)dst, _mm_setzero_si128 ());
	    }

	    w -= 4;
	    dst += 4;
	    mask += 4;
	}

	/* Tail: remaining 1-3 pixels */
	while (w)
	{
	    uint8_t m = *mask++;

	    if (m)
	    {
		*dst = pack_1x128_32 (
		    pix_multiply_1x128 (
			xmm_src, expand_pixel_8_1x128 (m)));
	    }
	    else
	    {
		*dst = 0;
	    }

	    w--;
	    dst++;
	}
    }

}
3583
/* OVER operator: solid source modulated by an a8 mask over an r5g6b5
 * destination.  Computes dest = (src IN mask) OVER dest.
 */
static void
sse2_composite_over_n_8_0565 (pixman_implementation_t *imp,
                              pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t src;
    uint16_t    *dst_line, *dst, d;
    uint8_t     *mask_line, *mask;
    int dst_stride, mask_stride;
    int32_t w;
    uint32_t m;
    __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;

    __m128i xmm_src, xmm_alpha;
    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
    __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

    /* OVER with a fully transparent source is a no-op. */
    if (src == 0)
	return;

    PIXMAN_IMAGE_GET_LINE (
	dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (
	mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);

    /* Expand the solid source and its alpha once, outside all loops. */
    xmm_src = expand_pixel_32_1x128 (src);
    xmm_alpha = expand_alpha_1x128 (xmm_src);
    mmx_src = xmm_src;
    mmx_alpha = xmm_alpha;

    while (height--)
    {
	dst = dst_line;
	dst_line += dst_stride;
	mask = mask_line;
	mask_line += mask_stride;
	w = width;

	/* Head: scalar pixels until dst is 16-byte aligned. */
	while (w && (uintptr_t)dst & 15)
	{
	    m = *mask++;

	    if (m)
	    {
		d = *dst;
		mmx_mask = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
		mmx_dest = expand565_16_1x128 (d);

		*dst = pack_565_32_16 (
		    pack_1x128_32 (
			in_over_1x128 (
			    &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
	    }

	    w--;
	    dst++;
	}

	/* Main loop: eight 565 pixels per iteration.  The destination
	 * vector is unpacked to four 16-bit-per-channel registers of
	 * two pixels each; the mask is consumed four bytes at a time. */
	while (w >= 8)
	{
	    xmm_dst = load_128_aligned ((__m128i*) dst);
	    unpack_565_128_4x128 (xmm_dst,
				  &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);

	    /* First four mask bytes drive xmm_dst0/xmm_dst1; a zero
	     * mask word leaves those destination pixels untouched. */
	    m = *((uint32_t*)mask);
	    mask += 4;

	    if (m)
	    {
		xmm_mask = unpack_32_1x128 (m);
		xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());

		/* Unpacking */
		unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);

		expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
					&xmm_mask_lo, &xmm_mask_hi);

		in_over_2x128 (&xmm_src, &xmm_src,
			       &xmm_alpha, &xmm_alpha,
			       &xmm_mask_lo, &xmm_mask_hi,
			       &xmm_dst0, &xmm_dst1);
	    }

	    /* Second four mask bytes drive xmm_dst2/xmm_dst3. */
	    m = *((uint32_t*)mask);
	    mask += 4;

	    if (m)
	    {
		xmm_mask = unpack_32_1x128 (m);
		xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());

		/* Unpacking */
		unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);

		expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
					&xmm_mask_lo, &xmm_mask_hi);
		in_over_2x128 (&xmm_src, &xmm_src,
			       &xmm_alpha, &xmm_alpha,
			       &xmm_mask_lo, &xmm_mask_hi,
			       &xmm_dst2, &xmm_dst3);
	    }

	    /* Repack the four 2-pixel registers to eight 565 pixels. */
	    save_128_aligned (
		(__m128i*)dst, pack_565_4x128_128 (
		    &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));

	    w -= 8;
	    dst += 8;
	}

	/* Tail: remaining (< 8) pixels, one at a time. */
	while (w)
	{
	    m = *mask++;

	    if (m)
	    {
		d = *dst;
		mmx_mask = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
		mmx_dest = expand565_16_1x128 (d);

		*dst = pack_565_32_16 (
		    pack_1x128_32 (
			in_over_1x128 (
			    &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
	    }

	    w--;
	    dst++;
	}
    }

}
3719
/* OVER operator for a "pixbuf" source onto an r5g6b5 destination.
 * NOTE(review): the source appears to be non-premultiplied with red
 * and blue swapped; the premultiply/swap happens inside
 * over_rev_non_pre_* / invert_colors_* -- confirm the exact channel
 * order against those helpers.
 */
static void
sse2_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
                                 pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint16_t    *dst_line, *dst, d;
    uint32_t    *src_line, *src, s;
    int dst_stride, src_stride;
    int32_t w;
    uint32_t opaque, zero;

    __m128i ms;
    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;

    PIXMAN_IMAGE_GET_LINE (
	dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (
	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

    while (height--)
    {
	dst = dst_line;
	dst_line += dst_stride;
	src = src_line;
	src_line += src_stride;
	w = width;

	/* Head: scalar pixels until dst is 16-byte aligned. */
	while (w && (uintptr_t)dst & 15)
	{
	    s = *src++;
	    d = *dst;

	    ms = unpack_32_1x128 (s);

	    *dst++ = pack_565_32_16 (
		pack_1x128_32 (
		    over_rev_non_pre_1x128 (ms, expand565_16_1x128 (d))));
	    w--;
	}

	/* Main loop: eight pixels per iteration, as two rounds of four
	 * source pixels, with fast paths when a whole round's alphas
	 * are all 0xff (opaque) or all 0 (transparent). */
	while (w >= 8)
	{
	    /* First round */
	    xmm_src = load_128_unaligned ((__m128i*)src);
	    xmm_dst = load_128_aligned  ((__m128i*)dst);

	    opaque = is_opaque (xmm_src);
	    zero = is_zero (xmm_src);

	    unpack_565_128_4x128 (xmm_dst,
				  &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
	    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);

	    /* preload next round: pixels 4-7 of this same iteration,
	     * so the read stays inside the current eight pixels. */
	    xmm_src = load_128_unaligned ((__m128i*)(src + 4));

	    if (opaque)
	    {
		/* All alphas 0xff: the blend reduces to the channel
		 * swap alone. */
		invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
				     &xmm_dst0, &xmm_dst1);
	    }
	    else if (!zero)
	    {
		over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
					&xmm_dst0, &xmm_dst1);
	    }

	    /* Second round */
	    opaque = is_opaque (xmm_src);
	    zero = is_zero (xmm_src);

	    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);

	    if (opaque)
	    {
		invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
				     &xmm_dst2, &xmm_dst3);
	    }
	    else if (!zero)
	    {
		over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
					&xmm_dst2, &xmm_dst3);
	    }

	    save_128_aligned (
		(__m128i*)dst, pack_565_4x128_128 (
		    &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));

	    w -= 8;
	    src += 8;
	    dst += 8;
	}

	/* Tail: remaining (< 8) pixels. */
	while (w)
	{
	    s = *src++;
	    d = *dst;

	    ms = unpack_32_1x128 (s);

	    *dst++ = pack_565_32_16 (
		pack_1x128_32 (
		    over_rev_non_pre_1x128 (ms, expand565_16_1x128 (d))));
	    w--;
	}
    }

}
3829
/* OVER operator for a "pixbuf" source onto an a8r8g8b8 destination.
 * NOTE(review): the source appears to be non-premultiplied with red
 * and blue swapped; see over_rev_non_pre_* / invert_colors_* for the
 * exact channel handling.
 */
static void
sse2_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
                                 pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t    *dst_line, *dst, d;
    uint32_t    *src_line, *src, s;
    int dst_stride, src_stride;
    int32_t w;
    uint32_t opaque, zero;

    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;

    PIXMAN_IMAGE_GET_LINE (
	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (
	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

    while (height--)
    {
	dst = dst_line;
	dst_line += dst_stride;
	src = src_line;
	src_line += src_stride;
	w = width;

	/* Head: scalar pixels until dst is 16-byte aligned. */
	while (w && (uintptr_t)dst & 15)
	{
	    s = *src++;
	    d = *dst;

	    *dst++ = pack_1x128_32 (
		over_rev_non_pre_1x128 (
		    unpack_32_1x128 (s), unpack_32_1x128 (d)));

	    w--;
	}

	/* Main loop: four pixels per iteration with fast paths for
	 * all-opaque and all-transparent source vectors. */
	while (w >= 4)
	{
	    xmm_src_hi = load_128_unaligned ((__m128i*)src);

	    opaque = is_opaque (xmm_src_hi);
	    zero = is_zero (xmm_src_hi);

	    unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);

	    if (opaque)
	    {
		/* All alphas 0xff: no blend needed, only the channel
		 * swap; the destination is not even loaded. */
		invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
				     &xmm_dst_lo, &xmm_dst_hi);

		save_128_aligned (
		    (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
	    }
	    else if (!zero)
	    {
		xmm_dst_hi = load_128_aligned  ((__m128i*)dst);

		unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

		over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
					&xmm_dst_lo, &xmm_dst_hi);

		save_128_aligned (
		    (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
	    }
	    /* (all-zero source: destination passes through untouched) */

	    w -= 4;
	    dst += 4;
	    src += 4;
	}

	/* Tail: remaining (< 4) pixels. */
	while (w)
	{
	    s = *src++;
	    d = *dst;

	    *dst++ = pack_1x128_32 (
		over_rev_non_pre_1x128 (
		    unpack_32_1x128 (s), unpack_32_1x128 (d)));

	    w--;
	}
    }

}
3918
3919static void
3920sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
3921                                    pixman_composite_info_t *info)
3922{
3923    PIXMAN_COMPOSITE_ARGS (info);
3924    uint32_t src;
3925    uint16_t    *dst_line, *dst, d;
3926    uint32_t    *mask_line, *mask, m;
3927    int dst_stride, mask_stride;
3928    int w;
3929    uint32_t pack_cmp;
3930
3931    __m128i xmm_src, xmm_alpha;
3932    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3933    __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
3934
3935    __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
3936
3937    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
3938
3939    if (src == 0)
3940	return;
3941
3942    PIXMAN_IMAGE_GET_LINE (
3943	dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
3944    PIXMAN_IMAGE_GET_LINE (
3945	mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
3946
3947    xmm_src = expand_pixel_32_1x128 (src);
3948    xmm_alpha = expand_alpha_1x128 (xmm_src);
3949    mmx_src = xmm_src;
3950    mmx_alpha = xmm_alpha;
3951
3952    while (height--)
3953    {
3954	w = width;
3955	mask = mask_line;
3956	dst = dst_line;
3957	mask_line += mask_stride;
3958	dst_line += dst_stride;
3959
3960	while (w && ((uintptr_t)dst & 15))
3961	{
3962	    m = *(uint32_t *) mask;
3963
3964	    if (m)
3965	    {
3966		d = *dst;
3967		mmx_mask = unpack_32_1x128 (m);
3968		mmx_dest = expand565_16_1x128 (d);
3969
3970		*dst = pack_565_32_16 (
3971		    pack_1x128_32 (
3972			in_over_1x128 (
3973			    &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
3974	    }
3975
3976	    w--;
3977	    dst++;
3978	    mask++;
3979	}
3980
3981	while (w >= 8)
3982	{
3983	    /* First round */
3984	    xmm_mask = load_128_unaligned ((__m128i*)mask);
3985	    xmm_dst = load_128_aligned ((__m128i*)dst);
3986
3987	    pack_cmp = _mm_movemask_epi8 (
3988		_mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
3989
3990	    unpack_565_128_4x128 (xmm_dst,
3991				  &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
3992	    unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3993
3994	    /* preload next round */
3995	    xmm_mask = load_128_unaligned ((__m128i*)(mask + 4));
3996
3997	    /* preload next round */
3998	    if (pack_cmp != 0xffff)
3999	    {
4000		in_over_2x128 (&xmm_src, &xmm_src,
4001			       &xmm_alpha, &xmm_alpha,
4002			       &xmm_mask_lo, &xmm_mask_hi,
4003			       &xmm_dst0, &xmm_dst1);
4004	    }
4005
4006	    /* Second round */
4007	    pack_cmp = _mm_movemask_epi8 (
4008		_mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
4009
4010	    unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4011
4012	    if (pack_cmp != 0xffff)
4013	    {
4014		in_over_2x128 (&xmm_src, &xmm_src,
4015			       &xmm_alpha, &xmm_alpha,
4016			       &xmm_mask_lo, &xmm_mask_hi,
4017			       &xmm_dst2, &xmm_dst3);
4018	    }
4019
4020	    save_128_aligned (
4021		(__m128i*)dst, pack_565_4x128_128 (
4022		    &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
4023
4024	    w -= 8;
4025	    dst += 8;
4026	    mask += 8;
4027	}
4028
4029	while (w)
4030	{
4031	    m = *(uint32_t *) mask;
4032
4033	    if (m)
4034	    {
4035		d = *dst;
4036		mmx_mask = unpack_32_1x128 (m);
4037		mmx_dest = expand565_16_1x128 (d);
4038
4039		*dst = pack_565_32_16 (
4040		    pack_1x128_32 (
4041			in_over_1x128 (
4042			    &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
4043	    }
4044
4045	    w--;
4046	    dst++;
4047	    mask++;
4048	}
4049    }
4050
4051}
4052
/* IN operator: dest = src.alpha * mask * dest, with an a8 mask and a8
 * destination.  Only the alpha channel of the solid source is used.
 */
static void
sse2_composite_in_n_8_8 (pixman_implementation_t *imp,
                         pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint8_t     *dst_line, *dst;
    uint8_t     *mask_line, *mask;
    int dst_stride, mask_stride;
    uint32_t d, m;
    uint32_t src;
    int32_t w;

    __m128i xmm_alpha;
    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;

    PIXMAN_IMAGE_GET_LINE (
	dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (
	mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

    /* Expand the source alpha to 16 bits per channel, once. */
    xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));

    while (height--)
    {
	dst = dst_line;
	dst_line += dst_stride;
	mask = mask_line;
	mask_line += mask_stride;
	w = width;

	/* Head: scalar bytes until dst is 16-byte aligned. */
	while (w && ((uintptr_t)dst & 15))
	{
	    m = (uint32_t) *mask++;
	    d = (uint32_t) *dst;

	    *dst++ = (uint8_t) pack_1x128_32 (
		pix_multiply_1x128 (
		    pix_multiply_1x128 (xmm_alpha,
				       unpack_32_1x128 (m)),
		    unpack_32_1x128 (d)));
	    w--;
	}

	/* Main loop: sixteen a8 pixels per iteration. */
	while (w >= 16)
	{
	    xmm_mask = load_128_unaligned ((__m128i*)mask);
	    xmm_dst = load_128_aligned ((__m128i*)dst);

	    unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);

	    /* mask = srca * mask */
	    pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
				&xmm_mask_lo, &xmm_mask_hi,
				&xmm_mask_lo, &xmm_mask_hi);

	    /* dest = (srca * mask) * dest */
	    pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
				&xmm_dst_lo, &xmm_dst_hi,
				&xmm_dst_lo, &xmm_dst_hi);

	    save_128_aligned (
		(__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

	    mask += 16;
	    dst += 16;
	    w -= 16;
	}

	/* Tail: remaining (< 16) pixels. */
	while (w)
	{
	    m = (uint32_t) *mask++;
	    d = (uint32_t) *dst;

	    *dst++ = (uint8_t) pack_1x128_32 (
		pix_multiply_1x128 (
		    pix_multiply_1x128 (
			xmm_alpha, unpack_32_1x128 (m)),
		    unpack_32_1x128 (d)));
	    w--;
	}
    }

}
4138
4139static void
4140sse2_composite_in_n_8 (pixman_implementation_t *imp,
4141		       pixman_composite_info_t *info)
4142{
4143    PIXMAN_COMPOSITE_ARGS (info);
4144    uint8_t     *dst_line, *dst;
4145    int dst_stride;
4146    uint32_t d;
4147    uint32_t src;
4148    int32_t w;
4149
4150    __m128i xmm_alpha;
4151    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4152
4153    PIXMAN_IMAGE_GET_LINE (
4154	dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4155
4156    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
4157
4158    xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
4159
4160    src = src >> 24;
4161
4162    if (src == 0xff)
4163	return;
4164
4165    if (src == 0x00)
4166    {
4167	pixman_fill (dest_image->bits.bits, dest_image->bits.rowstride,
4168		     8, dest_x, dest_y, width, height, src);
4169
4170	return;
4171    }
4172
4173    while (height--)
4174    {
4175	dst = dst_line;
4176	dst_line += dst_stride;
4177	w = width;
4178
4179	while (w && ((uintptr_t)dst & 15))
4180	{
4181	    d = (uint32_t) *dst;
4182
4183	    *dst++ = (uint8_t) pack_1x128_32 (
4184		pix_multiply_1x128 (
4185		    xmm_alpha,
4186		    unpack_32_1x128 (d)));
4187	    w--;
4188	}
4189
4190	while (w >= 16)
4191	{
4192	    xmm_dst = load_128_aligned ((__m128i*)dst);
4193
4194	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4195
4196	    pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
4197				&xmm_dst_lo, &xmm_dst_hi,
4198				&xmm_dst_lo, &xmm_dst_hi);
4199
4200	    save_128_aligned (
4201		(__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4202
4203	    dst += 16;
4204	    w -= 16;
4205	}
4206
4207	while (w)
4208	{
4209	    d = (uint32_t) *dst;
4210
4211	    *dst++ = (uint8_t) pack_1x128_32 (
4212		pix_multiply_1x128 (
4213		    xmm_alpha,
4214		    unpack_32_1x128 (d)));
4215	    w--;
4216	}
4217    }
4218
4219}
4220
/* IN operator for two a8 images: dest = src * dest. */
static void
sse2_composite_in_8_8 (pixman_implementation_t *imp,
                       pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint8_t     *dst_line, *dst;
    uint8_t     *src_line, *src;
    int src_stride, dst_stride;
    int32_t w;
    uint32_t s, d;

    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;

    PIXMAN_IMAGE_GET_LINE (
	dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (
	src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);

    while (height--)
    {
	dst = dst_line;
	dst_line += dst_stride;
	src = src_line;
	src_line += src_stride;
	w = width;

	/* Head: scalar bytes until dst is 16-byte aligned (source is
	 * loaded unaligned in the main loop). */
	while (w && ((uintptr_t)dst & 15))
	{
	    s = (uint32_t) *src++;
	    d = (uint32_t) *dst;

	    *dst++ = (uint8_t) pack_1x128_32 (
		pix_multiply_1x128 (
		    unpack_32_1x128 (s), unpack_32_1x128 (d)));
	    w--;
	}

	/* Main loop: sixteen a8 pixels per iteration. */
	while (w >= 16)
	{
	    xmm_src = load_128_unaligned ((__m128i*)src);
	    xmm_dst = load_128_aligned ((__m128i*)dst);

	    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);

	    pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
				&xmm_dst_lo, &xmm_dst_hi,
				&xmm_dst_lo, &xmm_dst_hi);

	    save_128_aligned (
		(__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

	    src += 16;
	    dst += 16;
	    w -= 16;
	}

	/* Tail: remaining (< 16) pixels. */
	while (w)
	{
	    s = (uint32_t) *src++;
	    d = (uint32_t) *dst;

	    *dst++ = (uint8_t) pack_1x128_32 (
		pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (d)));
	    w--;
	}
    }

}
4291
/* ADD operator: solid source with an a8 mask onto an a8 destination.
 * dest = clamp(dest + src.alpha * mask), 8-bit saturating.
 */
static void
sse2_composite_add_n_8_8 (pixman_implementation_t *imp,
			  pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint8_t     *dst_line, *dst;
    uint8_t     *mask_line, *mask;
    int dst_stride, mask_stride;
    int32_t w;
    uint32_t src;
    uint32_t m, d;

    __m128i xmm_alpha;
    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;

    PIXMAN_IMAGE_GET_LINE (
	dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (
	mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

    /* Expand the source alpha to 16 bits per channel, once. */
    xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));

    while (height--)
    {
	dst = dst_line;
	dst_line += dst_stride;
	mask = mask_line;
	mask_line += mask_stride;
	w = width;

	/* Head: scalar bytes until dst is 16-byte aligned. */
	while (w && ((uintptr_t)dst & 15))
	{
	    m = (uint32_t) *mask++;
	    d = (uint32_t) *dst;

	    *dst++ = (uint8_t) pack_1x128_32 (
		_mm_adds_epu16 (
		    pix_multiply_1x128 (
			xmm_alpha, unpack_32_1x128 (m)),
		    unpack_32_1x128 (d)));
	    w--;
	}

	/* Main loop: sixteen a8 pixels per iteration, with a
	 * saturating 16-bit add in the unpacked domain. */
	while (w >= 16)
	{
	    xmm_mask = load_128_unaligned ((__m128i*)mask);
	    xmm_dst = load_128_aligned ((__m128i*)dst);

	    unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);

	    /* mask = srca * mask */
	    pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
				&xmm_mask_lo, &xmm_mask_hi,
				&xmm_mask_lo, &xmm_mask_hi);

	    xmm_dst_lo = _mm_adds_epu16 (xmm_mask_lo, xmm_dst_lo);
	    xmm_dst_hi = _mm_adds_epu16 (xmm_mask_hi, xmm_dst_hi);

	    save_128_aligned (
		(__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

	    mask += 16;
	    dst += 16;
	    w -= 16;
	}

	/* Tail: remaining (< 16) pixels. */
	while (w)
	{
	    m = (uint32_t) *mask++;
	    d = (uint32_t) *dst;

	    *dst++ = (uint8_t) pack_1x128_32 (
		_mm_adds_epu16 (
		    pix_multiply_1x128 (
			xmm_alpha, unpack_32_1x128 (m)),
		    unpack_32_1x128 (d)));

	    w--;
	}
    }

}
4377
/* ADD operator: solid source, no mask, a8 destination.
 * dest = clamp(dest + src.alpha).  Fast paths: alpha 0 is a no-op;
 * alpha 0xff saturates every pixel, i.e. a fill with 0xff.
 */
static void
sse2_composite_add_n_8 (pixman_implementation_t *imp,
			pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint8_t     *dst_line, *dst;
    int dst_stride;
    int32_t w;
    uint32_t src;

    __m128i xmm_src;

    PIXMAN_IMAGE_GET_LINE (
	dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

    src >>= 24;

    if (src == 0x00)
	return;

    if (src == 0xff)
    {
	pixman_fill (dest_image->bits.bits, dest_image->bits.rowstride,
		     8, dest_x, dest_y, width, height, 0xff);

	return;
    }

    /* Replicate the alpha byte into every byte of a 128-bit register. */
    src = (src << 24) | (src << 16) | (src << 8) | src;
    xmm_src = _mm_set_epi32 (src, src, src, src);

    while (height--)
    {
	dst = dst_line;
	dst_line += dst_stride;
	w = width;

	/* Head: scalar bytes until dst is 16-byte aligned.  The
	 * saturating add works on the whole register, but only the low
	 * byte of the result is stored. */
	while (w && ((uintptr_t)dst & 15))
	{
	    *dst = (uint8_t)_mm_cvtsi128_si32 (
		_mm_adds_epu8 (
		    xmm_src,
		    _mm_cvtsi32_si128 (*dst)));

	    w--;
	    dst++;
	}

	/* Main loop: sixteen bytes per saturating vector add. */
	while (w >= 16)
	{
	    save_128_aligned (
		(__m128i*)dst, _mm_adds_epu8 (xmm_src, load_128_aligned  ((__m128i*)dst)));

	    dst += 16;
	    w -= 16;
	}

	/* Tail: remaining (< 16) bytes. */
	while (w)
	{
	    *dst = (uint8_t)_mm_cvtsi128_si32 (
		_mm_adds_epu8 (
		    xmm_src,
		    _mm_cvtsi32_si128 (*dst)));

	    w--;
	    dst++;
	}
    }

}
4450
4451static void
4452sse2_composite_add_8_8 (pixman_implementation_t *imp,
4453			pixman_composite_info_t *info)
4454{
4455    PIXMAN_COMPOSITE_ARGS (info);
4456    uint8_t     *dst_line, *dst;
4457    uint8_t     *src_line, *src;
4458    int dst_stride, src_stride;
4459    int32_t w;
4460    uint16_t t;
4461
4462    PIXMAN_IMAGE_GET_LINE (
4463	src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
4464    PIXMAN_IMAGE_GET_LINE (
4465	dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4466
4467    while (height--)
4468    {
4469	dst = dst_line;
4470	src = src_line;
4471
4472	dst_line += dst_stride;
4473	src_line += src_stride;
4474	w = width;
4475
4476	/* Small head */
4477	while (w && (uintptr_t)dst & 3)
4478	{
4479	    t = (*dst) + (*src++);
4480	    *dst++ = t | (0 - (t >> 8));
4481	    w--;
4482	}
4483
4484	sse2_combine_add_u (imp, op,
4485			    (uint32_t*)dst, (uint32_t*)src, NULL, w >> 2);
4486
4487	/* Small tail */
4488	dst += w & 0xfffc;
4489	src += w & 0xfffc;
4490
4491	w &= 3;
4492
4493	while (w)
4494	{
4495	    t = (*dst) + (*src++);
4496	    *dst++ = t | (0 - (t >> 8));
4497	    w--;
4498	}
4499    }
4500
4501}
4502
4503static void
4504sse2_composite_add_8888_8888 (pixman_implementation_t *imp,
4505                              pixman_composite_info_t *info)
4506{
4507    PIXMAN_COMPOSITE_ARGS (info);
4508    uint32_t    *dst_line, *dst;
4509    uint32_t    *src_line, *src;
4510    int dst_stride, src_stride;
4511
4512    PIXMAN_IMAGE_GET_LINE (
4513	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
4514    PIXMAN_IMAGE_GET_LINE (
4515	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
4516
4517    while (height--)
4518    {
4519	dst = dst_line;
4520	dst_line += dst_stride;
4521	src = src_line;
4522	src_line += src_stride;
4523
4524	sse2_combine_add_u (imp, op, dst, src, NULL, width);
4525    }
4526}
4527
/* ADD operator: solid source, no mask, 8888 destination.
 * dest = clamp(dest + src) per channel.  Fast paths: a zero source is
 * a no-op; an all-ones source saturates everything (plain fill).
 */
static void
sse2_composite_add_n_8888 (pixman_implementation_t *imp,
			   pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t *dst_line, *dst, src;
    int dst_stride;

    __m128i xmm_src;

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
    if (src == 0)
	return;

    if (src == ~0)
    {
	pixman_fill (dest_image->bits.bits, dest_image->bits.rowstride, 32,
		     dest_x, dest_y, width, height, ~0);

	return;
    }

    /* Replicate the source pixel into all four lanes. */
    xmm_src = _mm_set_epi32 (src, src, src, src);
    while (height--)
    {
	int w = width;
	uint32_t d;

	dst = dst_line;
	dst_line += dst_stride;

	/* Head: scalar pixels until dst is 16-byte aligned. */
	while (w && (uintptr_t)dst & 15)
	{
	    d = *dst;
	    *dst++ =
		_mm_cvtsi128_si32 ( _mm_adds_epu8 (xmm_src, _mm_cvtsi32_si128 (d)));
	    w--;
	}

	/* Main loop: four pixels per saturating vector add. */
	while (w >= 4)
	{
	    save_128_aligned
		((__m128i*)dst,
		 _mm_adds_epu8 (xmm_src, load_128_aligned ((__m128i*)dst)));

	    dst += 4;
	    w -= 4;
	}

	/* Tail: remaining (< 4) pixels. */
	while (w--)
	{
	    d = *dst;
	    *dst++ =
		_mm_cvtsi128_si32 (_mm_adds_epu8 (xmm_src,
						  _mm_cvtsi32_si128 (d)));
	}
    }
}
4588
/* ADD operator: solid source with an a8 mask onto an 8888 destination.
 * dest = clamp(dest + src * mask) per channel.
 */
static void
sse2_composite_add_n_8_8888 (pixman_implementation_t *imp,
			     pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t     *dst_line, *dst;
    uint8_t     *mask_line, *mask;
    int dst_stride, mask_stride;
    int32_t w;
    uint32_t src;

    __m128i xmm_src;

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
    /* ADD with a zero source is a no-op. */
    if (src == 0)
	return;
    xmm_src = expand_pixel_32_1x128 (src);

    PIXMAN_IMAGE_GET_LINE (
	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (
	mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);

    while (height--)
    {
	dst = dst_line;
	dst_line += dst_stride;
	mask = mask_line;
	mask_line += mask_stride;
	w = width;

	/* Head: scalar pixels until dst is 16-byte aligned; a zero
	 * mask byte leaves the destination pixel unchanged. */
	while (w && ((uintptr_t)dst & 15))
	{
	    uint8_t m = *mask++;
	    if (m)
	    {
		*dst = pack_1x128_32
		    (_mm_adds_epu16
		     (pix_multiply_1x128 (xmm_src, expand_pixel_8_1x128 (m)),
		      unpack_32_1x128 (*dst)));
	    }
	    dst++;
	    w--;
	}

	/* Main loop: four pixels per iteration; four mask bytes are
	 * fetched with one 32-bit load, and an all-zero word skips the
	 * whole vector. */
	while (w >= 4)
	{
	    uint32_t m = *(uint32_t*)mask;
	    if (m)
	    {
		__m128i xmm_mask_lo, xmm_mask_hi;
		__m128i xmm_dst_lo, xmm_dst_hi;

		__m128i xmm_dst = load_128_aligned ((__m128i*)dst);
		__m128i xmm_mask =
		    _mm_unpacklo_epi8 (unpack_32_1x128(m),
				       _mm_setzero_si128 ());

		unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
		unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);

		/* Replicate each mask byte across its pixel's four
		 * channels. */
		expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
					&xmm_mask_lo, &xmm_mask_hi);

		pix_multiply_2x128 (&xmm_src, &xmm_src,
				    &xmm_mask_lo, &xmm_mask_hi,
				    &xmm_mask_lo, &xmm_mask_hi);

		xmm_dst_lo = _mm_adds_epu16 (xmm_mask_lo, xmm_dst_lo);
		xmm_dst_hi = _mm_adds_epu16 (xmm_mask_hi, xmm_dst_hi);

		save_128_aligned (
		    (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
	    }

	    w -= 4;
	    dst += 4;
	    mask += 4;
	}

	/* Tail: remaining (< 4) pixels. */
	while (w)
	{
	    uint8_t m = *mask++;
	    if (m)
	    {
		*dst = pack_1x128_32
		    (_mm_adds_epu16
		     (pix_multiply_1x128 (xmm_src, expand_pixel_8_1x128 (m)),
		      unpack_32_1x128 (*dst)));
	    }
	    dst++;
	    w--;
	}
    }
}
4684
/* Rectangle copy between two images of equal bpp.  Only 16 and 32 bpp
 * are handled; anything else returns FALSE so the caller can fall
 * back.  Strides arrive in uint32_t units and are converted to bytes.
 * byte_width is always even (bpp >= 16), so no 1-byte tail is needed.
 */
static pixman_bool_t
sse2_blt (pixman_implementation_t *imp,
          uint32_t *               src_bits,
          uint32_t *               dst_bits,
          int                      src_stride,
          int                      dst_stride,
          int                      src_bpp,
          int                      dst_bpp,
          int                      src_x,
          int                      src_y,
          int                      dest_x,
          int                      dest_y,
          int                      width,
          int                      height)
{
    uint8_t *   src_bytes;
    uint8_t *   dst_bytes;
    int byte_width;

    if (src_bpp != dst_bpp)
	return FALSE;

    if (src_bpp == 16)
    {
	/* Convert strides to uint16_t units to locate the corner
	 * pixel, then to bytes for the copy loops. */
	src_stride = src_stride * (int) sizeof (uint32_t) / 2;
	dst_stride = dst_stride * (int) sizeof (uint32_t) / 2;
	src_bytes =(uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x));
	dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dest_y) + (dest_x));
	byte_width = 2 * width;
	src_stride *= 2;
	dst_stride *= 2;
    }
    else if (src_bpp == 32)
    {
	src_stride = src_stride * (int) sizeof (uint32_t) / 4;
	dst_stride = dst_stride * (int) sizeof (uint32_t) / 4;
	src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x));
	dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dest_y) + (dest_x));
	byte_width = 4 * width;
	src_stride *= 4;
	dst_stride *= 4;
    }
    else
    {
	return FALSE;
    }

    while (height--)
    {
	int w;
	uint8_t *s = src_bytes;
	uint8_t *d = dst_bytes;
	src_bytes += src_stride;
	dst_bytes += dst_stride;
	w = byte_width;

	/* Align the destination to 4, then to 16 bytes; the source is
	 * read unaligned throughout. */
	while (w >= 2 && ((uintptr_t)d & 3))
	{
	    *(uint16_t *)d = *(uint16_t *)s;
	    w -= 2;
	    s += 2;
	    d += 2;
	}

	while (w >= 4 && ((uintptr_t)d & 15))
	{
	    *(uint32_t *)d = *(uint32_t *)s;

	    w -= 4;
	    s += 4;
	    d += 4;
	}

	/* Main loop: 64 bytes (four vectors) per iteration. */
	while (w >= 64)
	{
	    __m128i xmm0, xmm1, xmm2, xmm3;

	    xmm0 = load_128_unaligned ((__m128i*)(s));
	    xmm1 = load_128_unaligned ((__m128i*)(s + 16));
	    xmm2 = load_128_unaligned ((__m128i*)(s + 32));
	    xmm3 = load_128_unaligned ((__m128i*)(s + 48));

	    save_128_aligned ((__m128i*)(d),    xmm0);
	    save_128_aligned ((__m128i*)(d + 16), xmm1);
	    save_128_aligned ((__m128i*)(d + 32), xmm2);
	    save_128_aligned ((__m128i*)(d + 48), xmm3);

	    s += 64;
	    d += 64;
	    w -= 64;
	}

	/* Tail: one vector, then one word, then one halfword. */
	while (w >= 16)
	{
	    save_128_aligned ((__m128i*)d, load_128_unaligned ((__m128i*)s) );

	    w -= 16;
	    d += 16;
	    s += 16;
	}

	while (w >= 4)
	{
	    *(uint32_t *)d = *(uint32_t *)s;

	    w -= 4;
	    s += 4;
	    d += 4;
	}

	if (w >= 2)
	{
	    *(uint16_t *)d = *(uint16_t *)s;
	    w -= 2;
	    s += 2;
	    d += 2;
	}
    }

    return TRUE;
}
4806
4807static void
4808sse2_composite_copy_area (pixman_implementation_t *imp,
4809                          pixman_composite_info_t *info)
4810{
4811    PIXMAN_COMPOSITE_ARGS (info);
4812    sse2_blt (imp, src_image->bits.bits,
4813	      dest_image->bits.bits,
4814	      src_image->bits.rowstride,
4815	      dest_image->bits.rowstride,
4816	      PIXMAN_FORMAT_BPP (src_image->bits.format),
4817	      PIXMAN_FORMAT_BPP (dest_image->bits.format),
4818	      src_x, src_y, dest_x, dest_y, width, height);
4819}
4820
/* OVER composite of an x8r8g8b8 source through an a8 mask onto an 8888
 * destination.  The source's unused alpha byte is forced to 0xff, so
 * every source pixel is treated as fully opaque and the blend is
 * controlled by the mask value alone.
 */
static void
sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp,
                                 pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t    *src, *src_line, s;
    uint32_t    *dst, *dst_line, d;
    uint8_t         *mask, *mask_line;
    uint32_t m;
    int src_stride, mask_stride, dst_stride;
    int32_t w;
    __m128i ms;

    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;

    PIXMAN_IMAGE_GET_LINE (
	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (
	mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
    PIXMAN_IMAGE_GET_LINE (
	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

    while (height--)
    {
        src = src_line;
        src_line += src_stride;
        dst = dst_line;
        dst_line += dst_stride;
        mask = mask_line;
        mask_line += mask_stride;

        w = width;

        /* Process single pixels until dst is 16-byte aligned. */
        while (w && (uintptr_t)dst & 15)
        {
            s = 0xff000000 | *src++;  /* treat x8r8g8b8 as opaque */
            m = (uint32_t) *mask++;
            d = *dst;
            ms = unpack_32_1x128 (s);

            if (m != 0xff)
            {
		__m128i ma = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
		__m128i md = unpack_32_1x128 (d);

                ms = in_over_1x128 (&ms, &mask_00ff, &ma, &md);
            }

            *dst++ = pack_1x128_32 (ms);
            w--;
        }

        /* Main loop: 4 pixels at a time with aligned destination stores. */
        while (w >= 4)
        {
            m = *(uint32_t*) mask;
            xmm_src = _mm_or_si128 (
		load_128_unaligned ((__m128i*)src), mask_ff000000);

            if (m == 0xffffffff)
            {
                /* Mask fully opaque: the (made-opaque) source replaces dest. */
                save_128_aligned ((__m128i*)dst, xmm_src);
            }
            else
            {
                xmm_dst = load_128_aligned ((__m128i*)dst);

                xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());

                unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
                unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
                unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);

                expand_alpha_rev_2x128 (
		    xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

                /* src alpha is 0xff everywhere, so pass mask_00ff for it. */
                in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
			       &mask_00ff, &mask_00ff, &xmm_mask_lo, &xmm_mask_hi,
			       &xmm_dst_lo, &xmm_dst_hi);

                save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
            }

            src += 4;
            dst += 4;
            mask += 4;
            w -= 4;
        }

        /* Tail: remaining pixels; zero-mask pixels are skipped entirely. */
        while (w)
        {
            m = (uint32_t) *mask++;

            if (m)
            {
                s = 0xff000000 | *src;

                if (m == 0xff)
                {
                    *dst = s;
                }
                else
                {
		    __m128i ma, md, ms;

                    d = *dst;

		    ma = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
		    md = unpack_32_1x128 (d);
		    ms = unpack_32_1x128 (s);

                    *dst = pack_1x128_32 (in_over_1x128 (&ms, &mask_00ff, &ma, &md));
                }

            }

            src++;
            dst++;
            w--;
        }
    }

}
4945
/* OVER composite of an a8r8g8b8 source through an a8 mask onto an 8888
 * destination: dest = (src IN mask) OVER dest.  Fast paths: mask == 0
 * skips the pixel; source alpha and mask both 0xff copies the source.
 */
static void
sse2_composite_over_8888_8_8888 (pixman_implementation_t *imp,
                                 pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t    *src, *src_line, s;
    uint32_t    *dst, *dst_line, d;
    uint8_t         *mask, *mask_line;
    uint32_t m;
    int src_stride, mask_stride, dst_stride;
    int32_t w;

    __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;

    PIXMAN_IMAGE_GET_LINE (
	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (
	mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
    PIXMAN_IMAGE_GET_LINE (
	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

    while (height--)
    {
        src = src_line;
        src_line += src_stride;
        dst = dst_line;
        dst_line += dst_stride;
        mask = mask_line;
        mask_line += mask_stride;

        w = width;

        /* Handle leading pixels one at a time until dst is 16-byte aligned. */
        while (w && (uintptr_t)dst & 15)
        {
	    uint32_t sa;

            s = *src++;
            m = (uint32_t) *mask++;
            d = *dst;

	    sa = s >> 24;

	    if (m)
	    {
		if (sa == 0xff && m == 0xff)
		{
		    /* Fully opaque source and mask: plain copy. */
		    *dst = s;
		}
		else
		{
		    __m128i ms, md, ma, msa;

		    ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
		    ms = unpack_32_1x128 (s);
		    md = unpack_32_1x128 (d);

		    msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));

		    *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
		}
	    }

	    dst++;
            w--;
        }

        /* Main loop: 4 pixels per iteration with aligned destination stores. */
        while (w >= 4)
        {
            m = *(uint32_t *) mask;

	    if (m)
	    {
		xmm_src = load_128_unaligned ((__m128i*)src);

		if (m == 0xffffffff && is_opaque (xmm_src))
		{
		    save_128_aligned ((__m128i *)dst, xmm_src);
		}
		else
		{
		    xmm_dst = load_128_aligned ((__m128i *)dst);

		    xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());

		    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
		    unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
		    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);

		    expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
		    expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

		    in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
				   &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);

		    save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
		}
	    }

            src += 4;
            dst += 4;
            mask += 4;
            w -= 4;
        }

        /* Tail: remaining (< 4) pixels, same per-pixel logic as the head. */
        while (w)
        {
	    uint32_t sa;

            s = *src++;
            m = (uint32_t) *mask++;
            d = *dst;

	    sa = s >> 24;

	    if (m)
	    {
		if (sa == 0xff && m == 0xff)
		{
		    *dst = s;
		}
		else
		{
		    __m128i ms, md, ma, msa;

		    ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
		    ms = unpack_32_1x128 (s);
		    md = unpack_32_1x128 (d);

		    msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));

		    *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
		}
	    }

	    dst++;
            w--;
        }
    }

}
5088
/* OVER_REVERSE composite of a solid source onto an 8888 destination:
 * dest = dest OVER src.  A zero (fully transparent) source cannot
 * change the result, so that case returns immediately.
 */
static void
sse2_composite_over_reverse_n_8888 (pixman_implementation_t *imp,
				    pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t src;
    uint32_t    *dst_line, *dst;
    __m128i xmm_src;
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_dsta_hi, xmm_dsta_lo;
    int dst_stride;
    int32_t w;

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

    if (src == 0)
	return;

    PIXMAN_IMAGE_GET_LINE (
	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);

    xmm_src = expand_pixel_32_1x128 (src);

    while (height--)
    {
	dst = dst_line;

	dst_line += dst_stride;
	w = width;

	/* Single pixels until dst is 16-byte aligned. */
	while (w && (uintptr_t)dst & 15)
	{
	    __m128i vd;

	    vd = unpack_32_1x128 (*dst);

	    /* Note operand order: destination is composited OVER the source. */
	    *dst = pack_1x128_32 (over_1x128 (vd, expand_alpha_1x128 (vd),
					      xmm_src));
	    w--;
	    dst++;
	}

	/* Main loop: 4 pixels per iteration. */
	while (w >= 4)
	{
	    __m128i tmp_lo, tmp_hi;

	    xmm_dst = load_128_aligned ((__m128i*)dst);

	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
	    expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dsta_lo, &xmm_dsta_hi);

	    /* over_2x128 writes its result into the third operand, so work
	     * on copies of the (constant) solid source. */
	    tmp_lo = xmm_src;
	    tmp_hi = xmm_src;

	    over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
			&xmm_dsta_lo, &xmm_dsta_hi,
			&tmp_lo, &tmp_hi);

	    save_128_aligned (
		(__m128i*)dst, pack_2x128_128 (tmp_lo, tmp_hi));

	    w -= 4;
	    dst += 4;
	}

	/* Tail: remaining (< 4) pixels. */
	while (w)
	{
	    __m128i vd;

	    vd = unpack_32_1x128 (*dst);

	    *dst = pack_1x128_32 (over_1x128 (vd, expand_alpha_1x128 (vd),
					      xmm_src));
	    w--;
	    dst++;
	}

    }

}
5169
/* OVER composite of an a8r8g8b8 source with an a8r8g8b8 mask onto an
 * 8888 destination.  Only the alpha channel of the mask participates
 * (the scalar paths use mask >> 24; the SIMD path expands mask alpha).
 */
static void
sse2_composite_over_8888_8888_8888 (pixman_implementation_t *imp,
				    pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t    *src, *src_line, s;
    uint32_t    *dst, *dst_line, d;
    uint32_t    *mask, *mask_line;
    uint32_t    m;
    int src_stride, mask_stride, dst_stride;
    int32_t w;

    __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;

    PIXMAN_IMAGE_GET_LINE (
	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (
	mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
    PIXMAN_IMAGE_GET_LINE (
	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

    while (height--)
    {
        src = src_line;
        src_line += src_stride;
        dst = dst_line;
        dst_line += dst_stride;
        mask = mask_line;
        mask_line += mask_stride;

        w = width;

        /* Handle leading pixels one at a time until dst is 16-byte aligned. */
        while (w && (uintptr_t)dst & 15)
        {
	    uint32_t sa;

            s = *src++;
            m = (*mask++) >> 24;  /* mask alpha only */
            d = *dst;

	    sa = s >> 24;

	    if (m)
	    {
		if (sa == 0xff && m == 0xff)
		{
		    /* Fully opaque source and mask: plain copy. */
		    *dst = s;
		}
		else
		{
		    __m128i ms, md, ma, msa;

		    ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
		    ms = unpack_32_1x128 (s);
		    md = unpack_32_1x128 (d);

		    msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));

		    *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
		}
	    }

	    dst++;
            w--;
        }

        /* Main loop: 4 pixels per iteration with aligned destination stores. */
        while (w >= 4)
        {
	    xmm_mask = load_128_unaligned ((__m128i*)mask);

	    if (!is_transparent (xmm_mask))
	    {
		xmm_src = load_128_unaligned ((__m128i*)src);

		if (is_opaque (xmm_mask) && is_opaque (xmm_src))
		{
		    save_128_aligned ((__m128i *)dst, xmm_src);
		}
		else
		{
		    xmm_dst = load_128_aligned ((__m128i *)dst);

		    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
		    unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
		    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);

		    expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
		    expand_alpha_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

		    in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
				   &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);

		    save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
		}
	    }

            src += 4;
            dst += 4;
            mask += 4;
            w -= 4;
        }

        /* Tail: remaining (< 4) pixels, same per-pixel logic as the head. */
        while (w)
        {
	    uint32_t sa;

            s = *src++;
            m = (*mask++) >> 24;
            d = *dst;

	    sa = s >> 24;

	    if (m)
	    {
		if (sa == 0xff && m == 0xff)
		{
		    *dst = s;
		}
		else
		{
		    __m128i ms, md, ma, msa;

		    ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
		    ms = unpack_32_1x128 (s);
		    md = unpack_32_1x128 (d);

		    msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));

		    *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
		}
	    }

	    dst++;
            w--;
        }
    }

}
5310
/* A variant of 'sse2_combine_over_u' with minor tweaks: the source is
 * fetched with nearest-neighbour scaling.  vx is the fixed-point source
 * x coordinate, advanced by unit_x per destination pixel; the
 * "while (vx >= 0) vx -= src_width_fixed" loops wrap it back into the
 * source for NORMAL repeat (vx is kept negative by the caller). */
static force_inline void
scaled_nearest_scanline_sse2_8888_8888_OVER (uint32_t*       pd,
                                             const uint32_t* ps,
                                             int32_t         w,
                                             pixman_fixed_t  vx,
                                             pixman_fixed_t  unit_x,
                                             pixman_fixed_t  src_width_fixed,
                                             pixman_bool_t   fully_transparent_src)
{
    uint32_t s, d;
    const uint32_t* pm = NULL;

    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_alpha_lo, xmm_alpha_hi;

    /* A fully transparent source leaves the destination unchanged. */
    if (fully_transparent_src)
	return;

    /* Align dst on a 16-byte boundary */
    while (w && ((uintptr_t)pd & 15))
    {
	d = *pd;
	s = combine1 (ps + pixman_fixed_to_int (vx), pm);
	vx += unit_x;
	while (vx >= 0)
	    vx -= src_width_fixed;

	*pd++ = core_combine_over_u_pixel_sse2 (s, d);
	if (pm)
	    pm++;
	w--;
    }

    /* Main loop: gather 4 nearest source pixels, then OVER them in SIMD. */
    while (w >= 4)
    {
	__m128i tmp;
	uint32_t tmp1, tmp2, tmp3, tmp4;

	tmp1 = *(ps + pixman_fixed_to_int (vx));
	vx += unit_x;
	while (vx >= 0)
	    vx -= src_width_fixed;
	tmp2 = *(ps + pixman_fixed_to_int (vx));
	vx += unit_x;
	while (vx >= 0)
	    vx -= src_width_fixed;
	tmp3 = *(ps + pixman_fixed_to_int (vx));
	vx += unit_x;
	while (vx >= 0)
	    vx -= src_width_fixed;
	tmp4 = *(ps + pixman_fixed_to_int (vx));
	vx += unit_x;
	while (vx >= 0)
	    vx -= src_width_fixed;

	tmp = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1);

	xmm_src_hi = combine4 ((__m128i*)&tmp, (__m128i*)pm);

	if (is_opaque (xmm_src_hi))
	{
	    /* All four sources opaque: straight copy. */
	    save_128_aligned ((__m128i*)pd, xmm_src_hi);
	}
	else if (!is_zero (xmm_src_hi))
	{
	    xmm_dst_hi = load_128_aligned ((__m128i*) pd);

	    unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
	    unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

	    expand_alpha_2x128 (
		xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi);

	    over_2x128 (&xmm_src_lo, &xmm_src_hi,
			&xmm_alpha_lo, &xmm_alpha_hi,
			&xmm_dst_lo, &xmm_dst_hi);

	    /* rebuild the 4 pixel data and save */
	    save_128_aligned ((__m128i*)pd,
			      pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
	}

	w -= 4;
	pd += 4;
	if (pm)
	    pm += 4;
    }

    /* Tail: remaining (< 4) pixels. */
    while (w)
    {
	d = *pd;
	s = combine1 (ps + pixman_fixed_to_int (vx), pm);
	vx += unit_x;
	while (vx >= 0)
	    vx -= src_width_fixed;

	*pd++ = core_combine_over_u_pixel_sse2 (s, d);
	if (pm)
	    pm++;

	w--;
    }
}
5416
/* Instantiate nearest-scaled 8888-over-8888 fast paths, one per repeat mode. */
FAST_NEAREST_MAINLOOP (sse2_8888_8888_cover_OVER,
		       scaled_nearest_scanline_sse2_8888_8888_OVER,
		       uint32_t, uint32_t, COVER)
FAST_NEAREST_MAINLOOP (sse2_8888_8888_none_OVER,
		       scaled_nearest_scanline_sse2_8888_8888_OVER,
		       uint32_t, uint32_t, NONE)
FAST_NEAREST_MAINLOOP (sse2_8888_8888_pad_OVER,
		       scaled_nearest_scanline_sse2_8888_8888_OVER,
		       uint32_t, uint32_t, PAD)
FAST_NEAREST_MAINLOOP (sse2_8888_8888_normal_OVER,
		       scaled_nearest_scanline_sse2_8888_8888_OVER,
		       uint32_t, uint32_t, NORMAL)
5429
/* Nearest-scaled OVER scanline with a solid mask:
 * dest = (scaled src IN mask_alpha) OVER dest.  Only the alpha byte of
 * *mask is used; it is expanded once into xmm_mask up front. */
static force_inline void
scaled_nearest_scanline_sse2_8888_n_8888_OVER (const uint32_t * mask,
					       uint32_t *       dst,
					       const uint32_t * src,
					       int32_t          w,
					       pixman_fixed_t   vx,
					       pixman_fixed_t   unit_x,
					       pixman_fixed_t   src_width_fixed,
					       pixman_bool_t    zero_src)
{
    __m128i xmm_mask;
    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_alpha_lo, xmm_alpha_hi;

    /* Zero source or zero mask alpha leaves the destination unchanged. */
    if (zero_src || (*mask >> 24) == 0)
	return;

    xmm_mask = create_mask_16_128 (*mask >> 24);

    /* Single pixels until dst is 16-byte aligned. */
    while (w && (uintptr_t)dst & 15)
    {
	uint32_t s = *(src + pixman_fixed_to_int (vx));
	vx += unit_x;
	while (vx >= 0)
	    vx -= src_width_fixed;  /* wrap for NORMAL repeat */

	if (s)
	{
	    uint32_t d = *dst;

	    __m128i ms = unpack_32_1x128 (s);
	    __m128i alpha     = expand_alpha_1x128 (ms);
	    __m128i dest      = xmm_mask;
	    __m128i alpha_dst = unpack_32_1x128 (d);

	    *dst = pack_1x128_32 (
		in_over_1x128 (&ms, &alpha, &dest, &alpha_dst));
	}
	dst++;
	w--;
    }

    /* Main loop: gather 4 nearest source pixels, blend in SIMD. */
    while (w >= 4)
    {
	uint32_t tmp1, tmp2, tmp3, tmp4;

	tmp1 = *(src + pixman_fixed_to_int (vx));
	vx += unit_x;
	while (vx >= 0)
	    vx -= src_width_fixed;
	tmp2 = *(src + pixman_fixed_to_int (vx));
	vx += unit_x;
	while (vx >= 0)
	    vx -= src_width_fixed;
	tmp3 = *(src + pixman_fixed_to_int (vx));
	vx += unit_x;
	while (vx >= 0)
	    vx -= src_width_fixed;
	tmp4 = *(src + pixman_fixed_to_int (vx));
	vx += unit_x;
	while (vx >= 0)
	    vx -= src_width_fixed;

	xmm_src = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1);

	if (!is_zero (xmm_src))
	{
	    xmm_dst = load_128_aligned ((__m128i*)dst);

	    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
	    expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
			        &xmm_alpha_lo, &xmm_alpha_hi);

	    in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
			   &xmm_alpha_lo, &xmm_alpha_hi,
			   &xmm_mask, &xmm_mask,
			   &xmm_dst_lo, &xmm_dst_hi);

	    save_128_aligned (
		(__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
	}

	dst += 4;
	w -= 4;
    }

    /* Tail: remaining (< 4) pixels. */
    while (w)
    {
	uint32_t s = *(src + pixman_fixed_to_int (vx));
	vx += unit_x;
	while (vx >= 0)
	    vx -= src_width_fixed;

	if (s)
	{
	    uint32_t d = *dst;

	    __m128i ms = unpack_32_1x128 (s);
	    __m128i alpha = expand_alpha_1x128 (ms);
	    __m128i mask  = xmm_mask;
	    __m128i dest  = unpack_32_1x128 (d);

	    *dst = pack_1x128_32 (
		in_over_1x128 (&ms, &alpha, &mask, &dest));
	}

	dst++;
	w--;
    }

}
5543
/* Instantiate nearest-scaled 8888-with-solid-mask fast paths per repeat mode. */
FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_cover_OVER,
			      scaled_nearest_scanline_sse2_8888_n_8888_OVER,
			      uint32_t, uint32_t, uint32_t, COVER, TRUE, TRUE)
FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_pad_OVER,
			      scaled_nearest_scanline_sse2_8888_n_8888_OVER,
			      uint32_t, uint32_t, uint32_t, PAD, TRUE, TRUE)
FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_none_OVER,
			      scaled_nearest_scanline_sse2_8888_n_8888_OVER,
			      uint32_t, uint32_t, uint32_t, NONE, TRUE, TRUE)
FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_normal_OVER,
			      scaled_nearest_scanline_sse2_8888_n_8888_OVER,
			      uint32_t, uint32_t, uint32_t, NORMAL, TRUE, TRUE)
5556
/* Per-scanline state for the SSE2 bilinear fetcher.  xmm_x tracks the
 * fixed-point x position as (vx, -(vx + 1)) pairs replicated across the
 * eight 16-bit lanes, and xmm_ux advances it by unit_x per pixel.  The
 * two variants differ only in lane layout: for < 8 interpolation bits
 * the weight pairs are interleaved (consumed via _mm_madd_epi16), for
 * >= 8 bits they are grouped in halves (consumed via mullo/mulhi) —
 * matching the two code paths in BILINEAR_INTERPOLATE_ONE_PIXEL. */
#if BILINEAR_INTERPOLATION_BITS < 8
# define BILINEAR_DECLARE_VARIABLES						\
    const __m128i xmm_wt = _mm_set_epi16 (wt, wt, wt, wt, wt, wt, wt, wt);	\
    const __m128i xmm_wb = _mm_set_epi16 (wb, wb, wb, wb, wb, wb, wb, wb);	\
    const __m128i xmm_addc = _mm_set_epi16 (0, 1, 0, 1, 0, 1, 0, 1);		\
    const __m128i xmm_ux = _mm_set_epi16 (unit_x, -unit_x, unit_x, -unit_x,	\
					  unit_x, -unit_x, unit_x, -unit_x);	\
    const __m128i xmm_zero = _mm_setzero_si128 ();				\
    __m128i xmm_x = _mm_set_epi16 (vx, -(vx + 1), vx, -(vx + 1),		\
				   vx, -(vx + 1), vx, -(vx + 1))
#else
# define BILINEAR_DECLARE_VARIABLES						\
    const __m128i xmm_wt = _mm_set_epi16 (wt, wt, wt, wt, wt, wt, wt, wt);	\
    const __m128i xmm_wb = _mm_set_epi16 (wb, wb, wb, wb, wb, wb, wb, wb);	\
    const __m128i xmm_addc = _mm_set_epi16 (0, 0, 0, 0, 1, 1, 1, 1);		\
    const __m128i xmm_ux = _mm_set_epi16 (unit_x, unit_x, unit_x, unit_x,	\
					  -unit_x, -unit_x, -unit_x, -unit_x);	\
    const __m128i xmm_zero = _mm_setzero_si128 ();				\
    __m128i xmm_x = _mm_set_epi16 (vx, vx, vx, vx,				\
				   -(vx + 1), -(vx + 1), -(vx + 1), -(vx + 1))
#endif
5578
/* Bilinearly interpolate one destination pixel: fetch the 2x2 source
 * block at pixman_fixed_to_int (vx) from src_top/src_bottom, blend the
 * rows with the vertical weights (xmm_wt/xmm_wb), blend horizontally
 * with the weights derived from xmm_x, and assign the packed 32-bit
 * result to 'pix'.  Advances vx and xmm_x by one source step.  The
 * BILINEAR_INTERPOLATION_BITS branch is resolved at compile time. */
#define BILINEAR_INTERPOLATE_ONE_PIXEL(pix)					\
do {										\
    __m128i xmm_wh, xmm_lo, xmm_hi, a;						\
    /* fetch 2x2 pixel block into sse2 registers */				\
    __m128i tltr = _mm_loadl_epi64 (						\
			    (__m128i *)&src_top[pixman_fixed_to_int (vx)]);	\
    __m128i blbr = _mm_loadl_epi64 (						\
			    (__m128i *)&src_bottom[pixman_fixed_to_int (vx)]);	\
    vx += unit_x;								\
    /* vertical interpolation */						\
    a = _mm_add_epi16 (_mm_mullo_epi16 (_mm_unpacklo_epi8 (tltr, xmm_zero),	\
					xmm_wt),				\
		       _mm_mullo_epi16 (_mm_unpacklo_epi8 (blbr, xmm_zero),	\
					xmm_wb));				\
    if (BILINEAR_INTERPOLATION_BITS < 8)					\
    {										\
	/* calculate horizontal weights */					\
	xmm_wh = _mm_add_epi16 (xmm_addc, _mm_srli_epi16 (xmm_x,		\
					16 - BILINEAR_INTERPOLATION_BITS));	\
	xmm_x = _mm_add_epi16 (xmm_x, xmm_ux);					\
	/* horizontal interpolation */						\
	a = _mm_madd_epi16 (_mm_unpackhi_epi16 (_mm_shuffle_epi32 (		\
		a, _MM_SHUFFLE (1, 0, 3, 2)), a), xmm_wh);			\
    }										\
    else									\
    {										\
	/* calculate horizontal weights */					\
	xmm_wh = _mm_add_epi16 (xmm_addc, _mm_srli_epi16 (xmm_x,		\
					16 - BILINEAR_INTERPOLATION_BITS));	\
	xmm_x = _mm_add_epi16 (xmm_x, xmm_ux);					\
	/* horizontal interpolation */						\
	xmm_lo = _mm_mullo_epi16 (a, xmm_wh);					\
	xmm_hi = _mm_mulhi_epu16 (a, xmm_wh);					\
	a = _mm_add_epi32 (_mm_unpacklo_epi16 (xmm_lo, xmm_hi),			\
			   _mm_unpackhi_epi16 (xmm_lo, xmm_hi));		\
    }										\
    /* shift and pack the result */						\
    a = _mm_srli_epi32 (a, BILINEAR_INTERPOLATION_BITS * 2);			\
    a = _mm_packs_epi32 (a, a);							\
    a = _mm_packus_epi16 (a, a);						\
    pix = _mm_cvtsi128_si32 (a);						\
} while (0)
5621
/* Advance the bilinear x position by one pixel without fetching —
 * used when a zero mask makes the interpolation result irrelevant. */
#define BILINEAR_SKIP_ONE_PIXEL()						\
do {										\
    vx += unit_x;								\
    xmm_x = _mm_add_epi16 (xmm_x, xmm_ux);					\
} while(0)
5627
/* SRC composite scanline for bilinear scaling: every destination pixel
 * is the bilinear interpolation of a 2x2 source block; the previous
 * destination contents are simply overwritten.  Pixels are produced in
 * groups of 4, then 2, then 1. */
static force_inline void
scaled_bilinear_scanline_sse2_8888_8888_SRC (uint32_t *       dst,
					     const uint32_t * mask,
					     const uint32_t * src_top,
					     const uint32_t * src_bottom,
					     int32_t          w,
					     int              wt,
					     int              wb,
					     pixman_fixed_t   vx,
					     pixman_fixed_t   unit_x,
					     pixman_fixed_t   max_vx,
					     pixman_bool_t    zero_src)
{
    BILINEAR_DECLARE_VARIABLES;
    uint32_t pix1, pix2, pix3, pix4;

    while ((w -= 4) >= 0)
    {
	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
	BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
	BILINEAR_INTERPOLATE_ONE_PIXEL (pix3);
	BILINEAR_INTERPOLATE_ONE_PIXEL (pix4);
	*dst++ = pix1;
	*dst++ = pix2;
	*dst++ = pix3;
	*dst++ = pix4;
    }

    /* After the loop w is in [-4, -1]; on two's-complement machines the
     * bit tests below recover the residual count (w + 4) in [0, 3]. */
    if (w & 2)
    {
	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
	BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
	*dst++ = pix1;
	*dst++ = pix2;
    }

    if (w & 1)
    {
	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
	*dst = pix1;
    }

}
5671
/* Instantiate bilinear-scaled 8888 SRC fast paths, one per repeat mode. */
FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_cover_SRC,
			       scaled_bilinear_scanline_sse2_8888_8888_SRC,
			       uint32_t, uint32_t, uint32_t,
			       COVER, FLAG_NONE)
FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_pad_SRC,
			       scaled_bilinear_scanline_sse2_8888_8888_SRC,
			       uint32_t, uint32_t, uint32_t,
			       PAD, FLAG_NONE)
FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_none_SRC,
			       scaled_bilinear_scanline_sse2_8888_8888_SRC,
			       uint32_t, uint32_t, uint32_t,
			       NONE, FLAG_NONE)
FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_normal_SRC,
			       scaled_bilinear_scanline_sse2_8888_8888_SRC,
			       uint32_t, uint32_t, uint32_t,
			       NORMAL, FLAG_NONE)
5688
/* OVER composite scanline for bilinear scaling: interpolated source
 * pixels are composited over the destination, with group-of-4 fast
 * paths for fully transparent (skip the store) and fully opaque
 * (straight copy) sources. */
static force_inline void
scaled_bilinear_scanline_sse2_8888_8888_OVER (uint32_t *       dst,
					      const uint32_t * mask,
					      const uint32_t * src_top,
					      const uint32_t * src_bottom,
					      int32_t          w,
					      int              wt,
					      int              wb,
					      pixman_fixed_t   vx,
					      pixman_fixed_t   unit_x,
					      pixman_fixed_t   max_vx,
					      pixman_bool_t    zero_src)
{
    BILINEAR_DECLARE_VARIABLES;
    uint32_t pix1, pix2, pix3, pix4;

    /* Single pixels until dst is 16-byte aligned. */
    while (w && ((uintptr_t)dst & 15))
    {
	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);

	if (pix1)
	{
	    pix2 = *dst;
	    *dst = core_combine_over_u_pixel_sse2 (pix1, pix2);
	}

	w--;
	dst++;
    }

    /* Main loop: interpolate 4 pixels, then blend them in SIMD. */
    while (w  >= 4)
    {
	__m128i xmm_src;
	__m128i xmm_src_hi, xmm_src_lo, xmm_dst_hi, xmm_dst_lo;
	__m128i xmm_alpha_hi, xmm_alpha_lo;

	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
	BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
	BILINEAR_INTERPOLATE_ONE_PIXEL (pix3);
	BILINEAR_INTERPOLATE_ONE_PIXEL (pix4);

	xmm_src = _mm_set_epi32 (pix4, pix3, pix2, pix1);

	if (!is_zero (xmm_src))
	{
	    if (is_opaque (xmm_src))
	    {
		save_128_aligned ((__m128i *)dst, xmm_src);
	    }
	    else
	    {
		__m128i xmm_dst = load_128_aligned ((__m128i *)dst);

		unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
		unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);

		expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi);
		over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi,
			    &xmm_dst_lo, &xmm_dst_hi);

		save_128_aligned ((__m128i *)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
	    }
	}

	w -= 4;
	dst += 4;
    }

    /* Tail: remaining (< 4) pixels. */
    while (w)
    {
	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);

	if (pix1)
	{
	    pix2 = *dst;
	    *dst = core_combine_over_u_pixel_sse2 (pix1, pix2);
	}

	w--;
	dst++;
    }
}
5771
/* Instantiate bilinear-scaled 8888 OVER fast paths, one per repeat mode. */
FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_cover_OVER,
			       scaled_bilinear_scanline_sse2_8888_8888_OVER,
			       uint32_t, uint32_t, uint32_t,
			       COVER, FLAG_NONE)
FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_pad_OVER,
			       scaled_bilinear_scanline_sse2_8888_8888_OVER,
			       uint32_t, uint32_t, uint32_t,
			       PAD, FLAG_NONE)
FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_none_OVER,
			       scaled_bilinear_scanline_sse2_8888_8888_OVER,
			       uint32_t, uint32_t, uint32_t,
			       NONE, FLAG_NONE)
FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_normal_OVER,
			       scaled_bilinear_scanline_sse2_8888_8888_OVER,
			       uint32_t, uint32_t, uint32_t,
			       NORMAL, FLAG_NONE)
5788
/* OVER composite scanline for bilinear scaling with an a8 mask:
 * dest = (interpolated src IN mask) OVER dest.  Wherever the mask is
 * zero the interpolation itself is skipped (BILINEAR_SKIP_ONE_PIXEL
 * still advances the source position). */
static force_inline void
scaled_bilinear_scanline_sse2_8888_8_8888_OVER (uint32_t *       dst,
						const uint8_t  * mask,
						const uint32_t * src_top,
						const uint32_t * src_bottom,
						int32_t          w,
						int              wt,
						int              wb,
						pixman_fixed_t   vx,
						pixman_fixed_t   unit_x,
						pixman_fixed_t   max_vx,
						pixman_bool_t    zero_src)
{
    BILINEAR_DECLARE_VARIABLES;
    uint32_t pix1, pix2, pix3, pix4;
    uint32_t m;

    /* Single pixels until dst is 16-byte aligned. */
    while (w && ((uintptr_t)dst & 15))
    {
	uint32_t sa;

	m = (uint32_t) *mask++;

	if (m)
	{
	    BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
	    sa = pix1 >> 24;

	    if (sa == 0xff && m == 0xff)
	    {
		/* Fully opaque source and mask: plain copy. */
		*dst = pix1;
	    }
	    else
	    {
		__m128i ms, md, ma, msa;

		pix2 = *dst;
		ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
		ms = unpack_32_1x128 (pix1);
		md = unpack_32_1x128 (pix2);

		msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));

		*dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
	    }
	}
	else
	{
	    BILINEAR_SKIP_ONE_PIXEL ();
	}

	w--;
	dst++;
    }

    /* Main loop: 4 mask bytes at a time; all-zero masks skip 4 pixels. */
    while (w >= 4)
    {
	__m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
	__m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
	__m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;

	m = *(uint32_t*)mask;

	if (m)
	{
	    BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
	    BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
	    BILINEAR_INTERPOLATE_ONE_PIXEL (pix3);
	    BILINEAR_INTERPOLATE_ONE_PIXEL (pix4);

	    xmm_src = _mm_set_epi32 (pix4, pix3, pix2, pix1);

	    if (m == 0xffffffff && is_opaque (xmm_src))
	    {
		save_128_aligned ((__m128i *)dst, xmm_src);
	    }
	    else
	    {
		xmm_dst = load_128_aligned ((__m128i *)dst);

		xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());

		unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
		unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
		unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);

		expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
		expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

		in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
			       &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);

		save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
	    }
	}
	else
	{
	    BILINEAR_SKIP_ONE_PIXEL ();
	    BILINEAR_SKIP_ONE_PIXEL ();
	    BILINEAR_SKIP_ONE_PIXEL ();
	    BILINEAR_SKIP_ONE_PIXEL ();
	}

	w -= 4;
	dst += 4;
	mask += 4;
    }

    /* Tail: remaining (< 4) pixels, same per-pixel logic as the head. */
    while (w)
    {
	uint32_t sa;

	m = (uint32_t) *mask++;

	if (m)
	{
	    BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
	    sa = pix1 >> 24;

	    if (sa == 0xff && m == 0xff)
	    {
		*dst = pix1;
	    }
	    else
	    {
		__m128i ms, md, ma, msa;

		pix2 = *dst;
		ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
		ms = unpack_32_1x128 (pix1);
		md = unpack_32_1x128 (pix2);

		msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));

		*dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
	    }
	}
	else
	{
	    BILINEAR_SKIP_ONE_PIXEL ();
	}

	w--;
	dst++;
    }
}
5935
/* Instantiate the bilinear main-loop wrappers around the scanline
 * function above, one entry point per source repeat mode (COVER, PAD,
 * NONE, NORMAL).  The type arguments give the dst, mask and src pixel
 * types; FLAG_HAVE_NON_SOLID_MASK indicates the mask argument is a
 * per-pixel uint8_t buffer rather than a single solid value. */
FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_cover_OVER,
			       scaled_bilinear_scanline_sse2_8888_8_8888_OVER,
			       uint32_t, uint8_t, uint32_t,
			       COVER, FLAG_HAVE_NON_SOLID_MASK)
FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_pad_OVER,
			       scaled_bilinear_scanline_sse2_8888_8_8888_OVER,
			       uint32_t, uint8_t, uint32_t,
			       PAD, FLAG_HAVE_NON_SOLID_MASK)
FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_none_OVER,
			       scaled_bilinear_scanline_sse2_8888_8_8888_OVER,
			       uint32_t, uint8_t, uint32_t,
			       NONE, FLAG_HAVE_NON_SOLID_MASK)
FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_normal_OVER,
			       scaled_bilinear_scanline_sse2_8888_8_8888_OVER,
			       uint32_t, uint8_t, uint32_t,
			       NORMAL, FLAG_HAVE_NON_SOLID_MASK)
5952
5953static force_inline void
5954scaled_bilinear_scanline_sse2_8888_n_8888_OVER (uint32_t *       dst,
5955						const uint32_t * mask,
5956						const uint32_t * src_top,
5957						const uint32_t * src_bottom,
5958						int32_t          w,
5959						int              wt,
5960						int              wb,
5961						pixman_fixed_t   vx,
5962						pixman_fixed_t   unit_x,
5963						pixman_fixed_t   max_vx,
5964						pixman_bool_t    zero_src)
5965{
5966    BILINEAR_DECLARE_VARIABLES;
5967    uint32_t pix1, pix2, pix3, pix4;
5968    __m128i xmm_mask;
5969
5970    if (zero_src || (*mask >> 24) == 0)
5971	return;
5972
5973    xmm_mask = create_mask_16_128 (*mask >> 24);
5974
5975    while (w && ((uintptr_t)dst & 15))
5976    {
5977	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
5978	if (pix1)
5979	{
5980		uint32_t d = *dst;
5981
5982		__m128i ms = unpack_32_1x128 (pix1);
5983		__m128i alpha     = expand_alpha_1x128 (ms);
5984		__m128i dest      = xmm_mask;
5985		__m128i alpha_dst = unpack_32_1x128 (d);
5986
5987		*dst = pack_1x128_32
5988			(in_over_1x128 (&ms, &alpha, &dest, &alpha_dst));
5989	}
5990
5991	dst++;
5992	w--;
5993    }
5994
5995    while (w >= 4)
5996    {
5997	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
5998	BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
5999	BILINEAR_INTERPOLATE_ONE_PIXEL (pix3);
6000	BILINEAR_INTERPOLATE_ONE_PIXEL (pix4);
6001
6002	if (pix1 | pix2 | pix3 | pix4)
6003	{
6004	    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
6005	    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
6006	    __m128i xmm_alpha_lo, xmm_alpha_hi;
6007
6008	    xmm_src = _mm_set_epi32 (pix4, pix3, pix2, pix1);
6009
6010	    xmm_dst = load_128_aligned ((__m128i*)dst);
6011
6012	    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
6013	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
6014	    expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
6015				&xmm_alpha_lo, &xmm_alpha_hi);
6016
6017	    in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
6018			   &xmm_alpha_lo, &xmm_alpha_hi,
6019			   &xmm_mask, &xmm_mask,
6020			   &xmm_dst_lo, &xmm_dst_hi);
6021
6022	    save_128_aligned
6023		((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
6024	}
6025
6026	dst += 4;
6027	w -= 4;
6028    }
6029
6030    while (w)
6031    {
6032	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
6033	if (pix1)
6034	{
6035		uint32_t d = *dst;
6036
6037		__m128i ms = unpack_32_1x128 (pix1);
6038		__m128i alpha     = expand_alpha_1x128 (ms);
6039		__m128i dest      = xmm_mask;
6040		__m128i alpha_dst = unpack_32_1x128 (d);
6041
6042		*dst = pack_1x128_32
6043			(in_over_1x128 (&ms, &alpha, &dest, &alpha_dst));
6044	}
6045
6046	dst++;
6047	w--;
6048    }
6049}
6050
/* Instantiate the bilinear main-loop wrappers for the solid-mask
 * scanline above, one entry point per source repeat mode (COVER, PAD,
 * NONE, NORMAL).  FLAG_HAVE_SOLID_MASK indicates the mask argument is
 * a single uint32_t solid value rather than a per-pixel buffer. */
FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_n_8888_cover_OVER,
			       scaled_bilinear_scanline_sse2_8888_n_8888_OVER,
			       uint32_t, uint32_t, uint32_t,
			       COVER, FLAG_HAVE_SOLID_MASK)
FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_n_8888_pad_OVER,
			       scaled_bilinear_scanline_sse2_8888_n_8888_OVER,
			       uint32_t, uint32_t, uint32_t,
			       PAD, FLAG_HAVE_SOLID_MASK)
FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_n_8888_none_OVER,
			       scaled_bilinear_scanline_sse2_8888_n_8888_OVER,
			       uint32_t, uint32_t, uint32_t,
			       NONE, FLAG_HAVE_SOLID_MASK)
FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_n_8888_normal_OVER,
			       scaled_bilinear_scanline_sse2_8888_n_8888_OVER,
			       uint32_t, uint32_t, uint32_t,
			       NORMAL, FLAG_HAVE_SOLID_MASK)
6067
6068static const pixman_fast_path_t sse2_fast_paths[] =
6069{
6070    /* PIXMAN_OP_OVER */
6071    PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, sse2_composite_over_n_8_0565),
6072    PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, sse2_composite_over_n_8_0565),
6073    PIXMAN_STD_FAST_PATH (OVER, solid, null, a8r8g8b8, sse2_composite_over_n_8888),
6074    PIXMAN_STD_FAST_PATH (OVER, solid, null, x8r8g8b8, sse2_composite_over_n_8888),
6075    PIXMAN_STD_FAST_PATH (OVER, solid, null, r5g6b5, sse2_composite_over_n_0565),
6076    PIXMAN_STD_FAST_PATH (OVER, solid, null, b5g6r5, sse2_composite_over_n_0565),
6077    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, sse2_composite_over_8888_8888),
6078    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, sse2_composite_over_8888_8888),
6079    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, sse2_composite_over_8888_8888),
6080    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, sse2_composite_over_8888_8888),
6081    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, r5g6b5, sse2_composite_over_8888_0565),
6082    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, b5g6r5, sse2_composite_over_8888_0565),
6083    PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, sse2_composite_over_n_8_8888),
6084    PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, sse2_composite_over_n_8_8888),
6085    PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, sse2_composite_over_n_8_8888),
6086    PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, sse2_composite_over_n_8_8888),
6087    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, a8r8g8b8, sse2_composite_over_8888_8888_8888),
6088    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, x8r8g8b8, sse2_composite_over_8888_8_8888),
6089    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, a8r8g8b8, sse2_composite_over_8888_8_8888),
6090    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, x8b8g8r8, sse2_composite_over_8888_8_8888),
6091    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, a8b8g8r8, sse2_composite_over_8888_8_8888),
6092    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, x8r8g8b8, sse2_composite_over_x888_8_8888),
6093    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, a8r8g8b8, sse2_composite_over_x888_8_8888),
6094    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, x8b8g8r8, sse2_composite_over_x888_8_8888),
6095    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, a8b8g8r8, sse2_composite_over_x888_8_8888),
6096    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, a8r8g8b8, sse2_composite_over_x888_n_8888),
6097    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, x8r8g8b8, sse2_composite_over_x888_n_8888),
6098    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, a8b8g8r8, sse2_composite_over_x888_n_8888),
6099    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, x8b8g8r8, sse2_composite_over_x888_n_8888),
6100    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, a8r8g8b8, sse2_composite_over_8888_n_8888),
6101    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, x8r8g8b8, sse2_composite_over_8888_n_8888),
6102    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, a8b8g8r8, sse2_composite_over_8888_n_8888),
6103    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, x8b8g8r8, sse2_composite_over_8888_n_8888),
6104    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, sse2_composite_over_n_8888_8888_ca),
6105    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, sse2_composite_over_n_8888_8888_ca),
6106    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, sse2_composite_over_n_8888_8888_ca),
6107    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, sse2_composite_over_n_8888_8888_ca),
6108    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, r5g6b5, sse2_composite_over_n_8888_0565_ca),
6109    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, b5g6r5, sse2_composite_over_n_8888_0565_ca),
6110    PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, a8r8g8b8, sse2_composite_over_pixbuf_8888),
6111    PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, x8r8g8b8, sse2_composite_over_pixbuf_8888),
6112    PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, a8b8g8r8, sse2_composite_over_pixbuf_8888),
6113    PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, x8b8g8r8, sse2_composite_over_pixbuf_8888),
6114    PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, r5g6b5, sse2_composite_over_pixbuf_0565),
6115    PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, b5g6r5, sse2_composite_over_pixbuf_0565),
6116    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
6117    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
6118
6119    /* PIXMAN_OP_OVER_REVERSE */
6120    PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8r8g8b8, sse2_composite_over_reverse_n_8888),
6121    PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8b8g8r8, sse2_composite_over_reverse_n_8888),
6122
6123    /* PIXMAN_OP_ADD */
6124    PIXMAN_STD_FAST_PATH_CA (ADD, solid, a8r8g8b8, a8r8g8b8, sse2_composite_add_n_8888_8888_ca),
6125    PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, sse2_composite_add_8_8),
6126    PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, sse2_composite_add_8888_8888),
6127    PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, sse2_composite_add_8888_8888),
6128    PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, sse2_composite_add_n_8_8),
6129    PIXMAN_STD_FAST_PATH (ADD, solid, null, a8, sse2_composite_add_n_8),
6130    PIXMAN_STD_FAST_PATH (ADD, solid, null, x8r8g8b8, sse2_composite_add_n_8888),
6131    PIXMAN_STD_FAST_PATH (ADD, solid, null, a8r8g8b8, sse2_composite_add_n_8888),
6132    PIXMAN_STD_FAST_PATH (ADD, solid, null, x8b8g8r8, sse2_composite_add_n_8888),
6133    PIXMAN_STD_FAST_PATH (ADD, solid, null, a8b8g8r8, sse2_composite_add_n_8888),
6134    PIXMAN_STD_FAST_PATH (ADD, solid, a8, x8r8g8b8, sse2_composite_add_n_8_8888),
6135    PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8r8g8b8, sse2_composite_add_n_8_8888),
6136    PIXMAN_STD_FAST_PATH (ADD, solid, a8, x8b8g8r8, sse2_composite_add_n_8_8888),
6137    PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8b8g8r8, sse2_composite_add_n_8_8888),
6138
6139    /* PIXMAN_OP_SRC */
6140    PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8r8g8b8, sse2_composite_src_n_8_8888),
6141    PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8r8g8b8, sse2_composite_src_n_8_8888),
6142    PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8b8g8r8, sse2_composite_src_n_8_8888),
6143    PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8b8g8r8, sse2_composite_src_n_8_8888),
6144    PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, r5g6b5, sse2_composite_src_x888_0565),
6145    PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, b5g6r5, sse2_composite_src_x888_0565),
6146    PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, r5g6b5, sse2_composite_src_x888_0565),
6147    PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, b5g6r5, sse2_composite_src_x888_0565),
6148    PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, a8r8g8b8, sse2_composite_src_x888_8888),
6149    PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, a8b8g8r8, sse2_composite_src_x888_8888),
6150    PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, a8r8g8b8, sse2_composite_copy_area),
6151    PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, a8b8g8r8, sse2_composite_copy_area),
6152    PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
6153    PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
6154    PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
6155    PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
6156    PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, sse2_composite_copy_area),
6157    PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, b5g6r5, sse2_composite_copy_area),
6158
6159    /* PIXMAN_OP_IN */
6160    PIXMAN_STD_FAST_PATH (IN, a8, null, a8, sse2_composite_in_8_8),
6161    PIXMAN_STD_FAST_PATH (IN, solid, a8, a8, sse2_composite_in_n_8_8),
6162    PIXMAN_STD_FAST_PATH (IN, solid, null, a8, sse2_composite_in_n_8),
6163
6164    SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
6165    SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
6166    SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
6167    SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
6168    SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
6169    SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
6170    SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
6171    SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
6172    SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
6173    SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
6174    SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
6175    SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
6176    SIMPLE_NEAREST_FAST_PATH_NORMAL (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
6177    SIMPLE_NEAREST_FAST_PATH_NORMAL (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
6178    SIMPLE_NEAREST_FAST_PATH_NORMAL (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
6179    SIMPLE_NEAREST_FAST_PATH_NORMAL (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
6180
6181    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_n_8888),
6182    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_n_8888),
6183    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_n_8888),
6184    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_n_8888),
6185    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NORMAL (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_n_8888),
6186    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NORMAL (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_n_8888),
6187    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NORMAL (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_n_8888),
6188    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NORMAL (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_n_8888),
6189
6190    SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
6191    SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
6192    SIMPLE_BILINEAR_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, sse2_8888_8888),
6193    SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
6194    SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
6195    SIMPLE_BILINEAR_FAST_PATH (SRC, x8b8g8r8, x8b8g8r8, sse2_8888_8888),
6196
6197    SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
6198