1/*
2 * Copyright © 2004, 2005 Red Hat, Inc.
3 * Copyright © 2004 Nicholas Miell
4 * Copyright © 2005 Trolltech AS
5 *
6 * Permission to use, copy, modify, distribute, and sell this software and its
7 * documentation for any purpose is hereby granted without fee, provided that
8 * the above copyright notice appear in all copies and that both that
9 * copyright notice and this permission notice appear in supporting
10 * documentation, and that the name of Red Hat not be used in advertising or
11 * publicity pertaining to distribution of the software without specific,
12 * written prior permission.  Red Hat makes no representations about the
13 * suitability of this software for any purpose.  It is provided "as is"
14 * without express or implied warranty.
15 *
16 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
17 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
18 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
19 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
20 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
21 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
22 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
23 * SOFTWARE.
24 *
25 * Author:  Søren Sandmann (sandmann@redhat.com)
26 * Minor Improvements: Nicholas Miell (nmiell@gmail.com)
27 * MMX code paths for fbcompose.c by Lars Knoll (lars@trolltech.com)
28 *
29 * Based on work by Owen Taylor
30 */
31
32#ifdef HAVE_CONFIG_H
33#include <config.h>
34#endif
35
36#if defined USE_X86_MMX || defined USE_ARM_IWMMXT || defined USE_LOONGSON_MMI
37
38#ifdef USE_LOONGSON_MMI
39#include <loongson-mmintrin.h>
40#else
41#include <mmintrin.h>
42#endif
43#include "pixman-private.h"
44#include "pixman-combine32.h"
45#include "pixman-inlines.h"
46
47#ifdef VERBOSE
48#define CHECKPOINT() error_f ("at %s %d\n", __FUNCTION__, __LINE__)
49#else
50#define CHECKPOINT()
51#endif
52
53#if defined USE_ARM_IWMMXT && __GNUC__ == 4 && __GNUC_MINOR__ < 8
54/* Empty the multimedia state. For some reason, ARM's mmintrin.h doesn't provide this.  */
55extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
56_mm_empty (void)
57{
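    /* Nothing to do; this stub exists only so that calls to _mm_empty () compile. */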
58
59}
60#endif
61
62#ifdef USE_X86_MMX
63# if (defined(__SUNPRO_C) || defined(_MSC_VER) || defined(_WIN64))
64#  include <xmmintrin.h>
65# else
66/* We have to compile with -msse to use xmmintrin.h, but that causes SSE
67 * instructions to be generated that we don't want. Just duplicate the
68 * functions we want to use.  */
69extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
70_mm_movemask_pi8 (__m64 __A)
71{
72    int ret;
73
74    asm ("pmovmskb %1, %0\n\t"
75	: "=r" (ret)
76	: "y" (__A)
77    );
78
79    return ret;
80}
81
82extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
83_mm_mulhi_pu16 (__m64 __A, __m64 __B)
84{
85    asm ("pmulhuw %1, %0\n\t"
86	: "+y" (__A)
87	: "y" (__B)
88    );
89    return __A;
90}
91
92#  ifdef __OPTIMIZE__
93extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
94_mm_shuffle_pi16 (__m64 __A, int8_t const __N)
95{
96    __m64 ret;
97
98    asm ("pshufw %2, %1, %0\n\t"
99	: "=y" (ret)
100	: "y" (__A), "K" (__N)
101    );
102
103    return ret;
104}
105#  else
106#   define _mm_shuffle_pi16(A, N)					\
107    ({									\
108	__m64 ret;							\
109									\
110	asm ("pshufw %2, %1, %0\n\t"					\
111	     : "=y" (ret)						\
112	     : "y" (A), "K" ((const int8_t)N)				\
113	);								\
114									\
115	ret;								\
116    })
117#  endif
118# endif
119#endif
120
121#ifndef _MSC_VER
122#define _MM_SHUFFLE(fp3,fp2,fp1,fp0) \
123 (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | (fp0))
124#endif
125
/* Notes about writing mmx code
 *
 * Give memory operands as the second operand.  If you pass a memory
 * operand as the first one, gcc will first load it into a register and
 * then use that register.
 *
 *   i.e. use
 *
 *         _mm_mullo_pi16 (x, mmx_constant);
 *
 *   not
 *
 *         _mm_mullo_pi16 (mmx_constant, x);
 *
 * Also try to minimize dependencies: when you need a value, try to
 * calculate it from a value that was computed as early as possible
 * (see the example below).
 */
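
/* For instance (an illustrative sketch, not a code path used below),
 * computing the expanded alpha immediately after loading the source
 *
 *         __m64 vsrc  = load8888 (src);
 *         __m64 vsrca = expand_alpha (vsrc);
 *         __m64 vdest = load8888 (dst);
 *         store8888 (dst, over (vsrc, vsrca, vdest));
 *
 * gives the pshufw behind expand_alpha () time to finish before over ()
 * needs its result, rather than scheduling it right before the call.
 */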
144
145/* --------------- MMX primitives ------------------------------------- */
146
147/* If __m64 is defined as a struct or union, then define M64_MEMBER to be
148 * the name of the member used to access the data.
149 * If __m64 requires using mm_cvt* intrinsics functions to convert between
150 * uint64_t and __m64 values, then define USE_CVT_INTRINSICS.
151 * If __m64 and uint64_t values can just be cast to each other directly,
152 * then define USE_M64_CASTS.
153 * If __m64 is a double datatype, then define USE_M64_DOUBLE.
154 */
155#ifdef _MSC_VER
156# define M64_MEMBER m64_u64
157#elif defined(__ICC)
158# define USE_CVT_INTRINSICS
159#elif defined(USE_LOONGSON_MMI)
160# define USE_M64_DOUBLE
161#elif defined(__GNUC__)
162# define USE_M64_CASTS
163#elif defined(__SUNPRO_C)
164# if (__SUNPRO_C >= 0x5120) && !defined(__NOVECTORSIZE__)
165/* Solaris Studio 12.3 (Sun C 5.12) introduces __attribute__(__vector_size__)
166 * support, and defaults to using it to define __m64, unless __NOVECTORSIZE__
167 * is defined.   If it is used, then the mm_cvt* intrinsics must be used.
168 */
169#  define USE_CVT_INTRINSICS
170# else
171/* For Studio 12.2 or older, or when __attribute__(__vector_size__) is
172 * disabled, __m64 is defined as a struct containing "unsigned long long l_".
173 */
174#  define M64_MEMBER l_
175# endif
176#endif
177
178#if defined(USE_M64_CASTS) || defined(USE_CVT_INTRINSICS) || defined(USE_M64_DOUBLE)
179typedef uint64_t mmxdatafield;
180#else
181typedef __m64 mmxdatafield;
182#endif
183
184typedef struct
185{
186    mmxdatafield mmx_4x00ff;
187    mmxdatafield mmx_4x0080;
188    mmxdatafield mmx_565_rgb;
189    mmxdatafield mmx_565_unpack_multiplier;
190    mmxdatafield mmx_565_pack_multiplier;
191    mmxdatafield mmx_565_r;
192    mmxdatafield mmx_565_g;
193    mmxdatafield mmx_565_b;
194    mmxdatafield mmx_packed_565_rb;
195    mmxdatafield mmx_packed_565_g;
196    mmxdatafield mmx_expand_565_g;
197    mmxdatafield mmx_expand_565_b;
198    mmxdatafield mmx_expand_565_r;
199#ifndef USE_LOONGSON_MMI
200    mmxdatafield mmx_mask_0;
201    mmxdatafield mmx_mask_1;
202    mmxdatafield mmx_mask_2;
203    mmxdatafield mmx_mask_3;
204#endif
205    mmxdatafield mmx_full_alpha;
206    mmxdatafield mmx_4x0101;
207    mmxdatafield mmx_ff000000;
208} mmx_data_t;
209
210#if defined(_MSC_VER)
211# define MMXDATA_INIT(field, val) { val ## UI64 }
212#elif defined(M64_MEMBER)       /* __m64 is a struct, not an integral type */
213# define MMXDATA_INIT(field, val) field =   { val ## ULL }
214#else                           /* mmxdatafield is an integral type */
215# define MMXDATA_INIT(field, val) field =   val ## ULL
216#endif
217
218static const mmx_data_t c =
219{
220    MMXDATA_INIT (.mmx_4x00ff,                   0x00ff00ff00ff00ff),
221    MMXDATA_INIT (.mmx_4x0080,                   0x0080008000800080),
222    MMXDATA_INIT (.mmx_565_rgb,                  0x000001f0003f001f),
223    MMXDATA_INIT (.mmx_565_unpack_multiplier,    0x0000008404100840),
224    MMXDATA_INIT (.mmx_565_pack_multiplier,      0x2000000420000004),
225    MMXDATA_INIT (.mmx_565_r,                    0x000000f800000000),
226    MMXDATA_INIT (.mmx_565_g,                    0x0000000000fc0000),
227    MMXDATA_INIT (.mmx_565_b,                    0x00000000000000f8),
228    MMXDATA_INIT (.mmx_packed_565_rb,            0x00f800f800f800f8),
229    MMXDATA_INIT (.mmx_packed_565_g,             0x0000fc000000fc00),
230    MMXDATA_INIT (.mmx_expand_565_g,             0x07e007e007e007e0),
231    MMXDATA_INIT (.mmx_expand_565_b,             0x001f001f001f001f),
232    MMXDATA_INIT (.mmx_expand_565_r,             0xf800f800f800f800),
233#ifndef USE_LOONGSON_MMI
234    MMXDATA_INIT (.mmx_mask_0,                   0xffffffffffff0000),
235    MMXDATA_INIT (.mmx_mask_1,                   0xffffffff0000ffff),
236    MMXDATA_INIT (.mmx_mask_2,                   0xffff0000ffffffff),
237    MMXDATA_INIT (.mmx_mask_3,                   0x0000ffffffffffff),
238#endif
239    MMXDATA_INIT (.mmx_full_alpha,               0x00ff000000000000),
240    MMXDATA_INIT (.mmx_4x0101,                   0x0101010101010101),
241    MMXDATA_INIT (.mmx_ff000000,                 0xff000000ff000000),
242};
243
244#ifdef USE_CVT_INTRINSICS
245#    define MC(x) to_m64 (c.mmx_ ## x)
246#elif defined(USE_M64_CASTS)
247#    define MC(x) ((__m64)c.mmx_ ## x)
248#elif defined(USE_M64_DOUBLE)
249#    define MC(x) (*(__m64 *)&c.mmx_ ## x)
250#else
251#    define MC(x) c.mmx_ ## x
252#endif
253
254static force_inline __m64
255to_m64 (uint64_t x)
256{
257#ifdef USE_CVT_INTRINSICS
258    return _mm_cvtsi64_m64 (x);
259#elif defined M64_MEMBER        /* __m64 is a struct, not an integral type */
260    __m64 res;
261
262    res.M64_MEMBER = x;
263    return res;
264#elif defined USE_M64_DOUBLE
265    return *(__m64 *)&x;
266#else /* USE_M64_CASTS */
267    return (__m64)x;
268#endif
269}
270
271static force_inline uint64_t
272to_uint64 (__m64 x)
273{
274#ifdef USE_CVT_INTRINSICS
275    return _mm_cvtm64_si64 (x);
276#elif defined M64_MEMBER        /* __m64 is a struct, not an integral type */
277    uint64_t res = x.M64_MEMBER;
278    return res;
279#elif defined USE_M64_DOUBLE
280    return *(uint64_t *)&x;
281#else /* USE_M64_CASTS */
282    return (uint64_t)x;
283#endif
284}
285
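/* Shift the whole 64 bit value left by s bits when s > 0 and right by -s
 * bits when s < 0; used below to move 565 fields into position. */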
286static force_inline __m64
287shift (__m64 v,
288       int   s)
289{
290    if (s > 0)
291	return _mm_slli_si64 (v, s);
292    else if (s < 0)
293	return _mm_srli_si64 (v, -s);
294    else
295	return v;
296}
297
298static force_inline __m64
299negate (__m64 mask)
300{
301    return _mm_xor_si64 (mask, MC (4x00ff));
302}
303
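/* Per-channel multiply of two expanded pixels (each 16 bit lane holds a
 * value <= 0xff, so the product fits in the lane).  The add of 0x0080
 * followed by the unsigned multiply-high with 0x0101 computes
 * ((a * b + 128) * 257) >> 16, i.e. pixman's rounded divide-by-255
 * (MUL_UN8). */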
304static force_inline __m64
305pix_multiply (__m64 a, __m64 b)
306{
307    __m64 res;
308
309    res = _mm_mullo_pi16 (a, b);
310    res = _mm_adds_pu16 (res, MC (4x0080));
311    res = _mm_mulhi_pu16 (res, MC (4x0101));
312
313    return res;
314}
315
316static force_inline __m64
317pix_add (__m64 a, __m64 b)
318{
319    return _mm_adds_pu8 (a, b);
320}
321
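/* expand_alpha () replicates the alpha lane (lane 3) of an expanded pixel
 * into all four lanes; expand_alpha_rev () does the same with lane 0, for
 * values such as 8 bit mask bytes that were loaded into the low lane. */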
322static force_inline __m64
323expand_alpha (__m64 pixel)
324{
325    return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE (3, 3, 3, 3));
326}
327
328static force_inline __m64
329expand_alpha_rev (__m64 pixel)
330{
331    return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE (0, 0, 0, 0));
332}
333
334static force_inline __m64
335invert_colors (__m64 pixel)
336{
337    return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE (3, 0, 1, 2));
338}
339
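/* Porter-Duff OVER for premultiplied pixels: src + dest * (255 - srca) / 255,
 * with all operands in expanded 00AA00RR00GG00BB form and srca holding the
 * source alpha replicated into every channel. */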
340static force_inline __m64
341over (__m64 src,
342      __m64 srca,
343      __m64 dest)
344{
345    return _mm_adds_pu8 (src, pix_multiply (dest, negate (srca)));
346}
347
348static force_inline __m64
349over_rev_non_pre (__m64 src, __m64 dest)
350{
351    __m64 srca = expand_alpha (src);
352    __m64 srcfaaa = _mm_or_si64 (srca, MC (full_alpha));
353
354    return over (pix_multiply (invert_colors (src), srcfaaa), srca, dest);
355}
356
357static force_inline __m64
358in (__m64 src, __m64 mask)
359{
360    return pix_multiply (src, mask);
361}
362
363#ifndef _MSC_VER
364static force_inline __m64
365in_over (__m64 src, __m64 srca, __m64 mask, __m64 dest)
366{
367    return over (in (src, mask), pix_multiply (srca, mask), dest);
368}
369
370#else
371
372#define in_over(src, srca, mask, dest)					\
373    over (in (src, mask), pix_multiply (srca, mask), dest)
374
375#endif
376
377/* Elemental unaligned loads */
378
379static force_inline __m64 ldq_u(__m64 *p)
380{
381#ifdef USE_X86_MMX
382    /* x86's alignment restrictions are very relaxed. */
383    return *(__m64 *)p;
384#elif defined USE_ARM_IWMMXT
385    int align = (uintptr_t)p & 7;
386    __m64 *aligned_p;
387    if (align == 0)
388	return *p;
389    aligned_p = (__m64 *)((uintptr_t)p & ~7);
390    return (__m64) _mm_align_si64 (aligned_p[0], aligned_p[1], align);
391#else
392    struct __una_u64 { __m64 x __attribute__((packed)); };
393    const struct __una_u64 *ptr = (const struct __una_u64 *) p;
394    return (__m64) ptr->x;
395#endif
396}
397
398static force_inline uint32_t ldl_u(const uint32_t *p)
399{
400#ifdef USE_X86_MMX
401    /* x86's alignment restrictions are very relaxed. */
402    return *p;
403#else
404    struct __una_u32 { uint32_t x __attribute__((packed)); };
405    const struct __una_u32 *ptr = (const struct __una_u32 *) p;
406    return ptr->x;
407#endif
408}
409
410static force_inline __m64
411load (const uint32_t *v)
412{
413#ifdef USE_LOONGSON_MMI
414    __m64 ret;
415    asm ("lwc1 %0, %1\n\t"
416	: "=f" (ret)
417	: "m" (*v)
418    );
419    return ret;
420#else
421    return _mm_cvtsi32_si64 (*v);
422#endif
423}
424
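/* Load a single packed 32 bit pixel and unpack it to 00AA00RR00GG00BB. */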
425static force_inline __m64
426load8888 (const uint32_t *v)
427{
428#ifdef USE_LOONGSON_MMI
429    return _mm_unpacklo_pi8_f (*(__m32 *)v, _mm_setzero_si64 ());
430#else
431    return _mm_unpacklo_pi8 (load (v), _mm_setzero_si64 ());
432#endif
433}
434
435static force_inline __m64
436load8888u (const uint32_t *v)
437{
438    uint32_t l = ldl_u (v);
439    return load8888 (&l);
440}
441
442static force_inline __m64
443pack8888 (__m64 lo, __m64 hi)
444{
445    return _mm_packs_pu16 (lo, hi);
446}
447
448static force_inline void
449store (uint32_t *dest, __m64 v)
450{
451#ifdef USE_LOONGSON_MMI
452    asm ("swc1 %1, %0\n\t"
453	: "=m" (*dest)
454	: "f" (v)
455	: "memory"
456    );
457#else
458    *dest = _mm_cvtsi64_si32 (v);
459#endif
460}
461
462static force_inline void
463store8888 (uint32_t *dest, __m64 v)
464{
465    v = pack8888 (v, _mm_setzero_si64 ());
466    store (dest, v);
467}
468
469static force_inline pixman_bool_t
470is_equal (__m64 a, __m64 b)
471{
472#ifdef USE_LOONGSON_MMI
    /* __m64 is a double here, so the values can be compared directly. */
474    return a == b;
475#else
476    return _mm_movemask_pi8 (_mm_cmpeq_pi8 (a, b)) == 0xff;
477#endif
478}
479
480static force_inline pixman_bool_t
481is_opaque (__m64 v)
482{
483#ifdef USE_LOONGSON_MMI
484    return is_equal (_mm_and_si64 (v, MC (full_alpha)), MC (full_alpha));
485#else
486    __m64 ffs = _mm_cmpeq_pi8 (v, v);
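    /* Byte 6 of an expanded 00AA00RR00GG00BB pixel is the low byte of the
     * alpha lane, so bit 6 (0x40) of the movemask is set iff alpha == 0xff. */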
487    return (_mm_movemask_pi8 (_mm_cmpeq_pi8 (v, ffs)) & 0x40);
488#endif
489}
490
491static force_inline pixman_bool_t
492is_zero (__m64 v)
493{
494    return is_equal (v, _mm_setzero_si64 ());
495}
496
497/* Expand 16 bits positioned at @pos (0-3) of a mmx register into
498 *
499 *    00RR00GG00BB
500 *
501 * --- Expanding 565 in the low word ---
502 *
503 * m = (m << (32 - 3)) | (m << (16 - 5)) | m;
504 * m = m & (01f0003f001f);
505 * m = m * (008404100840);
506 * m = m >> 8;
507 *
508 * Note the trick here - the top word is shifted by another nibble to
509 * avoid it bumping into the middle word
510 */
511static force_inline __m64
512expand565 (__m64 pixel, int pos)
513{
514    __m64 p = pixel;
515    __m64 t1, t2;
516
517    /* move pixel to low 16 bit and zero the rest */
518#ifdef USE_LOONGSON_MMI
519    p = loongson_extract_pi16 (p, pos);
520#else
521    p = shift (shift (p, (3 - pos) * 16), -48);
522#endif
523
524    t1 = shift (p, 36 - 11);
525    t2 = shift (p, 16 - 5);
526
527    p = _mm_or_si64 (t1, p);
528    p = _mm_or_si64 (t2, p);
529    p = _mm_and_si64 (p, MC (565_rgb));
530
531    pixel = _mm_mullo_pi16 (p, MC (565_unpack_multiplier));
532    return _mm_srli_pi16 (pixel, 8);
533}
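
/* A worked example of the expansion above (illustrative only): with the
 * 565 pixel 0xf81f (r = 0x1f, g = 0x00, b = 0x1f) in the low word,
 *
 *    t1 = p << 25:      0x000001f03e000000
 *    t2 = p << 11:      0x0000000007c0f800
 *    p | t1 | t2:       0x000001f03fc0f81f
 *    & 565_rgb:         0x000001f00000001f
 *    * multiplier >> 8: 0x000000ff000000ff    (00RR00GG00BB)
 */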
534
/* Expand 4 16 bit pixels in an mmx register into two mmx registers of
 *
 *    AARRGGBBAARRGGBB
 */
539static force_inline void
540expand_4xpacked565 (__m64 vin, __m64 *vout0, __m64 *vout1, int full_alpha)
541{
542    __m64 t0, t1, alpha = _mm_setzero_si64 ();
543    __m64 r = _mm_and_si64 (vin, MC (expand_565_r));
544    __m64 g = _mm_and_si64 (vin, MC (expand_565_g));
545    __m64 b = _mm_and_si64 (vin, MC (expand_565_b));
546    if (full_alpha)
547	alpha = _mm_cmpeq_pi32 (alpha, alpha);
548
549    /* Replicate high bits into empty low bits. */
550    r = _mm_or_si64 (_mm_srli_pi16 (r, 8), _mm_srli_pi16 (r, 13));
551    g = _mm_or_si64 (_mm_srli_pi16 (g, 3), _mm_srli_pi16 (g, 9));
552    b = _mm_or_si64 (_mm_slli_pi16 (b, 3), _mm_srli_pi16 (b, 2));
553
554    r = _mm_packs_pu16 (r, _mm_setzero_si64 ());	/* 00 00 00 00 R3 R2 R1 R0 */
555    g = _mm_packs_pu16 (g, _mm_setzero_si64 ());	/* 00 00 00 00 G3 G2 G1 G0 */
556    b = _mm_packs_pu16 (b, _mm_setzero_si64 ());	/* 00 00 00 00 B3 B2 B1 B0 */
557
558    t1 = _mm_unpacklo_pi8 (r, alpha);			/* A3 R3 A2 R2 A1 R1 A0 R0 */
559    t0 = _mm_unpacklo_pi8 (b, g);			/* G3 B3 G2 B2 G1 B1 G0 B0 */
560
561    *vout0 = _mm_unpacklo_pi16 (t0, t1);		/* A1 R1 G1 B1 A0 R0 G0 B0 */
562    *vout1 = _mm_unpackhi_pi16 (t0, t1);		/* A3 R3 G3 B3 A2 R2 G2 B2 */
563}
564
565static force_inline __m64
566expand8888 (__m64 in, int pos)
567{
568    if (pos == 0)
569	return _mm_unpacklo_pi8 (in, _mm_setzero_si64 ());
570    else
571	return _mm_unpackhi_pi8 (in, _mm_setzero_si64 ());
572}
573
574static force_inline __m64
575expandx888 (__m64 in, int pos)
576{
577    return _mm_or_si64 (expand8888 (in, pos), MC (full_alpha));
578}
579
580static force_inline void
581expand_4x565 (__m64 vin, __m64 *vout0, __m64 *vout1, __m64 *vout2, __m64 *vout3, int full_alpha)
582{
583    __m64 v0, v1;
584    expand_4xpacked565 (vin, &v0, &v1, full_alpha);
585    *vout0 = expand8888 (v0, 0);
586    *vout1 = expand8888 (v0, 1);
587    *vout2 = expand8888 (v1, 0);
588    *vout3 = expand8888 (v1, 1);
589}
590
591static force_inline __m64
592pack_565 (__m64 pixel, __m64 target, int pos)
593{
594    __m64 p = pixel;
595    __m64 t = target;
596    __m64 r, g, b;
597
598    r = _mm_and_si64 (p, MC (565_r));
599    g = _mm_and_si64 (p, MC (565_g));
600    b = _mm_and_si64 (p, MC (565_b));
601
602#ifdef USE_LOONGSON_MMI
603    r = shift (r, -(32 - 8));
604    g = shift (g, -(16 - 3));
605    b = shift (b, -(0  + 3));
606
607    p = _mm_or_si64 (r, g);
608    p = _mm_or_si64 (p, b);
609    return loongson_insert_pi16 (t, p, pos);
610#else
611    r = shift (r, -(32 - 8) + pos * 16);
612    g = shift (g, -(16 - 3) + pos * 16);
613    b = shift (b, -(0  + 3) + pos * 16);
614
615    if (pos == 0)
616	t = _mm_and_si64 (t, MC (mask_0));
617    else if (pos == 1)
618	t = _mm_and_si64 (t, MC (mask_1));
619    else if (pos == 2)
620	t = _mm_and_si64 (t, MC (mask_2));
621    else if (pos == 3)
622	t = _mm_and_si64 (t, MC (mask_3));
623
624    p = _mm_or_si64 (r, t);
625    p = _mm_or_si64 (g, p);
626
627    return _mm_or_si64 (b, p);
628#endif
629}
630
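/* Pack four 8888 pixels (two per argument) into four 565 pixels.  The
 * pmaddwd with 565_pack_multiplier moves each pixel's red field to bits
 * 16-20 and its blue field to bits 5-9 of its 32 bit lane in a single
 * step; green (already at bits 10-15 after masking) is or'ed in, and a
 * final shift right by 5 leaves an r5g6b5 value in the low half of each
 * lane, which the code below merges into one register. */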
631static force_inline __m64
632pack_4xpacked565 (__m64 a, __m64 b)
633{
634    __m64 rb0 = _mm_and_si64 (a, MC (packed_565_rb));
635    __m64 rb1 = _mm_and_si64 (b, MC (packed_565_rb));
636
637    __m64 t0 = _mm_madd_pi16 (rb0, MC (565_pack_multiplier));
638    __m64 t1 = _mm_madd_pi16 (rb1, MC (565_pack_multiplier));
639
640    __m64 g0 = _mm_and_si64 (a, MC (packed_565_g));
641    __m64 g1 = _mm_and_si64 (b, MC (packed_565_g));
642
643    t0 = _mm_or_si64 (t0, g0);
644    t1 = _mm_or_si64 (t1, g1);
645
646    t0 = shift(t0, -5);
647#ifdef USE_ARM_IWMMXT
648    t1 = shift(t1, -5);
649    return _mm_packs_pu32 (t0, t1);
650#else
651    t1 = shift(t1, -5 + 16);
652    return _mm_shuffle_pi16 (_mm_or_si64 (t0, t1), _MM_SHUFFLE (3, 1, 2, 0));
653#endif
654}
655
656#ifndef _MSC_VER
657
658static force_inline __m64
659pack_4x565 (__m64 v0, __m64 v1, __m64 v2, __m64 v3)
660{
661    return pack_4xpacked565 (pack8888 (v0, v1), pack8888 (v2, v3));
662}
663
664static force_inline __m64
665pix_add_mul (__m64 x, __m64 a, __m64 y, __m64 b)
666{
667    x = pix_multiply (x, a);
668    y = pix_multiply (y, b);
669
670    return pix_add (x, y);
671}
672
673#else
674
/* MSVC only supports passing up to three SSE/MMX vector arguments by
 * register, so these helpers are macros rather than inline functions. */
676
677#define pack_4x565(v0, v1, v2, v3) \
678    pack_4xpacked565 (pack8888 (v0, v1), pack8888 (v2, v3))
679
680#define pix_add_mul(x, a, y, b)	 \
681    ( x = pix_multiply (x, a),	 \
682      y = pix_multiply (y, b),	 \
683      pix_add (x, y) )
684
685#endif
686
687/* --------------- MMX code patch for fbcompose.c --------------------- */
688
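/* Load a source pixel and, when a mask is present, scale it by the mask's
 * expanded alpha; the result is returned in expanded 00AA00RR00GG00BB form. */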
689static force_inline __m64
690combine (const uint32_t *src, const uint32_t *mask)
691{
692    __m64 vsrc = load8888 (src);
693
694    if (mask)
695    {
696	__m64 m = load8888 (mask);
697
698	m = expand_alpha (m);
699	vsrc = pix_multiply (vsrc, m);
700    }
701
702    return vsrc;
703}
704
705static force_inline __m64
706core_combine_over_u_pixel_mmx (__m64 vsrc, __m64 vdst)
707{
708    vsrc = _mm_unpacklo_pi8 (vsrc, _mm_setzero_si64 ());
709
710    if (is_opaque (vsrc))
711    {
712	return vsrc;
713    }
714    else if (!is_zero (vsrc))
715    {
716	return over (vsrc, expand_alpha (vsrc),
717		     _mm_unpacklo_pi8 (vdst, _mm_setzero_si64 ()));
718    }
719
720    return _mm_unpacklo_pi8 (vdst, _mm_setzero_si64 ());
721}
722
723static void
724mmx_combine_over_u (pixman_implementation_t *imp,
725                    pixman_op_t              op,
726                    uint32_t *               dest,
727                    const uint32_t *         src,
728                    const uint32_t *         mask,
729                    int                      width)
730{
731    const uint32_t *end = dest + width;
732
733    while (dest < end)
734    {
735	__m64 vsrc = combine (src, mask);
736
737	if (is_opaque (vsrc))
738	{
739	    store8888 (dest, vsrc);
740	}
741	else if (!is_zero (vsrc))
742	{
743	    __m64 sa = expand_alpha (vsrc);
744	    store8888 (dest, over (vsrc, sa, load8888 (dest)));
745	}
746
747	++dest;
748	++src;
749	if (mask)
750	    ++mask;
751    }
752    _mm_empty ();
753}
754
755static void
756mmx_combine_over_reverse_u (pixman_implementation_t *imp,
757                            pixman_op_t              op,
758                            uint32_t *               dest,
759                            const uint32_t *         src,
760                            const uint32_t *         mask,
761                            int                      width)
762{
763    const uint32_t *end = dest + width;
764
765    while (dest < end)
766    {
767	__m64 d, da;
768	__m64 s = combine (src, mask);
769
770	d = load8888 (dest);
771	da = expand_alpha (d);
772	store8888 (dest, over (d, da, s));
773
774	++dest;
775	++src;
776	if (mask)
777	    mask++;
778    }
779    _mm_empty ();
780}
781
782static void
783mmx_combine_in_u (pixman_implementation_t *imp,
784                  pixman_op_t              op,
785                  uint32_t *               dest,
786                  const uint32_t *         src,
787                  const uint32_t *         mask,
788                  int                      width)
789{
790    const uint32_t *end = dest + width;
791
792    while (dest < end)
793    {
794	__m64 a;
795	__m64 x = combine (src, mask);
796
797	a = load8888 (dest);
798	a = expand_alpha (a);
799	x = pix_multiply (x, a);
800
801	store8888 (dest, x);
802
803	++dest;
804	++src;
805	if (mask)
806	    mask++;
807    }
808    _mm_empty ();
809}
810
811static void
812mmx_combine_in_reverse_u (pixman_implementation_t *imp,
813                          pixman_op_t              op,
814                          uint32_t *               dest,
815                          const uint32_t *         src,
816                          const uint32_t *         mask,
817                          int                      width)
818{
819    const uint32_t *end = dest + width;
820
821    while (dest < end)
822    {
823	__m64 a = combine (src, mask);
824	__m64 x;
825
826	x = load8888 (dest);
827	a = expand_alpha (a);
828	x = pix_multiply (x, a);
829	store8888 (dest, x);
830
831	++dest;
832	++src;
833	if (mask)
834	    mask++;
835    }
836    _mm_empty ();
837}
838
839static void
840mmx_combine_out_u (pixman_implementation_t *imp,
841                   pixman_op_t              op,
842                   uint32_t *               dest,
843                   const uint32_t *         src,
844                   const uint32_t *         mask,
845                   int                      width)
846{
847    const uint32_t *end = dest + width;
848
849    while (dest < end)
850    {
851	__m64 a;
852	__m64 x = combine (src, mask);
853
854	a = load8888 (dest);
855	a = expand_alpha (a);
856	a = negate (a);
857	x = pix_multiply (x, a);
858	store8888 (dest, x);
859
860	++dest;
861	++src;
862	if (mask)
863	    mask++;
864    }
865    _mm_empty ();
866}
867
868static void
869mmx_combine_out_reverse_u (pixman_implementation_t *imp,
870                           pixman_op_t              op,
871                           uint32_t *               dest,
872                           const uint32_t *         src,
873                           const uint32_t *         mask,
874                           int                      width)
875{
876    const uint32_t *end = dest + width;
877
878    while (dest < end)
879    {
880	__m64 a = combine (src, mask);
881	__m64 x;
882
883	x = load8888 (dest);
884	a = expand_alpha (a);
885	a = negate (a);
886	x = pix_multiply (x, a);
887
888	store8888 (dest, x);
889
890	++dest;
891	++src;
892	if (mask)
893	    mask++;
894    }
895    _mm_empty ();
896}
897
898static void
899mmx_combine_atop_u (pixman_implementation_t *imp,
900                    pixman_op_t              op,
901                    uint32_t *               dest,
902                    const uint32_t *         src,
903                    const uint32_t *         mask,
904                    int                      width)
905{
906    const uint32_t *end = dest + width;
907
908    while (dest < end)
909    {
910	__m64 da, d, sia;
911	__m64 s = combine (src, mask);
912
913	d = load8888 (dest);
914	sia = expand_alpha (s);
915	sia = negate (sia);
916	da = expand_alpha (d);
917	s = pix_add_mul (s, da, d, sia);
918	store8888 (dest, s);
919
920	++dest;
921	++src;
922	if (mask)
923	    mask++;
924    }
925    _mm_empty ();
926}
927
928static void
929mmx_combine_atop_reverse_u (pixman_implementation_t *imp,
930                            pixman_op_t              op,
931                            uint32_t *               dest,
932                            const uint32_t *         src,
933                            const uint32_t *         mask,
934                            int                      width)
935{
936    const uint32_t *end;
937
938    end = dest + width;
939
940    while (dest < end)
941    {
942	__m64 dia, d, sa;
943	__m64 s = combine (src, mask);
944
945	d = load8888 (dest);
946	sa = expand_alpha (s);
947	dia = expand_alpha (d);
948	dia = negate (dia);
949	s = pix_add_mul (s, dia, d, sa);
950	store8888 (dest, s);
951
952	++dest;
953	++src;
954	if (mask)
955	    mask++;
956    }
957    _mm_empty ();
958}
959
960static void
961mmx_combine_xor_u (pixman_implementation_t *imp,
962                   pixman_op_t              op,
963                   uint32_t *               dest,
964                   const uint32_t *         src,
965                   const uint32_t *         mask,
966                   int                      width)
967{
968    const uint32_t *end = dest + width;
969
970    while (dest < end)
971    {
972	__m64 dia, d, sia;
973	__m64 s = combine (src, mask);
974
975	d = load8888 (dest);
976	sia = expand_alpha (s);
977	dia = expand_alpha (d);
978	sia = negate (sia);
979	dia = negate (dia);
980	s = pix_add_mul (s, dia, d, sia);
981	store8888 (dest, s);
982
983	++dest;
984	++src;
985	if (mask)
986	    mask++;
987    }
988    _mm_empty ();
989}
990
991static void
992mmx_combine_add_u (pixman_implementation_t *imp,
993                   pixman_op_t              op,
994                   uint32_t *               dest,
995                   const uint32_t *         src,
996                   const uint32_t *         mask,
997                   int                      width)
998{
999    const uint32_t *end = dest + width;
1000
1001    while (dest < end)
1002    {
1003	__m64 d;
1004	__m64 s = combine (src, mask);
1005
1006	d = load8888 (dest);
1007	s = pix_add (s, d);
1008	store8888 (dest, s);
1009
1010	++dest;
1011	++src;
1012	if (mask)
1013	    mask++;
1014    }
1015    _mm_empty ();
1016}
1017
1018static void
1019mmx_combine_saturate_u (pixman_implementation_t *imp,
1020                        pixman_op_t              op,
1021                        uint32_t *               dest,
1022                        const uint32_t *         src,
1023                        const uint32_t *         mask,
1024                        int                      width)
1025{
1026    const uint32_t *end = dest + width;
1027
1028    while (dest < end)
1029    {
1030	uint32_t s, sa, da;
1031	uint32_t d = *dest;
1032	__m64 ms = combine (src, mask);
1033	__m64 md = load8888 (dest);
1034
1035	store8888(&s, ms);
1036	da = ~d >> 24;
1037	sa = s >> 24;
1038
1039	if (sa > da)
1040	{
1041	    uint32_t quot = DIV_UN8 (da, sa) << 24;
1042	    __m64 msa = load8888 (&quot);
1043	    msa = expand_alpha (msa);
1044	    ms = pix_multiply (ms, msa);
1045	}
1046
1047	md = pix_add (md, ms);
1048	store8888 (dest, md);
1049
1050	++src;
1051	++dest;
1052	if (mask)
1053	    mask++;
1054    }
1055    _mm_empty ();
1056}
1057
1058static void
1059mmx_combine_src_ca (pixman_implementation_t *imp,
1060                    pixman_op_t              op,
1061                    uint32_t *               dest,
1062                    const uint32_t *         src,
1063                    const uint32_t *         mask,
1064                    int                      width)
1065{
1066    const uint32_t *end = src + width;
1067
1068    while (src < end)
1069    {
1070	__m64 a = load8888 (mask);
1071	__m64 s = load8888 (src);
1072
1073	s = pix_multiply (s, a);
1074	store8888 (dest, s);
1075
1076	++src;
1077	++mask;
1078	++dest;
1079    }
1080    _mm_empty ();
1081}
1082
1083static void
1084mmx_combine_over_ca (pixman_implementation_t *imp,
1085                     pixman_op_t              op,
1086                     uint32_t *               dest,
1087                     const uint32_t *         src,
1088                     const uint32_t *         mask,
1089                     int                      width)
1090{
1091    const uint32_t *end = src + width;
1092
1093    while (src < end)
1094    {
1095	__m64 a = load8888 (mask);
1096	__m64 s = load8888 (src);
1097	__m64 d = load8888 (dest);
1098	__m64 sa = expand_alpha (s);
1099
1100	store8888 (dest, in_over (s, sa, a, d));
1101
1102	++src;
1103	++dest;
1104	++mask;
1105    }
1106    _mm_empty ();
1107}
1108
1109static void
1110mmx_combine_over_reverse_ca (pixman_implementation_t *imp,
1111                             pixman_op_t              op,
1112                             uint32_t *               dest,
1113                             const uint32_t *         src,
1114                             const uint32_t *         mask,
1115                             int                      width)
1116{
1117    const uint32_t *end = src + width;
1118
1119    while (src < end)
1120    {
1121	__m64 a = load8888 (mask);
1122	__m64 s = load8888 (src);
1123	__m64 d = load8888 (dest);
1124	__m64 da = expand_alpha (d);
1125
1126	store8888 (dest, over (d, da, in (s, a)));
1127
1128	++src;
1129	++dest;
1130	++mask;
1131    }
1132    _mm_empty ();
1133}
1134
1135static void
1136mmx_combine_in_ca (pixman_implementation_t *imp,
1137                   pixman_op_t              op,
1138                   uint32_t *               dest,
1139                   const uint32_t *         src,
1140                   const uint32_t *         mask,
1141                   int                      width)
1142{
1143    const uint32_t *end = src + width;
1144
1145    while (src < end)
1146    {
1147	__m64 a = load8888 (mask);
1148	__m64 s = load8888 (src);
1149	__m64 d = load8888 (dest);
1150	__m64 da = expand_alpha (d);
1151
1152	s = pix_multiply (s, a);
1153	s = pix_multiply (s, da);
1154	store8888 (dest, s);
1155
1156	++src;
1157	++dest;
1158	++mask;
1159    }
1160    _mm_empty ();
1161}
1162
1163static void
1164mmx_combine_in_reverse_ca (pixman_implementation_t *imp,
1165                           pixman_op_t              op,
1166                           uint32_t *               dest,
1167                           const uint32_t *         src,
1168                           const uint32_t *         mask,
1169                           int                      width)
1170{
1171    const uint32_t *end = src + width;
1172
1173    while (src < end)
1174    {
1175	__m64 a = load8888 (mask);
1176	__m64 s = load8888 (src);
1177	__m64 d = load8888 (dest);
1178	__m64 sa = expand_alpha (s);
1179
1180	a = pix_multiply (a, sa);
1181	d = pix_multiply (d, a);
1182	store8888 (dest, d);
1183
1184	++src;
1185	++dest;
1186	++mask;
1187    }
1188    _mm_empty ();
1189}
1190
1191static void
1192mmx_combine_out_ca (pixman_implementation_t *imp,
1193                    pixman_op_t              op,
1194                    uint32_t *               dest,
1195                    const uint32_t *         src,
1196                    const uint32_t *         mask,
1197                    int                      width)
1198{
1199    const uint32_t *end = src + width;
1200
1201    while (src < end)
1202    {
1203	__m64 a = load8888 (mask);
1204	__m64 s = load8888 (src);
1205	__m64 d = load8888 (dest);
1206	__m64 da = expand_alpha (d);
1207
1208	da = negate (da);
1209	s = pix_multiply (s, a);
1210	s = pix_multiply (s, da);
1211	store8888 (dest, s);
1212
1213	++src;
1214	++dest;
1215	++mask;
1216    }
1217    _mm_empty ();
1218}
1219
1220static void
1221mmx_combine_out_reverse_ca (pixman_implementation_t *imp,
1222                            pixman_op_t              op,
1223                            uint32_t *               dest,
1224                            const uint32_t *         src,
1225                            const uint32_t *         mask,
1226                            int                      width)
1227{
1228    const uint32_t *end = src + width;
1229
1230    while (src < end)
1231    {
1232	__m64 a = load8888 (mask);
1233	__m64 s = load8888 (src);
1234	__m64 d = load8888 (dest);
1235	__m64 sa = expand_alpha (s);
1236
1237	a = pix_multiply (a, sa);
1238	a = negate (a);
1239	d = pix_multiply (d, a);
1240	store8888 (dest, d);
1241
1242	++src;
1243	++dest;
1244	++mask;
1245    }
1246    _mm_empty ();
1247}
1248
1249static void
1250mmx_combine_atop_ca (pixman_implementation_t *imp,
1251                     pixman_op_t              op,
1252                     uint32_t *               dest,
1253                     const uint32_t *         src,
1254                     const uint32_t *         mask,
1255                     int                      width)
1256{
1257    const uint32_t *end = src + width;
1258
1259    while (src < end)
1260    {
1261	__m64 a = load8888 (mask);
1262	__m64 s = load8888 (src);
1263	__m64 d = load8888 (dest);
1264	__m64 da = expand_alpha (d);
1265	__m64 sa = expand_alpha (s);
1266
1267	s = pix_multiply (s, a);
1268	a = pix_multiply (a, sa);
1269	a = negate (a);
1270	d = pix_add_mul (d, a, s, da);
1271	store8888 (dest, d);
1272
1273	++src;
1274	++dest;
1275	++mask;
1276    }
1277    _mm_empty ();
1278}
1279
1280static void
1281mmx_combine_atop_reverse_ca (pixman_implementation_t *imp,
1282                             pixman_op_t              op,
1283                             uint32_t *               dest,
1284                             const uint32_t *         src,
1285                             const uint32_t *         mask,
1286                             int                      width)
1287{
1288    const uint32_t *end = src + width;
1289
1290    while (src < end)
1291    {
1292	__m64 a = load8888 (mask);
1293	__m64 s = load8888 (src);
1294	__m64 d = load8888 (dest);
1295	__m64 da = expand_alpha (d);
1296	__m64 sa = expand_alpha (s);
1297
1298	s = pix_multiply (s, a);
1299	a = pix_multiply (a, sa);
1300	da = negate (da);
1301	d = pix_add_mul (d, a, s, da);
1302	store8888 (dest, d);
1303
1304	++src;
1305	++dest;
1306	++mask;
1307    }
1308    _mm_empty ();
1309}
1310
1311static void
1312mmx_combine_xor_ca (pixman_implementation_t *imp,
1313                    pixman_op_t              op,
1314                    uint32_t *               dest,
1315                    const uint32_t *         src,
1316                    const uint32_t *         mask,
1317                    int                      width)
1318{
1319    const uint32_t *end = src + width;
1320
1321    while (src < end)
1322    {
1323	__m64 a = load8888 (mask);
1324	__m64 s = load8888 (src);
1325	__m64 d = load8888 (dest);
1326	__m64 da = expand_alpha (d);
1327	__m64 sa = expand_alpha (s);
1328
1329	s = pix_multiply (s, a);
1330	a = pix_multiply (a, sa);
1331	da = negate (da);
1332	a = negate (a);
1333	d = pix_add_mul (d, a, s, da);
1334	store8888 (dest, d);
1335
1336	++src;
1337	++dest;
1338	++mask;
1339    }
1340    _mm_empty ();
1341}
1342
1343static void
1344mmx_combine_add_ca (pixman_implementation_t *imp,
1345                    pixman_op_t              op,
1346                    uint32_t *               dest,
1347                    const uint32_t *         src,
1348                    const uint32_t *         mask,
1349                    int                      width)
1350{
1351    const uint32_t *end = src + width;
1352
1353    while (src < end)
1354    {
1355	__m64 a = load8888 (mask);
1356	__m64 s = load8888 (src);
1357	__m64 d = load8888 (dest);
1358
1359	s = pix_multiply (s, a);
1360	d = pix_add (s, d);
1361	store8888 (dest, d);
1362
1363	++src;
1364	++dest;
1365	++mask;
1366    }
1367    _mm_empty ();
1368}
1369
1370/* ------------- MMX code paths called from fbpict.c -------------------- */
1371
1372static void
1373mmx_composite_over_n_8888 (pixman_implementation_t *imp,
1374                           pixman_composite_info_t *info)
1375{
1376    PIXMAN_COMPOSITE_ARGS (info);
1377    uint32_t src;
1378    uint32_t    *dst_line, *dst;
1379    int32_t w;
1380    int dst_stride;
1381    __m64 vsrc, vsrca;
1382
1383    CHECKPOINT ();
1384
1385    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
1386
1387    if (src == 0)
1388	return;
1389
1390    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
1391
1392    vsrc = load8888 (&src);
1393    vsrca = expand_alpha (vsrc);
1394
1395    while (height--)
1396    {
1397	dst = dst_line;
1398	dst_line += dst_stride;
1399	w = width;
1400
1401	CHECKPOINT ();
1402
1403	while (w && (uintptr_t)dst & 7)
1404	{
1405	    store8888 (dst, over (vsrc, vsrca, load8888 (dst)));
1406
1407	    w--;
1408	    dst++;
1409	}
1410
1411	while (w >= 2)
1412	{
1413	    __m64 vdest;
1414	    __m64 dest0, dest1;
1415
1416	    vdest = *(__m64 *)dst;
1417
1418	    dest0 = over (vsrc, vsrca, expand8888 (vdest, 0));
1419	    dest1 = over (vsrc, vsrca, expand8888 (vdest, 1));
1420
1421	    *(__m64 *)dst = pack8888 (dest0, dest1);
1422
1423	    dst += 2;
1424	    w -= 2;
1425	}
1426
1427	CHECKPOINT ();
1428
1429	if (w)
1430	{
1431	    store8888 (dst, over (vsrc, vsrca, load8888 (dst)));
1432	}
1433    }
1434
1435    _mm_empty ();
1436}
1437
1438static void
1439mmx_composite_over_n_0565 (pixman_implementation_t *imp,
1440                           pixman_composite_info_t *info)
1441{
1442    PIXMAN_COMPOSITE_ARGS (info);
1443    uint32_t src;
1444    uint16_t    *dst_line, *dst;
1445    int32_t w;
1446    int dst_stride;
1447    __m64 vsrc, vsrca;
1448
1449    CHECKPOINT ();
1450
1451    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
1452
1453    if (src == 0)
1454	return;
1455
1456    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
1457
1458    vsrc = load8888 (&src);
1459    vsrca = expand_alpha (vsrc);
1460
1461    while (height--)
1462    {
1463	dst = dst_line;
1464	dst_line += dst_stride;
1465	w = width;
1466
1467	CHECKPOINT ();
1468
1469	while (w && (uintptr_t)dst & 7)
1470	{
1471	    uint64_t d = *dst;
1472	    __m64 vdest = expand565 (to_m64 (d), 0);
1473
1474	    vdest = pack_565 (over (vsrc, vsrca, vdest), vdest, 0);
1475	    *dst = to_uint64 (vdest);
1476
1477	    w--;
1478	    dst++;
1479	}
1480
1481	while (w >= 4)
1482	{
1483	    __m64 vdest = *(__m64 *)dst;
1484	    __m64 v0, v1, v2, v3;
1485
1486	    expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0);
1487
1488	    v0 = over (vsrc, vsrca, v0);
1489	    v1 = over (vsrc, vsrca, v1);
1490	    v2 = over (vsrc, vsrca, v2);
1491	    v3 = over (vsrc, vsrca, v3);
1492
1493	    *(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);
1494
1495	    dst += 4;
1496	    w -= 4;
1497	}
1498
1499	CHECKPOINT ();
1500
1501	while (w)
1502	{
1503	    uint64_t d = *dst;
1504	    __m64 vdest = expand565 (to_m64 (d), 0);
1505
1506	    vdest = pack_565 (over (vsrc, vsrca, vdest), vdest, 0);
1507	    *dst = to_uint64 (vdest);
1508
1509	    w--;
1510	    dst++;
1511	}
1512    }
1513
1514    _mm_empty ();
1515}
1516
1517static void
1518mmx_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
1519                                   pixman_composite_info_t *info)
1520{
1521    PIXMAN_COMPOSITE_ARGS (info);
1522    uint32_t src;
1523    uint32_t    *dst_line;
1524    uint32_t    *mask_line;
1525    int dst_stride, mask_stride;
1526    __m64 vsrc, vsrca;
1527
1528    CHECKPOINT ();
1529
1530    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
1531
1532    if (src == 0)
1533	return;
1534
1535    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
1536    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
1537
1538    vsrc = load8888 (&src);
1539    vsrca = expand_alpha (vsrc);
1540
1541    while (height--)
1542    {
1543	int twidth = width;
1544	uint32_t *p = (uint32_t *)mask_line;
1545	uint32_t *q = (uint32_t *)dst_line;
1546
1547	while (twidth && (uintptr_t)q & 7)
1548	{
1549	    uint32_t m = *(uint32_t *)p;
1550
1551	    if (m)
1552	    {
1553		__m64 vdest = load8888 (q);
1554		vdest = in_over (vsrc, vsrca, load8888 (&m), vdest);
1555		store8888 (q, vdest);
1556	    }
1557
1558	    twidth--;
1559	    p++;
1560	    q++;
1561	}
1562
1563	while (twidth >= 2)
1564	{
1565	    uint32_t m0, m1;
1566	    m0 = *p;
1567	    m1 = *(p + 1);
1568
1569	    if (m0 | m1)
1570	    {
1571		__m64 dest0, dest1;
1572		__m64 vdest = *(__m64 *)q;
1573
1574		dest0 = in_over (vsrc, vsrca, load8888 (&m0),
1575		                 expand8888 (vdest, 0));
1576		dest1 = in_over (vsrc, vsrca, load8888 (&m1),
1577		                 expand8888 (vdest, 1));
1578
1579		*(__m64 *)q = pack8888 (dest0, dest1);
1580	    }
1581
1582	    p += 2;
1583	    q += 2;
1584	    twidth -= 2;
1585	}
1586
1587	if (twidth)
1588	{
1589	    uint32_t m = *(uint32_t *)p;
1590
1591	    if (m)
1592	    {
1593		__m64 vdest = load8888 (q);
1594		vdest = in_over (vsrc, vsrca, load8888 (&m), vdest);
1595		store8888 (q, vdest);
1596	    }
1597
1598	    twidth--;
1599	    p++;
1600	    q++;
1601	}
1602
1603	dst_line += dst_stride;
1604	mask_line += mask_stride;
1605    }
1606
1607    _mm_empty ();
1608}
1609
1610static void
1611mmx_composite_over_8888_n_8888 (pixman_implementation_t *imp,
1612                                pixman_composite_info_t *info)
1613{
1614    PIXMAN_COMPOSITE_ARGS (info);
1615    uint32_t    *dst_line, *dst;
1616    uint32_t    *src_line, *src;
1617    uint32_t mask;
1618    __m64 vmask;
1619    int dst_stride, src_stride;
1620    int32_t w;
1621
1622    CHECKPOINT ();
1623
1624    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
1625    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
1626
1627    mask = _pixman_image_get_solid (imp, mask_image, dest_image->bits.format);
1628    vmask = expand_alpha (load8888 (&mask));
1629
1630    while (height--)
1631    {
1632	dst = dst_line;
1633	dst_line += dst_stride;
1634	src = src_line;
1635	src_line += src_stride;
1636	w = width;
1637
1638	while (w && (uintptr_t)dst & 7)
1639	{
1640	    __m64 s = load8888 (src);
1641	    __m64 d = load8888 (dst);
1642
1643	    store8888 (dst, in_over (s, expand_alpha (s), vmask, d));
1644
1645	    w--;
1646	    dst++;
1647	    src++;
1648	}
1649
1650	while (w >= 2)
1651	{
1652	    __m64 vs = ldq_u ((__m64 *)src);
1653	    __m64 vd = *(__m64 *)dst;
1654	    __m64 vsrc0 = expand8888 (vs, 0);
1655	    __m64 vsrc1 = expand8888 (vs, 1);
1656
1657	    *(__m64 *)dst = pack8888 (
1658	        in_over (vsrc0, expand_alpha (vsrc0), vmask, expand8888 (vd, 0)),
1659	        in_over (vsrc1, expand_alpha (vsrc1), vmask, expand8888 (vd, 1)));
1660
1661	    w -= 2;
1662	    dst += 2;
1663	    src += 2;
1664	}
1665
1666	if (w)
1667	{
1668	    __m64 s = load8888 (src);
1669	    __m64 d = load8888 (dst);
1670
1671	    store8888 (dst, in_over (s, expand_alpha (s), vmask, d));
1672	}
1673    }
1674
1675    _mm_empty ();
1676}
1677
1678static void
1679mmx_composite_over_x888_n_8888 (pixman_implementation_t *imp,
1680                                pixman_composite_info_t *info)
1681{
1682    PIXMAN_COMPOSITE_ARGS (info);
1683    uint32_t *dst_line, *dst;
1684    uint32_t *src_line, *src;
1685    uint32_t mask;
1686    __m64 vmask;
1687    int dst_stride, src_stride;
1688    int32_t w;
1689    __m64 srca;
1690
1691    CHECKPOINT ();
1692
1693    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
1694    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
1695    mask = _pixman_image_get_solid (imp, mask_image, dest_image->bits.format);
1696
1697    vmask = expand_alpha (load8888 (&mask));
1698    srca = MC (4x00ff);
1699
1700    while (height--)
1701    {
1702	dst = dst_line;
1703	dst_line += dst_stride;
1704	src = src_line;
1705	src_line += src_stride;
1706	w = width;
1707
1708	while (w && (uintptr_t)dst & 7)
1709	{
1710	    uint32_t ssrc = *src | 0xff000000;
1711	    __m64 s = load8888 (&ssrc);
1712	    __m64 d = load8888 (dst);
1713
1714	    store8888 (dst, in_over (s, srca, vmask, d));
1715
1716	    w--;
1717	    dst++;
1718	    src++;
1719	}
1720
1721	while (w >= 16)
1722	{
1723	    __m64 vd0 = *(__m64 *)(dst + 0);
1724	    __m64 vd1 = *(__m64 *)(dst + 2);
1725	    __m64 vd2 = *(__m64 *)(dst + 4);
1726	    __m64 vd3 = *(__m64 *)(dst + 6);
1727	    __m64 vd4 = *(__m64 *)(dst + 8);
1728	    __m64 vd5 = *(__m64 *)(dst + 10);
1729	    __m64 vd6 = *(__m64 *)(dst + 12);
1730	    __m64 vd7 = *(__m64 *)(dst + 14);
1731
1732	    __m64 vs0 = ldq_u ((__m64 *)(src + 0));
1733	    __m64 vs1 = ldq_u ((__m64 *)(src + 2));
1734	    __m64 vs2 = ldq_u ((__m64 *)(src + 4));
1735	    __m64 vs3 = ldq_u ((__m64 *)(src + 6));
1736	    __m64 vs4 = ldq_u ((__m64 *)(src + 8));
1737	    __m64 vs5 = ldq_u ((__m64 *)(src + 10));
1738	    __m64 vs6 = ldq_u ((__m64 *)(src + 12));
1739	    __m64 vs7 = ldq_u ((__m64 *)(src + 14));
1740
1741	    vd0 = pack8888 (
1742	        in_over (expandx888 (vs0, 0), srca, vmask, expand8888 (vd0, 0)),
1743	        in_over (expandx888 (vs0, 1), srca, vmask, expand8888 (vd0, 1)));
1744
1745	    vd1 = pack8888 (
1746	        in_over (expandx888 (vs1, 0), srca, vmask, expand8888 (vd1, 0)),
1747	        in_over (expandx888 (vs1, 1), srca, vmask, expand8888 (vd1, 1)));
1748
1749	    vd2 = pack8888 (
1750	        in_over (expandx888 (vs2, 0), srca, vmask, expand8888 (vd2, 0)),
1751	        in_over (expandx888 (vs2, 1), srca, vmask, expand8888 (vd2, 1)));
1752
1753	    vd3 = pack8888 (
1754	        in_over (expandx888 (vs3, 0), srca, vmask, expand8888 (vd3, 0)),
1755	        in_over (expandx888 (vs3, 1), srca, vmask, expand8888 (vd3, 1)));
1756
1757	    vd4 = pack8888 (
1758	        in_over (expandx888 (vs4, 0), srca, vmask, expand8888 (vd4, 0)),
1759	        in_over (expandx888 (vs4, 1), srca, vmask, expand8888 (vd4, 1)));
1760
1761	    vd5 = pack8888 (
1762	        in_over (expandx888 (vs5, 0), srca, vmask, expand8888 (vd5, 0)),
1763	        in_over (expandx888 (vs5, 1), srca, vmask, expand8888 (vd5, 1)));
1764
1765	    vd6 = pack8888 (
1766	        in_over (expandx888 (vs6, 0), srca, vmask, expand8888 (vd6, 0)),
1767	        in_over (expandx888 (vs6, 1), srca, vmask, expand8888 (vd6, 1)));
1768
1769	    vd7 = pack8888 (
1770	        in_over (expandx888 (vs7, 0), srca, vmask, expand8888 (vd7, 0)),
1771	        in_over (expandx888 (vs7, 1), srca, vmask, expand8888 (vd7, 1)));
1772
1773	    *(__m64 *)(dst + 0) = vd0;
1774	    *(__m64 *)(dst + 2) = vd1;
1775	    *(__m64 *)(dst + 4) = vd2;
1776	    *(__m64 *)(dst + 6) = vd3;
1777	    *(__m64 *)(dst + 8) = vd4;
1778	    *(__m64 *)(dst + 10) = vd5;
1779	    *(__m64 *)(dst + 12) = vd6;
1780	    *(__m64 *)(dst + 14) = vd7;
1781
1782	    w -= 16;
1783	    dst += 16;
1784	    src += 16;
1785	}
1786
1787	while (w)
1788	{
1789	    uint32_t ssrc = *src | 0xff000000;
1790	    __m64 s = load8888 (&ssrc);
1791	    __m64 d = load8888 (dst);
1792
1793	    store8888 (dst, in_over (s, srca, vmask, d));
1794
1795	    w--;
1796	    dst++;
1797	    src++;
1798	}
1799    }
1800
1801    _mm_empty ();
1802}
1803
1804static void
1805mmx_composite_over_8888_8888 (pixman_implementation_t *imp,
1806                              pixman_composite_info_t *info)
1807{
1808    PIXMAN_COMPOSITE_ARGS (info);
1809    uint32_t *dst_line, *dst;
1810    uint32_t *src_line, *src;
1811    uint32_t s;
1812    int dst_stride, src_stride;
1813    uint8_t a;
1814    int32_t w;
1815
1816    CHECKPOINT ();
1817
1818    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
1819    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
1820
1821    while (height--)
1822    {
1823	dst = dst_line;
1824	dst_line += dst_stride;
1825	src = src_line;
1826	src_line += src_stride;
1827	w = width;
1828
1829	while (w--)
1830	{
1831	    s = *src++;
1832	    a = s >> 24;
1833
1834	    if (a == 0xff)
1835	    {
1836		*dst = s;
1837	    }
1838	    else if (s)
1839	    {
1840		__m64 ms, sa;
1841		ms = load8888 (&s);
1842		sa = expand_alpha (ms);
1843		store8888 (dst, over (ms, sa, load8888 (dst)));
1844	    }
1845
1846	    dst++;
1847	}
1848    }
1849    _mm_empty ();
1850}
1851
1852static void
1853mmx_composite_over_8888_0565 (pixman_implementation_t *imp,
1854                              pixman_composite_info_t *info)
1855{
1856    PIXMAN_COMPOSITE_ARGS (info);
1857    uint16_t    *dst_line, *dst;
1858    uint32_t    *src_line, *src;
1859    int dst_stride, src_stride;
1860    int32_t w;
1861
1862    CHECKPOINT ();
1863
1864    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
1865    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
1866
1867#if 0
1868    /* FIXME */
1869    assert (src_image->drawable == mask_image->drawable);
1870#endif
1871
1872    while (height--)
1873    {
1874	dst = dst_line;
1875	dst_line += dst_stride;
1876	src = src_line;
1877	src_line += src_stride;
1878	w = width;
1879
1880	CHECKPOINT ();
1881
1882	while (w && (uintptr_t)dst & 7)
1883	{
1884	    __m64 vsrc = load8888 (src);
1885	    uint64_t d = *dst;
1886	    __m64 vdest = expand565 (to_m64 (d), 0);
1887
1888	    vdest = pack_565 (
1889		over (vsrc, expand_alpha (vsrc), vdest), vdest, 0);
1890
1891	    *dst = to_uint64 (vdest);
1892
1893	    w--;
1894	    dst++;
1895	    src++;
1896	}
1897
1898	CHECKPOINT ();
1899
1900	while (w >= 4)
1901	{
1902	    __m64 vdest = *(__m64 *)dst;
1903	    __m64 v0, v1, v2, v3;
1904	    __m64 vsrc0, vsrc1, vsrc2, vsrc3;
1905
1906	    expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0);
1907
1908	    vsrc0 = load8888 ((src + 0));
1909	    vsrc1 = load8888 ((src + 1));
1910	    vsrc2 = load8888 ((src + 2));
1911	    vsrc3 = load8888 ((src + 3));
1912
1913	    v0 = over (vsrc0, expand_alpha (vsrc0), v0);
1914	    v1 = over (vsrc1, expand_alpha (vsrc1), v1);
1915	    v2 = over (vsrc2, expand_alpha (vsrc2), v2);
1916	    v3 = over (vsrc3, expand_alpha (vsrc3), v3);
1917
1918	    *(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);
1919
1920	    w -= 4;
1921	    dst += 4;
1922	    src += 4;
1923	}
1924
1925	CHECKPOINT ();
1926
1927	while (w)
1928	{
1929	    __m64 vsrc = load8888 (src);
1930	    uint64_t d = *dst;
1931	    __m64 vdest = expand565 (to_m64 (d), 0);
1932
1933	    vdest = pack_565 (over (vsrc, expand_alpha (vsrc), vdest), vdest, 0);
1934
1935	    *dst = to_uint64 (vdest);
1936
1937	    w--;
1938	    dst++;
1939	    src++;
1940	}
1941    }
1942
1943    _mm_empty ();
1944}
1945
1946static void
1947mmx_composite_over_n_8_8888 (pixman_implementation_t *imp,
1948                             pixman_composite_info_t *info)
1949{
1950    PIXMAN_COMPOSITE_ARGS (info);
1951    uint32_t src, srca;
1952    uint32_t *dst_line, *dst;
1953    uint8_t *mask_line, *mask;
1954    int dst_stride, mask_stride;
1955    int32_t w;
1956    __m64 vsrc, vsrca;
1957    uint64_t srcsrc;
1958
1959    CHECKPOINT ();
1960
1961    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
1962
1963    srca = src >> 24;
1964    if (src == 0)
1965	return;
1966
1967    srcsrc = (uint64_t)src << 32 | src;
1968
1969    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
1970    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
1971
1972    vsrc = load8888 (&src);
1973    vsrca = expand_alpha (vsrc);
1974
1975    while (height--)
1976    {
1977	dst = dst_line;
1978	dst_line += dst_stride;
1979	mask = mask_line;
1980	mask_line += mask_stride;
1981	w = width;
1982
1983	CHECKPOINT ();
1984
1985	while (w && (uintptr_t)dst & 7)
1986	{
1987	    uint64_t m = *mask;
1988
1989	    if (m)
1990	    {
1991		__m64 vdest = in_over (vsrc, vsrca,
1992				       expand_alpha_rev (to_m64 (m)),
1993				       load8888 (dst));
1994
1995		store8888 (dst, vdest);
1996	    }
1997
1998	    w--;
1999	    mask++;
2000	    dst++;
2001	}
2002
2003	CHECKPOINT ();
2004
2005	while (w >= 2)
2006	{
2007	    uint64_t m0, m1;
2008
2009	    m0 = *mask;
2010	    m1 = *(mask + 1);
2011
2012	    if (srca == 0xff && (m0 & m1) == 0xff)
2013	    {
2014		*(uint64_t *)dst = srcsrc;
2015	    }
2016	    else if (m0 | m1)
2017	    {
2018		__m64 vdest;
2019		__m64 dest0, dest1;
2020
2021		vdest = *(__m64 *)dst;
2022
2023		dest0 = in_over (vsrc, vsrca, expand_alpha_rev (to_m64 (m0)),
2024				 expand8888 (vdest, 0));
2025		dest1 = in_over (vsrc, vsrca, expand_alpha_rev (to_m64 (m1)),
2026				 expand8888 (vdest, 1));
2027
2028		*(__m64 *)dst = pack8888 (dest0, dest1);
2029	    }
2030
2031	    mask += 2;
2032	    dst += 2;
2033	    w -= 2;
2034	}
2035
2036	CHECKPOINT ();
2037
2038	if (w)
2039	{
2040	    uint64_t m = *mask;
2041
2042	    if (m)
2043	    {
2044		__m64 vdest = load8888 (dst);
2045
2046		vdest = in_over (
2047		    vsrc, vsrca, expand_alpha_rev (to_m64 (m)), vdest);
2048		store8888 (dst, vdest);
2049	    }
2050	}
2051    }
2052
2053    _mm_empty ();
2054}
2055
2056static pixman_bool_t
2057mmx_fill (pixman_implementation_t *imp,
2058          uint32_t *               bits,
2059          int                      stride,
2060          int                      bpp,
2061          int                      x,
2062          int                      y,
2063          int                      width,
2064          int                      height,
2065          uint32_t		   filler)
2066{
2067    uint64_t fill;
2068    __m64 vfill;
2069    uint32_t byte_width;
2070    uint8_t     *byte_line;
2071
2072#if defined __GNUC__ && defined USE_X86_MMX
2073    __m64 v1, v2, v3, v4, v5, v6, v7;
2074#endif
2075
2076    if (bpp != 16 && bpp != 32 && bpp != 8)
2077	return FALSE;
2078
2079    if (bpp == 8)
2080    {
2081	stride = stride * (int) sizeof (uint32_t) / 1;
2082	byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x);
2083	byte_width = width;
2084	stride *= 1;
2085        filler = (filler & 0xff) * 0x01010101;
2086    }
2087    else if (bpp == 16)
2088    {
2089	stride = stride * (int) sizeof (uint32_t) / 2;
2090	byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
2091	byte_width = 2 * width;
2092	stride *= 2;
2093        filler = (filler & 0xffff) * 0x00010001;
2094    }
2095    else
2096    {
2097	stride = stride * (int) sizeof (uint32_t) / 4;
2098	byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x);
2099	byte_width = 4 * width;
2100	stride *= 4;
2101    }
2102
2103    fill = ((uint64_t)filler << 32) | filler;
2104    vfill = to_m64 (fill);
2105
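    /* Copy the fill value into seven more MMX registers so that the
     * unrolled loop below can store 64 bytes per iteration straight from
     * registers. */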
2106#if defined __GNUC__ && defined USE_X86_MMX
2107    __asm__ (
2108        "movq		%7,	%0\n"
2109        "movq		%7,	%1\n"
2110        "movq		%7,	%2\n"
2111        "movq		%7,	%3\n"
2112        "movq		%7,	%4\n"
2113        "movq		%7,	%5\n"
2114        "movq		%7,	%6\n"
2115	: "=&y" (v1), "=&y" (v2), "=&y" (v3),
2116	  "=&y" (v4), "=&y" (v5), "=&y" (v6), "=y" (v7)
2117	: "y" (vfill));
2118#endif
2119
2120    while (height--)
2121    {
2122	int w;
2123	uint8_t *d = byte_line;
2124
2125	byte_line += stride;
2126	w = byte_width;
2127
2128	if (w >= 1 && ((uintptr_t)d & 1))
2129	{
2130	    *(uint8_t *)d = (filler & 0xff);
2131	    w--;
2132	    d++;
2133	}
2134
2135	if (w >= 2 && ((uintptr_t)d & 3))
2136	{
2137	    *(uint16_t *)d = filler;
2138	    w -= 2;
2139	    d += 2;
2140	}
2141
2142	while (w >= 4 && ((uintptr_t)d & 7))
2143	{
2144	    *(uint32_t *)d = filler;
2145
2146	    w -= 4;
2147	    d += 4;
2148	}
2149
2150	while (w >= 64)
2151	{
2152#if defined __GNUC__ && defined USE_X86_MMX
2153	    __asm__ (
2154	        "movq	%1,	  (%0)\n"
2155	        "movq	%2,	 8(%0)\n"
2156	        "movq	%3,	16(%0)\n"
2157	        "movq	%4,	24(%0)\n"
2158	        "movq	%5,	32(%0)\n"
2159	        "movq	%6,	40(%0)\n"
2160	        "movq	%7,	48(%0)\n"
2161	        "movq	%8,	56(%0)\n"
2162		:
2163		: "r" (d),
2164		  "y" (vfill), "y" (v1), "y" (v2), "y" (v3),
2165		  "y" (v4), "y" (v5), "y" (v6), "y" (v7)
2166		: "memory");
2167#else
2168	    *(__m64*) (d +  0) = vfill;
2169	    *(__m64*) (d +  8) = vfill;
2170	    *(__m64*) (d + 16) = vfill;
2171	    *(__m64*) (d + 24) = vfill;
2172	    *(__m64*) (d + 32) = vfill;
2173	    *(__m64*) (d + 40) = vfill;
2174	    *(__m64*) (d + 48) = vfill;
2175	    *(__m64*) (d + 56) = vfill;
2176#endif
2177	    w -= 64;
2178	    d += 64;
2179	}
2180
2181	while (w >= 4)
2182	{
2183	    *(uint32_t *)d = filler;
2184
2185	    w -= 4;
2186	    d += 4;
2187	}
2188	if (w >= 2)
2189	{
2190	    *(uint16_t *)d = filler;
2191	    w -= 2;
2192	    d += 2;
2193	}
2194	if (w >= 1)
2195	{
2196	    *(uint8_t *)d = (filler & 0xff);
2197	    w--;
2198	    d++;
2199	}
2200
2201    }
2202
2203    _mm_empty ();
2204    return TRUE;
2205}
2206
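/* SRC conversion from [ax]8r8g8b8 (or the BGR equivalents) to r5g6b5.  The
 * alpha byte is ignored; convert_8888_to_0565() is assumed to do the usual
 * truncating reduction
 *
 *     r5g6b5 = ((r >> 3) << 11) | ((g >> 2) << 5) | (b >> 3)
 *
 * and the inner loop converts four pixels at a time with pack_4xpacked565().
 */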
2207static void
2208mmx_composite_src_x888_0565 (pixman_implementation_t *imp,
2209                             pixman_composite_info_t *info)
2210{
2211    PIXMAN_COMPOSITE_ARGS (info);
2212    uint16_t    *dst_line, *dst;
2213    uint32_t    *src_line, *src, s;
2214    int dst_stride, src_stride;
2215    int32_t w;
2216
2217    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2218    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
2219
2220    while (height--)
2221    {
2222	dst = dst_line;
2223	dst_line += dst_stride;
2224	src = src_line;
2225	src_line += src_stride;
2226	w = width;
2227
2228	while (w && (uintptr_t)dst & 7)
2229	{
2230	    s = *src++;
2231	    *dst = convert_8888_to_0565 (s);
2232	    dst++;
2233	    w--;
2234	}
2235
2236	while (w >= 4)
2237	{
2238	    __m64 vdest;
2239	    __m64 vsrc0 = ldq_u ((__m64 *)(src + 0));
2240	    __m64 vsrc1 = ldq_u ((__m64 *)(src + 2));
2241
2242	    vdest = pack_4xpacked565 (vsrc0, vsrc1);
2243
2244	    *(__m64 *)dst = vdest;
2245
2246	    w -= 4;
2247	    src += 4;
2248	    dst += 4;
2249	}
2250
2251	while (w)
2252	{
2253	    s = *src++;
2254	    *dst = convert_8888_to_0565 (s);
2255	    dst++;
2256	    w--;
2257	}
2258    }
2259
2260    _mm_empty ();
2261}
2262
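/* SRC with a solid source and an a8 mask: dest = src IN mask, so pixels with
 * a zero mask byte are cleared rather than left untouched.  An all-zero
 * source degenerates into a zero fill, which is handed to mmx_fill() above.
 */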
2263static void
2264mmx_composite_src_n_8_8888 (pixman_implementation_t *imp,
2265                            pixman_composite_info_t *info)
2266{
2267    PIXMAN_COMPOSITE_ARGS (info);
2268    uint32_t src, srca;
2269    uint32_t    *dst_line, *dst;
2270    uint8_t     *mask_line, *mask;
2271    int dst_stride, mask_stride;
2272    int32_t w;
2273    __m64 vsrc;
2274    uint64_t srcsrc;
2275
2276    CHECKPOINT ();
2277
2278    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2279
2280    srca = src >> 24;
2281    if (src == 0)
2282    {
2283	mmx_fill (imp, dest_image->bits.bits, dest_image->bits.rowstride,
2284		  PIXMAN_FORMAT_BPP (dest_image->bits.format),
2285		  dest_x, dest_y, width, height, 0);
2286	return;
2287    }
2288
2289    srcsrc = (uint64_t)src << 32 | src;
2290
2291    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2292    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
2293
2294    vsrc = load8888 (&src);
2295
2296    while (height--)
2297    {
2298	dst = dst_line;
2299	dst_line += dst_stride;
2300	mask = mask_line;
2301	mask_line += mask_stride;
2302	w = width;
2303
2304	CHECKPOINT ();
2305
2306	while (w && (uintptr_t)dst & 7)
2307	{
2308	    uint64_t m = *mask;
2309
2310	    if (m)
2311	    {
2312		__m64 vdest = in (vsrc, expand_alpha_rev (to_m64 (m)));
2313
2314		store8888 (dst, vdest);
2315	    }
2316	    else
2317	    {
2318		*dst = 0;
2319	    }
2320
2321	    w--;
2322	    mask++;
2323	    dst++;
2324	}
2325
2326	CHECKPOINT ();
2327
2328	while (w >= 2)
2329	{
2330	    uint64_t m0, m1;
2331	    m0 = *mask;
2332	    m1 = *(mask + 1);
2333
2334	    if (srca == 0xff && (m0 & m1) == 0xff)
2335	    {
2336		*(uint64_t *)dst = srcsrc;
2337	    }
2338	    else if (m0 | m1)
2339	    {
2340		__m64 dest0, dest1;
2341
2342		dest0 = in (vsrc, expand_alpha_rev (to_m64 (m0)));
2343		dest1 = in (vsrc, expand_alpha_rev (to_m64 (m1)));
2344
2345		*(__m64 *)dst = pack8888 (dest0, dest1);
2346	    }
2347	    else
2348	    {
2349		*(uint64_t *)dst = 0;
2350	    }
2351
2352	    mask += 2;
2353	    dst += 2;
2354	    w -= 2;
2355	}
2356
2357	CHECKPOINT ();
2358
2359	if (w)
2360	{
2361	    uint64_t m = *mask;
2362
2363	    if (m)
2364	    {
		__m64 vdest = in (vsrc, expand_alpha_rev (to_m64 (m)));

2368		store8888 (dst, vdest);
2369	    }
2370	    else
2371	    {
2372		*dst = 0;
2373	    }
2374	}
2375    }
2376
2377    _mm_empty ();
2378}
2379
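/* OVER with a solid source and an a8 mask onto r5g6b5.  Each destination
 * pixel is expanded to 8 bits per channel, combined with
 * in_over (src, srca, mask, dest) -- assumed to compute
 * OVER (src IN mask, dest) as in the 8888 paths -- and packed back to 565.
 * The four-pixel loop additionally short-circuits fully opaque spans
 * (srca == 0xff and all four mask bytes 0xff) by storing four pre-packed
 * copies of the source directly.
 */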
2380static void
2381mmx_composite_over_n_8_0565 (pixman_implementation_t *imp,
2382                             pixman_composite_info_t *info)
2383{
2384    PIXMAN_COMPOSITE_ARGS (info);
2385    uint32_t src, srca;
2386    uint16_t *dst_line, *dst;
2387    uint8_t *mask_line, *mask;
2388    int dst_stride, mask_stride;
2389    int32_t w;
2390    __m64 vsrc, vsrca, tmp;
2391    __m64 srcsrcsrcsrc;
2392
2393    CHECKPOINT ();
2394
2395    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2396
2397    srca = src >> 24;
2398    if (src == 0)
2399	return;
2400
2401    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
2402    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
2403
2404    vsrc = load8888 (&src);
2405    vsrca = expand_alpha (vsrc);
2406
2407    tmp = pack_565 (vsrc, _mm_setzero_si64 (), 0);
2408    srcsrcsrcsrc = expand_alpha_rev (tmp);
2409
2410    while (height--)
2411    {
2412	dst = dst_line;
2413	dst_line += dst_stride;
2414	mask = mask_line;
2415	mask_line += mask_stride;
2416	w = width;
2417
2418	CHECKPOINT ();
2419
2420	while (w && (uintptr_t)dst & 7)
2421	{
2422	    uint64_t m = *mask;
2423
2424	    if (m)
2425	    {
2426		uint64_t d = *dst;
2427		__m64 vd = to_m64 (d);
2428		__m64 vdest = in_over (
2429		    vsrc, vsrca, expand_alpha_rev (to_m64 (m)), expand565 (vd, 0));
2430
2431		vd = pack_565 (vdest, _mm_setzero_si64 (), 0);
2432		*dst = to_uint64 (vd);
2433	    }
2434
2435	    w--;
2436	    mask++;
2437	    dst++;
2438	}
2439
2440	CHECKPOINT ();
2441
2442	while (w >= 4)
2443	{
2444	    uint64_t m0, m1, m2, m3;
2445	    m0 = *mask;
2446	    m1 = *(mask + 1);
2447	    m2 = *(mask + 2);
2448	    m3 = *(mask + 3);
2449
2450	    if (srca == 0xff && (m0 & m1 & m2 & m3) == 0xff)
2451	    {
2452		*(__m64 *)dst = srcsrcsrcsrc;
2453	    }
2454	    else if (m0 | m1 | m2 | m3)
2455	    {
2456		__m64 vdest = *(__m64 *)dst;
2457		__m64 v0, v1, v2, v3;
2458		__m64 vm0, vm1, vm2, vm3;
2459
2460		expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0);
2461
2462		vm0 = to_m64 (m0);
2463		v0 = in_over (vsrc, vsrca, expand_alpha_rev (vm0), v0);
2464
2465		vm1 = to_m64 (m1);
2466		v1 = in_over (vsrc, vsrca, expand_alpha_rev (vm1), v1);
2467
2468		vm2 = to_m64 (m2);
2469		v2 = in_over (vsrc, vsrca, expand_alpha_rev (vm2), v2);
2470
2471		vm3 = to_m64 (m3);
2472		v3 = in_over (vsrc, vsrca, expand_alpha_rev (vm3), v3);
2473
		*(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);
2475	    }
2476
2477	    w -= 4;
2478	    mask += 4;
2479	    dst += 4;
2480	}
2481
2482	CHECKPOINT ();
2483
2484	while (w)
2485	{
2486	    uint64_t m = *mask;
2487
2488	    if (m)
2489	    {
2490		uint64_t d = *dst;
2491		__m64 vd = to_m64 (d);
2492		__m64 vdest = in_over (vsrc, vsrca, expand_alpha_rev (to_m64 (m)),
2493				       expand565 (vd, 0));
2494		vd = pack_565 (vdest, _mm_setzero_si64 (), 0);
2495		*dst = to_uint64 (vd);
2496	    }
2497
2498	    w--;
2499	    mask++;
2500	    dst++;
2501	}
2502    }
2503
2504    _mm_empty ();
2505}
2506
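/* OVER from a "pixbuf" source onto r5g6b5.  The pixbuf fast paths appear to
 * handle a non-premultiplied source whose colour channels are stored in the
 * opposite order from the destination: over_rev_non_pre() swaps R and B,
 * premultiplies by the source alpha and then does a normal OVER, while fully
 * opaque four-pixel groups only need the channel swap before packing to 565.
 */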
2507static void
2508mmx_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
2509                                pixman_composite_info_t *info)
2510{
2511    PIXMAN_COMPOSITE_ARGS (info);
2512    uint16_t    *dst_line, *dst;
2513    uint32_t    *src_line, *src;
2514    int dst_stride, src_stride;
2515    int32_t w;
2516
2517    CHECKPOINT ();
2518
2519    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
2520    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2521
2522#if 0
2523    /* FIXME */
2524    assert (src_image->drawable == mask_image->drawable);
2525#endif
2526
2527    while (height--)
2528    {
2529	dst = dst_line;
2530	dst_line += dst_stride;
2531	src = src_line;
2532	src_line += src_stride;
2533	w = width;
2534
2535	CHECKPOINT ();
2536
2537	while (w && (uintptr_t)dst & 7)
2538	{
2539	    __m64 vsrc = load8888 (src);
2540	    uint64_t d = *dst;
2541	    __m64 vdest = expand565 (to_m64 (d), 0);
2542
2543	    vdest = pack_565 (over_rev_non_pre (vsrc, vdest), vdest, 0);
2544
2545	    *dst = to_uint64 (vdest);
2546
2547	    w--;
2548	    dst++;
2549	    src++;
2550	}
2551
2552	CHECKPOINT ();
2553
2554	while (w >= 4)
2555	{
2556	    uint32_t s0, s1, s2, s3;
2557	    unsigned char a0, a1, a2, a3;
2558
2559	    s0 = *src;
2560	    s1 = *(src + 1);
2561	    s2 = *(src + 2);
2562	    s3 = *(src + 3);
2563
2564	    a0 = (s0 >> 24);
2565	    a1 = (s1 >> 24);
2566	    a2 = (s2 >> 24);
2567	    a3 = (s3 >> 24);
2568
2569	    if ((a0 & a1 & a2 & a3) == 0xFF)
2570	    {
2571		__m64 v0 = invert_colors (load8888 (&s0));
2572		__m64 v1 = invert_colors (load8888 (&s1));
2573		__m64 v2 = invert_colors (load8888 (&s2));
2574		__m64 v3 = invert_colors (load8888 (&s3));
2575
2576		*(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);
2577	    }
2578	    else if (s0 | s1 | s2 | s3)
2579	    {
2580		__m64 vdest = *(__m64 *)dst;
2581		__m64 v0, v1, v2, v3;
2582
2583		__m64 vsrc0 = load8888 (&s0);
2584		__m64 vsrc1 = load8888 (&s1);
2585		__m64 vsrc2 = load8888 (&s2);
2586		__m64 vsrc3 = load8888 (&s3);
2587
2588		expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0);
2589
2590		v0 = over_rev_non_pre (vsrc0, v0);
2591		v1 = over_rev_non_pre (vsrc1, v1);
2592		v2 = over_rev_non_pre (vsrc2, v2);
2593		v3 = over_rev_non_pre (vsrc3, v3);
2594
2595		*(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);
2596	    }
2597
2598	    w -= 4;
2599	    dst += 4;
2600	    src += 4;
2601	}
2602
2603	CHECKPOINT ();
2604
2605	while (w)
2606	{
2607	    __m64 vsrc = load8888 (src);
2608	    uint64_t d = *dst;
2609	    __m64 vdest = expand565 (to_m64 (d), 0);
2610
2611	    vdest = pack_565 (over_rev_non_pre (vsrc, vdest), vdest, 0);
2612
2613	    *dst = to_uint64 (vdest);
2614
2615	    w--;
2616	    dst++;
2617	    src++;
2618	}
2619    }
2620
2621    _mm_empty ();
2622}
2623
2624static void
2625mmx_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
2626                                pixman_composite_info_t *info)
2627{
2628    PIXMAN_COMPOSITE_ARGS (info);
2629    uint32_t    *dst_line, *dst;
2630    uint32_t    *src_line, *src;
2631    int dst_stride, src_stride;
2632    int32_t w;
2633
2634    CHECKPOINT ();
2635
2636    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2637    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2638
2639#if 0
2640    /* FIXME */
2641    assert (src_image->drawable == mask_image->drawable);
2642#endif
2643
2644    while (height--)
2645    {
2646	dst = dst_line;
2647	dst_line += dst_stride;
2648	src = src_line;
2649	src_line += src_stride;
2650	w = width;
2651
2652	while (w && (uintptr_t)dst & 7)
2653	{
2654	    __m64 s = load8888 (src);
2655	    __m64 d = load8888 (dst);
2656
2657	    store8888 (dst, over_rev_non_pre (s, d));
2658
2659	    w--;
2660	    dst++;
2661	    src++;
2662	}
2663
2664	while (w >= 2)
2665	{
2666	    uint32_t s0, s1;
2667	    unsigned char a0, a1;
2668	    __m64 d0, d1;
2669
2670	    s0 = *src;
2671	    s1 = *(src + 1);
2672
2673	    a0 = (s0 >> 24);
2674	    a1 = (s1 >> 24);
2675
2676	    if ((a0 & a1) == 0xFF)
2677	    {
2678		d0 = invert_colors (load8888 (&s0));
2679		d1 = invert_colors (load8888 (&s1));
2680
2681		*(__m64 *)dst = pack8888 (d0, d1);
2682	    }
2683	    else if (s0 | s1)
2684	    {
2685		__m64 vdest = *(__m64 *)dst;
2686
2687		d0 = over_rev_non_pre (load8888 (&s0), expand8888 (vdest, 0));
2688		d1 = over_rev_non_pre (load8888 (&s1), expand8888 (vdest, 1));
2689
2690		*(__m64 *)dst = pack8888 (d0, d1);
2691	    }
2692
2693	    w -= 2;
2694	    dst += 2;
2695	    src += 2;
2696	}
2697
2698	if (w)
2699	{
2700	    __m64 s = load8888 (src);
2701	    __m64 d = load8888 (dst);
2702
2703	    store8888 (dst, over_rev_non_pre (s, d));
2704	}
2705    }
2706
2707    _mm_empty ();
2708}
2709
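/* Component-alpha OVER of a solid source onto r5g6b5: the a8r8g8b8 mask
 * carries a separate alpha per channel, so each mask pixel is loaded with
 * load8888() and passed to in_over() as-is instead of being broadcast from a
 * single a8 byte.  As in the unified-alpha path above, destination pixels
 * are expanded from 565, combined, and repacked four at a time.
 */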
2710static void
2711mmx_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
2712                                   pixman_composite_info_t *info)
2713{
2714    PIXMAN_COMPOSITE_ARGS (info);
2715    uint32_t src;
2716    uint16_t    *dst_line;
2717    uint32_t    *mask_line;
2718    int dst_stride, mask_stride;
2719    __m64 vsrc, vsrca;
2720
2721    CHECKPOINT ();
2722
2723    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2724
2725    if (src == 0)
2726	return;
2727
2728    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
2729    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
2730
2731    vsrc = load8888 (&src);
2732    vsrca = expand_alpha (vsrc);
2733
2734    while (height--)
2735    {
2736	int twidth = width;
2737	uint32_t *p = (uint32_t *)mask_line;
2738	uint16_t *q = (uint16_t *)dst_line;
2739
2740	while (twidth && ((uintptr_t)q & 7))
2741	{
2742	    uint32_t m = *(uint32_t *)p;
2743
2744	    if (m)
2745	    {
2746		uint64_t d = *q;
2747		__m64 vdest = expand565 (to_m64 (d), 0);
2748		vdest = pack_565 (in_over (vsrc, vsrca, load8888 (&m), vdest), vdest, 0);
2749		*q = to_uint64 (vdest);
2750	    }
2751
2752	    twidth--;
2753	    p++;
2754	    q++;
2755	}
2756
2757	while (twidth >= 4)
2758	{
2759	    uint32_t m0, m1, m2, m3;
2760
2761	    m0 = *p;
2762	    m1 = *(p + 1);
2763	    m2 = *(p + 2);
2764	    m3 = *(p + 3);
2765
2766	    if ((m0 | m1 | m2 | m3))
2767	    {
2768		__m64 vdest = *(__m64 *)q;
2769		__m64 v0, v1, v2, v3;
2770
2771		expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0);
2772
2773		v0 = in_over (vsrc, vsrca, load8888 (&m0), v0);
2774		v1 = in_over (vsrc, vsrca, load8888 (&m1), v1);
2775		v2 = in_over (vsrc, vsrca, load8888 (&m2), v2);
2776		v3 = in_over (vsrc, vsrca, load8888 (&m3), v3);
2777
2778		*(__m64 *)q = pack_4x565 (v0, v1, v2, v3);
2779	    }
2780	    twidth -= 4;
2781	    p += 4;
2782	    q += 4;
2783	}
2784
2785	while (twidth)
2786	{
2787	    uint32_t m;
2788
2789	    m = *(uint32_t *)p;
2790	    if (m)
2791	    {
2792		uint64_t d = *q;
2793		__m64 vdest = expand565 (to_m64 (d), 0);
2794		vdest = pack_565 (in_over (vsrc, vsrca, load8888 (&m), vdest), vdest, 0);
2795		*q = to_uint64 (vdest);
2796	    }
2797
2798	    twidth--;
2799	    p++;
2800	    q++;
2801	}
2802
2803	mask_line += mask_stride;
2804	dst_line += dst_stride;
2805    }
2806
2807    _mm_empty ();
2808}
2809
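/* IN with a solid source, an a8 mask and an a8 destination:
 * dest = dest * src.alpha * mask, where each product is the rounded 8-bit
 * multiply that MUL_UN8() is assumed to implement (approximately
 * a * b / 255).  The vector loop pushes four mask/destination bytes at a
 * time through the 8888 pipeline.
 */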
2810static void
2811mmx_composite_in_n_8_8 (pixman_implementation_t *imp,
2812                        pixman_composite_info_t *info)
2813{
2814    PIXMAN_COMPOSITE_ARGS (info);
2815    uint8_t *dst_line, *dst;
2816    uint8_t *mask_line, *mask;
2817    int dst_stride, mask_stride;
2818    int32_t w;
2819    uint32_t src;
2820    uint8_t sa;
2821    __m64 vsrc, vsrca;
2822
2823    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
2824    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
2825
2826    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2827
2828    sa = src >> 24;
2829
2830    vsrc = load8888 (&src);
2831    vsrca = expand_alpha (vsrc);
2832
2833    while (height--)
2834    {
2835	dst = dst_line;
2836	dst_line += dst_stride;
2837	mask = mask_line;
2838	mask_line += mask_stride;
2839	w = width;
2840
2841	while (w && (uintptr_t)dst & 7)
2842	{
2843	    uint16_t tmp;
2844	    uint8_t a;
2845	    uint32_t m, d;
2846
2847	    a = *mask++;
2848	    d = *dst;
2849
2850	    m = MUL_UN8 (sa, a, tmp);
2851	    d = MUL_UN8 (m, d, tmp);
2852
2853	    *dst++ = d;
2854	    w--;
2855	}
2856
2857	while (w >= 4)
2858	{
2859	    __m64 vmask;
2860	    __m64 vdest;
2861
2862	    vmask = load8888u ((uint32_t *)mask);
2863	    vdest = load8888 ((uint32_t *)dst);
2864
2865	    store8888 ((uint32_t *)dst, in (in (vsrca, vmask), vdest));
2866
2867	    dst += 4;
2868	    mask += 4;
2869	    w -= 4;
2870	}
2871
2872	while (w--)
2873	{
2874	    uint16_t tmp;
2875	    uint8_t a;
2876	    uint32_t m, d;
2877
2878	    a = *mask++;
2879	    d = *dst;
2880
2881	    m = MUL_UN8 (sa, a, tmp);
2882	    d = MUL_UN8 (m, d, tmp);
2883
2884	    *dst++ = d;
2885	}
2886    }
2887
2888    _mm_empty ();
2889}
2890
2891static void
2892mmx_composite_in_8_8 (pixman_implementation_t *imp,
2893                      pixman_composite_info_t *info)
2894{
2895    PIXMAN_COMPOSITE_ARGS (info);
2896    uint8_t     *dst_line, *dst;
2897    uint8_t     *src_line, *src;
2898    int src_stride, dst_stride;
2899    int32_t w;
2900
2901    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
2902    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
2903
2904    while (height--)
2905    {
2906	dst = dst_line;
2907	dst_line += dst_stride;
2908	src = src_line;
2909	src_line += src_stride;
2910	w = width;
2911
2912	while (w && (uintptr_t)dst & 3)
2913	{
2914	    uint8_t s, d;
2915	    uint16_t tmp;
2916
2917	    s = *src;
2918	    d = *dst;
2919
2920	    *dst = MUL_UN8 (s, d, tmp);
2921
2922	    src++;
2923	    dst++;
2924	    w--;
2925	}
2926
2927	while (w >= 4)
2928	{
2929	    uint32_t *s = (uint32_t *)src;
2930	    uint32_t *d = (uint32_t *)dst;
2931
2932	    store8888 (d, in (load8888u (s), load8888 (d)));
2933
2934	    w -= 4;
2935	    dst += 4;
2936	    src += 4;
2937	}
2938
2939	while (w--)
2940	{
2941	    uint8_t s, d;
2942	    uint16_t tmp;
2943
2944	    s = *src;
2945	    d = *dst;
2946
2947	    *dst = MUL_UN8 (s, d, tmp);
2948
2949	    src++;
2950	    dst++;
2951	}
2952    }
2953
2954    _mm_empty ();
2955}
2956
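/* ADD of a solid source through an a8 mask onto an a8 destination:
 * dest = clamp (dest + src.alpha * mask / 255).  The aligned middle of each
 * scanline uses the saturating _mm_adds_pu8 on four bytes at a time; the
 * scalar edges use the MUL_UN8/ADD_UN8 macros.
 */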
2957static void
2958mmx_composite_add_n_8_8 (pixman_implementation_t *imp,
2959			 pixman_composite_info_t *info)
2960{
2961    PIXMAN_COMPOSITE_ARGS (info);
2962    uint8_t     *dst_line, *dst;
2963    uint8_t     *mask_line, *mask;
2964    int dst_stride, mask_stride;
2965    int32_t w;
2966    uint32_t src;
2967    uint8_t sa;
2968    __m64 vsrc, vsrca;
2969
2970    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
2971    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
2972
2973    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2974
2975    sa = src >> 24;
2976
2977    if (src == 0)
2978	return;
2979
2980    vsrc = load8888 (&src);
2981    vsrca = expand_alpha (vsrc);
2982
2983    while (height--)
2984    {
2985	dst = dst_line;
2986	dst_line += dst_stride;
2987	mask = mask_line;
2988	mask_line += mask_stride;
2989	w = width;
2990
2991	while (w && (uintptr_t)dst & 3)
2992	{
2993	    uint16_t tmp;
2994	    uint16_t a;
2995	    uint32_t m, d;
2996	    uint32_t r;
2997
2998	    a = *mask++;
2999	    d = *dst;
3000
3001	    m = MUL_UN8 (sa, a, tmp);
3002	    r = ADD_UN8 (m, d, tmp);
3003
3004	    *dst++ = r;
3005	    w--;
3006	}
3007
3008	while (w >= 4)
3009	{
3010	    __m64 vmask;
3011	    __m64 vdest;
3012
3013	    vmask = load8888u ((uint32_t *)mask);
3014	    vdest = load8888 ((uint32_t *)dst);
3015
3016	    store8888 ((uint32_t *)dst, _mm_adds_pu8 (in (vsrca, vmask), vdest));
3017
3018	    dst += 4;
3019	    mask += 4;
3020	    w -= 4;
3021	}
3022
3023	while (w--)
3024	{
3025	    uint16_t tmp;
3026	    uint16_t a;
3027	    uint32_t m, d;
3028	    uint32_t r;
3029
3030	    a = *mask++;
3031	    d = *dst;
3032
3033	    m = MUL_UN8 (sa, a, tmp);
3034	    r = ADD_UN8 (m, d, tmp);
3035
3036	    *dst++ = r;
3037	}
3038    }
3039
3040    _mm_empty ();
3041}
3042
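/* Saturating ADD of two a8 images.  The aligned middle of each scanline is
 * processed eight bytes at a time with _mm_adds_pu8; the scalar edges clamp
 * with the branch-free idiom
 *
 *     t = d + s;
 *     s = t | (0 - (t >> 8));
 *
 * where (t >> 8) is 1 exactly when the 8-bit sum overflowed, so the result
 * is forced to 0xff in that case and left as the plain sum otherwise.
 */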
3043static void
3044mmx_composite_add_8_8 (pixman_implementation_t *imp,
3045		       pixman_composite_info_t *info)
3046{
3047    PIXMAN_COMPOSITE_ARGS (info);
3048    uint8_t *dst_line, *dst;
3049    uint8_t *src_line, *src;
3050    int dst_stride, src_stride;
3051    int32_t w;
3052    uint8_t s, d;
3053    uint16_t t;
3054
3055    CHECKPOINT ();
3056
3057    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
3058    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
3059
3060    while (height--)
3061    {
3062	dst = dst_line;
3063	dst_line += dst_stride;
3064	src = src_line;
3065	src_line += src_stride;
3066	w = width;
3067
3068	while (w && (uintptr_t)dst & 7)
3069	{
3070	    s = *src;
3071	    d = *dst;
3072	    t = d + s;
3073	    s = t | (0 - (t >> 8));
3074	    *dst = s;
3075
3076	    dst++;
3077	    src++;
3078	    w--;
3079	}
3080
3081	while (w >= 8)
3082	{
3083	    *(__m64*)dst = _mm_adds_pu8 (ldq_u ((__m64 *)src), *(__m64*)dst);
3084	    dst += 8;
3085	    src += 8;
3086	    w -= 8;
3087	}
3088
3089	while (w)
3090	{
3091	    s = *src;
3092	    d = *dst;
3093	    t = d + s;
3094	    s = t | (0 - (t >> 8));
3095	    *dst = s;
3096
3097	    dst++;
3098	    src++;
3099	    w--;
3100	}
3101    }
3102
3103    _mm_empty ();
3104}
3105
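/* Saturating ADD of two r5g6b5 images.  Packed 565 data has no per-channel
 * saturating add, so each group of four pixels is expanded to 8 bits per
 * channel, added with _mm_adds_pu8 and packed back down; the scalar edges do
 * the same through convert_0565_to_8888()/convert_8888_to_0565() and
 * UN8x4_ADD_UN8x4.
 */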
3106static void
3107mmx_composite_add_0565_0565 (pixman_implementation_t *imp,
3108                             pixman_composite_info_t *info)
3109{
3110    PIXMAN_COMPOSITE_ARGS (info);
3111    uint16_t    *dst_line, *dst;
3112    uint32_t	d;
3113    uint16_t    *src_line, *src;
3114    uint32_t	s;
3115    int dst_stride, src_stride;
3116    int32_t w;
3117
3118    CHECKPOINT ();
3119
3120    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint16_t, src_stride, src_line, 1);
3121    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
3122
3123    while (height--)
3124    {
3125	dst = dst_line;
3126	dst_line += dst_stride;
3127	src = src_line;
3128	src_line += src_stride;
3129	w = width;
3130
3131	while (w && (uintptr_t)dst & 7)
3132	{
3133	    s = *src++;
3134	    if (s)
3135	    {
3136		d = *dst;
3137		s = convert_0565_to_8888 (s);
3138		if (d)
3139		{
3140		    d = convert_0565_to_8888 (d);
3141		    UN8x4_ADD_UN8x4 (s, d);
3142		}
3143		*dst = convert_8888_to_0565 (s);
3144	    }
3145	    dst++;
3146	    w--;
3147	}
3148
3149	while (w >= 4)
3150	{
3151	    __m64 vdest = *(__m64 *)dst;
3152	    __m64 vsrc = ldq_u ((__m64 *)src);
3153	    __m64 vd0, vd1;
3154	    __m64 vs0, vs1;
3155
3156	    expand_4xpacked565 (vdest, &vd0, &vd1, 0);
3157	    expand_4xpacked565 (vsrc, &vs0, &vs1, 0);
3158
3159	    vd0 = _mm_adds_pu8 (vd0, vs0);
3160	    vd1 = _mm_adds_pu8 (vd1, vs1);
3161
3162	    *(__m64 *)dst = pack_4xpacked565 (vd0, vd1);
3163
3164	    dst += 4;
3165	    src += 4;
3166	    w -= 4;
3167	}
3168
3169	while (w--)
3170	{
3171	    s = *src++;
3172	    if (s)
3173	    {
3174		d = *dst;
3175		s = convert_0565_to_8888 (s);
3176		if (d)
3177		{
3178		    d = convert_0565_to_8888 (d);
3179		    UN8x4_ADD_UN8x4 (s, d);
3180		}
3181		*dst = convert_8888_to_0565 (s);
3182	    }
3183	    dst++;
3184	}
3185    }
3186
3187    _mm_empty ();
3188}
3189
3190static void
3191mmx_composite_add_8888_8888 (pixman_implementation_t *imp,
3192                             pixman_composite_info_t *info)
3193{
3194    PIXMAN_COMPOSITE_ARGS (info);
3195    uint32_t    *dst_line, *dst;
3196    uint32_t    *src_line, *src;
3197    int dst_stride, src_stride;
3198    int32_t w;
3199
3200    CHECKPOINT ();
3201
3202    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3203    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3204
3205    while (height--)
3206    {
3207	dst = dst_line;
3208	dst_line += dst_stride;
3209	src = src_line;
3210	src_line += src_stride;
3211	w = width;
3212
3213	while (w && (uintptr_t)dst & 7)
3214	{
3215	    store (dst, _mm_adds_pu8 (load ((const uint32_t *)src),
3216	                              load ((const uint32_t *)dst)));
3217	    dst++;
3218	    src++;
3219	    w--;
3220	}
3221
3222	while (w >= 2)
3223	{
3224	    *(__m64 *)dst = _mm_adds_pu8 (ldq_u ((__m64 *)src), *(__m64*)dst);
3225	    dst += 2;
3226	    src += 2;
3227	    w -= 2;
3228	}
3229
3230	if (w)
3231	{
3232	    store (dst, _mm_adds_pu8 (load ((const uint32_t *)src),
3233	                              load ((const uint32_t *)dst)));
3234
3235	}
3236    }
3237
3238    _mm_empty ();
3239}
3240
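/* Plain blit between two images of equal depth (16 or 32 bpp only).  Each
 * scanline is copied with small scalar steps until the destination is 8-byte
 * aligned, then in 64-byte chunks staged through eight MMX registers (or
 * eight unaligned ldq_u loads when the inline-asm path is not available),
 * followed by a scalar tail.  Only the destination is aligned; the source
 * side is not assumed to share that alignment.
 */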
3241static pixman_bool_t
3242mmx_blt (pixman_implementation_t *imp,
3243         uint32_t *               src_bits,
3244         uint32_t *               dst_bits,
3245         int                      src_stride,
3246         int                      dst_stride,
3247         int                      src_bpp,
3248         int                      dst_bpp,
3249         int                      src_x,
3250         int                      src_y,
3251         int                      dest_x,
3252         int                      dest_y,
3253         int                      width,
3254         int                      height)
3255{
3256    uint8_t *   src_bytes;
3257    uint8_t *   dst_bytes;
3258    int byte_width;
3259
3260    if (src_bpp != dst_bpp)
3261	return FALSE;
3262
3263    if (src_bpp == 16)
3264    {
3265	src_stride = src_stride * (int) sizeof (uint32_t) / 2;
3266	dst_stride = dst_stride * (int) sizeof (uint32_t) / 2;
3267	src_bytes = (uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x));
3268	dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dest_y) + (dest_x));
3269	byte_width = 2 * width;
3270	src_stride *= 2;
3271	dst_stride *= 2;
3272    }
3273    else if (src_bpp == 32)
3274    {
3275	src_stride = src_stride * (int) sizeof (uint32_t) / 4;
3276	dst_stride = dst_stride * (int) sizeof (uint32_t) / 4;
3277	src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x));
3278	dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dest_y) + (dest_x));
3279	byte_width = 4 * width;
3280	src_stride *= 4;
3281	dst_stride *= 4;
3282    }
3283    else
3284    {
3285	return FALSE;
3286    }
3287
3288    while (height--)
3289    {
3290	int w;
3291	uint8_t *s = src_bytes;
3292	uint8_t *d = dst_bytes;
3293	src_bytes += src_stride;
3294	dst_bytes += dst_stride;
3295	w = byte_width;
3296
3297	if (w >= 1 && ((uintptr_t)d & 1))
3298	{
3299	    *(uint8_t *)d = *(uint8_t *)s;
3300	    w -= 1;
3301	    s += 1;
3302	    d += 1;
3303	}
3304
3305	if (w >= 2 && ((uintptr_t)d & 3))
3306	{
3307	    *(uint16_t *)d = *(uint16_t *)s;
3308	    w -= 2;
3309	    s += 2;
3310	    d += 2;
3311	}
3312
3313	while (w >= 4 && ((uintptr_t)d & 7))
3314	{
3315	    *(uint32_t *)d = ldl_u ((uint32_t *)s);
3316
3317	    w -= 4;
3318	    s += 4;
3319	    d += 4;
3320	}
3321
3322	while (w >= 64)
3323	{
3324#if (defined (__GNUC__) || (defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590))) && defined USE_X86_MMX
3325	    __asm__ (
3326	        "movq	  (%1),	  %%mm0\n"
3327	        "movq	 8(%1),	  %%mm1\n"
3328	        "movq	16(%1),	  %%mm2\n"
3329	        "movq	24(%1),	  %%mm3\n"
3330	        "movq	32(%1),	  %%mm4\n"
3331	        "movq	40(%1),	  %%mm5\n"
3332	        "movq	48(%1),	  %%mm6\n"
3333	        "movq	56(%1),	  %%mm7\n"
3334
3335	        "movq	%%mm0,	  (%0)\n"
3336	        "movq	%%mm1,	 8(%0)\n"
3337	        "movq	%%mm2,	16(%0)\n"
3338	        "movq	%%mm3,	24(%0)\n"
3339	        "movq	%%mm4,	32(%0)\n"
3340	        "movq	%%mm5,	40(%0)\n"
3341	        "movq	%%mm6,	48(%0)\n"
3342	        "movq	%%mm7,	56(%0)\n"
3343		:
3344		: "r" (d), "r" (s)
3345		: "memory",
3346		  "%mm0", "%mm1", "%mm2", "%mm3",
3347		  "%mm4", "%mm5", "%mm6", "%mm7");
3348#else
3349	    __m64 v0 = ldq_u ((__m64 *)(s + 0));
3350	    __m64 v1 = ldq_u ((__m64 *)(s + 8));
3351	    __m64 v2 = ldq_u ((__m64 *)(s + 16));
3352	    __m64 v3 = ldq_u ((__m64 *)(s + 24));
3353	    __m64 v4 = ldq_u ((__m64 *)(s + 32));
3354	    __m64 v5 = ldq_u ((__m64 *)(s + 40));
3355	    __m64 v6 = ldq_u ((__m64 *)(s + 48));
3356	    __m64 v7 = ldq_u ((__m64 *)(s + 56));
3357	    *(__m64 *)(d + 0)  = v0;
3358	    *(__m64 *)(d + 8)  = v1;
3359	    *(__m64 *)(d + 16) = v2;
3360	    *(__m64 *)(d + 24) = v3;
3361	    *(__m64 *)(d + 32) = v4;
3362	    *(__m64 *)(d + 40) = v5;
3363	    *(__m64 *)(d + 48) = v6;
3364	    *(__m64 *)(d + 56) = v7;
3365#endif
3366
3367	    w -= 64;
3368	    s += 64;
3369	    d += 64;
3370	}
3371	while (w >= 4)
3372	{
3373	    *(uint32_t *)d = ldl_u ((uint32_t *)s);
3374
3375	    w -= 4;
3376	    s += 4;
3377	    d += 4;
3378	}
3379	if (w >= 2)
3380	{
3381	    *(uint16_t *)d = *(uint16_t *)s;
3382	    w -= 2;
3383	    s += 2;
3384	    d += 2;
3385	}
3386    }
3387
3388    _mm_empty ();
3389
3390    return TRUE;
3391}
3392
3393static void
3394mmx_composite_copy_area (pixman_implementation_t *imp,
3395                         pixman_composite_info_t *info)
3396{
3397    PIXMAN_COMPOSITE_ARGS (info);
3398
3399    mmx_blt (imp, src_image->bits.bits,
3400	     dest_image->bits.bits,
3401	     src_image->bits.rowstride,
3402	     dest_image->bits.rowstride,
3403	     PIXMAN_FORMAT_BPP (src_image->bits.format),
3404	     PIXMAN_FORMAT_BPP (dest_image->bits.format),
3405	     src_x, src_y, dest_x, dest_y, width, height);
3406}
3407
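/* OVER of an x8r8g8b8 source through an a8 mask onto 8888: the undefined
 * alpha byte of the source is forced to 0xff first, so a 0xff mask byte lets
 * the pixel be stored directly and anything in between goes through
 * in_over() with the mask broadcast to all channels.  This path is purely
 * per-pixel; there is no multi-pixel inner loop.
 */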
3408static void
3409mmx_composite_over_x888_8_8888 (pixman_implementation_t *imp,
3410                                pixman_composite_info_t *info)
3411{
3412    PIXMAN_COMPOSITE_ARGS (info);
3413    uint32_t  *src, *src_line;
3414    uint32_t  *dst, *dst_line;
3415    uint8_t  *mask, *mask_line;
3416    int src_stride, mask_stride, dst_stride;
3417    int32_t w;
3418
3419    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3420    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
3421    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3422
3423    while (height--)
3424    {
3425	src = src_line;
3426	src_line += src_stride;
3427	dst = dst_line;
3428	dst_line += dst_stride;
3429	mask = mask_line;
3430	mask_line += mask_stride;
3431
3432	w = width;
3433
3434	while (w--)
3435	{
3436	    uint64_t m = *mask;
3437
3438	    if (m)
3439	    {
3440		uint32_t ssrc = *src | 0xff000000;
3441		__m64 s = load8888 (&ssrc);
3442
3443		if (m == 0xff)
3444		{
3445		    store8888 (dst, s);
3446		}
3447		else
3448		{
3449		    __m64 sa = expand_alpha (s);
3450		    __m64 vm = expand_alpha_rev (to_m64 (m));
3451		    __m64 vdest = in_over (s, sa, vm, load8888 (dst));
3452
3453		    store8888 (dst, vdest);
3454		}
3455	    }
3456
3457	    mask++;
3458	    dst++;
3459	    src++;
3460	}
3461    }
3462
3463    _mm_empty ();
3464}
3465
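/* OVER_REVERSE with a solid source: the destination stays on top,
 *
 *     result = dest + src * (1 - dest.alpha)
 *
 * which is what over (vdest, expand_alpha (vdest), vsrc) evaluates to here,
 * assuming over (s, sa, d) computes s + d * (1 - sa) as in the other paths.
 */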
3466static void
3467mmx_composite_over_reverse_n_8888 (pixman_implementation_t *imp,
3468                                   pixman_composite_info_t *info)
3469{
3470    PIXMAN_COMPOSITE_ARGS (info);
3471    uint32_t src;
3472    uint32_t    *dst_line, *dst;
3473    int32_t w;
3474    int dst_stride;
3475    __m64 vsrc;
3476
3477    CHECKPOINT ();
3478
3479    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
3480
3481    if (src == 0)
3482	return;
3483
3484    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3485
3486    vsrc = load8888 (&src);
3487
3488    while (height--)
3489    {
3490	dst = dst_line;
3491	dst_line += dst_stride;
3492	w = width;
3493
3494	CHECKPOINT ();
3495
3496	while (w && (uintptr_t)dst & 7)
3497	{
3498	    __m64 vdest = load8888 (dst);
3499
3500	    store8888 (dst, over (vdest, expand_alpha (vdest), vsrc));
3501
3502	    w--;
3503	    dst++;
3504	}
3505
3506	while (w >= 2)
3507	{
3508	    __m64 vdest = *(__m64 *)dst;
3509	    __m64 dest0 = expand8888 (vdest, 0);
	    __m64 dest1 = expand8888 (vdest, 1);

	    dest0 = over (dest0, expand_alpha (dest0), vsrc);
3514	    dest1 = over (dest1, expand_alpha (dest1), vsrc);
3515
3516	    *(__m64 *)dst = pack8888 (dest0, dest1);
3517
3518	    dst += 2;
3519	    w -= 2;
3520	}
3521
3522	CHECKPOINT ();
3523
3524	if (w)
3525	{
3526	    __m64 vdest = load8888 (dst);
3527
3528	    store8888 (dst, over (vdest, expand_alpha (vdest), vsrc));
3529	}
3530    }
3531
3532    _mm_empty ();
3533}
3534
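/* Bilinear scaling helpers.  For one destination pixel the 2x2 source block
 * is combined, per channel, as
 *
 *     p = ((tl * wt + bl * wb) * (BSHIFT - wx) +
 *          (tr * wt + br * wb) * wx) >> (2 * BILINEAR_INTERPOLATION_BITS)
 *
 * where wt and wb are the vertical weights supplied by the caller (expected
 * to sum to BSHIFT) and wx is the fractional part of vx reduced to
 * BILINEAR_INTERPOLATION_BITS bits.  With fewer than 8 interpolation bits
 * the vertically filtered channels still fit in the signed 16-bit operands
 * of pmaddwd, so the horizontal step needs just one pmaddwd per unpacked
 * half; with 8 bits it falls back to separate pmullw/pmulhuw pairs and
 * 32-bit additions.
 */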
3535#define BSHIFT ((1 << BILINEAR_INTERPOLATION_BITS))
3536#define BMSK (BSHIFT - 1)
3537
3538#define BILINEAR_DECLARE_VARIABLES						\
3539    const __m64 mm_wt = _mm_set_pi16 (wt, wt, wt, wt);				\
3540    const __m64 mm_wb = _mm_set_pi16 (wb, wb, wb, wb);				\
3541    const __m64 mm_BSHIFT = _mm_set_pi16 (BSHIFT, BSHIFT, BSHIFT, BSHIFT);	\
3542    const __m64 mm_addc7 = _mm_set_pi16 (0, 1, 0, 1);				\
3543    const __m64 mm_xorc7 = _mm_set_pi16 (0, BMSK, 0, BMSK);			\
3544    const __m64 mm_ux = _mm_set_pi16 (unit_x, unit_x, unit_x, unit_x);		\
3545    const __m64 mm_zero = _mm_setzero_si64 ();					\
3546    __m64 mm_x = _mm_set_pi16 (vx, vx, vx, vx)
3547
3548#define BILINEAR_INTERPOLATE_ONE_PIXEL(pix)					\
3549do {										\
3550    /* fetch 2x2 pixel block into 2 mmx registers */				\
3551    __m64 t = ldq_u ((__m64 *)&src_top [pixman_fixed_to_int (vx)]);		\
3552    __m64 b = ldq_u ((__m64 *)&src_bottom [pixman_fixed_to_int (vx)]);		\
3553    /* vertical interpolation */						\
3554    __m64 t_hi = _mm_mullo_pi16 (_mm_unpackhi_pi8 (t, mm_zero), mm_wt);		\
3555    __m64 t_lo = _mm_mullo_pi16 (_mm_unpacklo_pi8 (t, mm_zero), mm_wt);		\
3556    __m64 b_hi = _mm_mullo_pi16 (_mm_unpackhi_pi8 (b, mm_zero), mm_wb);		\
3557    __m64 b_lo = _mm_mullo_pi16 (_mm_unpacklo_pi8 (b, mm_zero), mm_wb);		\
3558    __m64 hi = _mm_add_pi16 (t_hi, b_hi);					\
3559    __m64 lo = _mm_add_pi16 (t_lo, b_lo);					\
3560    vx += unit_x;								\
3561    if (BILINEAR_INTERPOLATION_BITS < 8)					\
3562    {										\
3563	/* calculate horizontal weights */					\
3564	__m64 mm_wh = _mm_add_pi16 (mm_addc7, _mm_xor_si64 (mm_xorc7,		\
3565			  _mm_srli_pi16 (mm_x,					\
3566					 16 - BILINEAR_INTERPOLATION_BITS)));	\
3567	/* horizontal interpolation */						\
3568	__m64 p = _mm_unpacklo_pi16 (lo, hi);					\
3569	__m64 q = _mm_unpackhi_pi16 (lo, hi);					\
3570	lo = _mm_madd_pi16 (p, mm_wh);						\
3571	hi = _mm_madd_pi16 (q, mm_wh);						\
3572    }										\
3573    else									\
3574    {										\
3575	/* calculate horizontal weights */					\
3576	__m64 mm_wh_lo = _mm_sub_pi16 (mm_BSHIFT, _mm_srli_pi16 (mm_x,		\
3577					16 - BILINEAR_INTERPOLATION_BITS));	\
3578	__m64 mm_wh_hi = _mm_srli_pi16 (mm_x,					\
3579					16 - BILINEAR_INTERPOLATION_BITS);	\
3580	/* horizontal interpolation */						\
3581	__m64 mm_lo_lo = _mm_mullo_pi16 (lo, mm_wh_lo);				\
3582	__m64 mm_lo_hi = _mm_mullo_pi16 (hi, mm_wh_hi);				\
3583	__m64 mm_hi_lo = _mm_mulhi_pu16 (lo, mm_wh_lo);				\
3584	__m64 mm_hi_hi = _mm_mulhi_pu16 (hi, mm_wh_hi);				\
3585	lo = _mm_add_pi32 (_mm_unpacklo_pi16 (mm_lo_lo, mm_hi_lo),		\
3586			   _mm_unpacklo_pi16 (mm_lo_hi, mm_hi_hi));		\
3587	hi = _mm_add_pi32 (_mm_unpackhi_pi16 (mm_lo_lo, mm_hi_lo),		\
3588			   _mm_unpackhi_pi16 (mm_lo_hi, mm_hi_hi));		\
3589    }										\
3590    mm_x = _mm_add_pi16 (mm_x, mm_ux);						\
3591    /* shift and pack the result */						\
3592    hi = _mm_srli_pi32 (hi, BILINEAR_INTERPOLATION_BITS * 2);			\
3593    lo = _mm_srli_pi32 (lo, BILINEAR_INTERPOLATION_BITS * 2);			\
3594    lo = _mm_packs_pi32 (lo, hi);						\
3595    lo = _mm_packs_pu16 (lo, lo);						\
3596    pix = lo;									\
3597} while (0)
3598
3599#define BILINEAR_SKIP_ONE_PIXEL()						\
3600do {										\
3601    vx += unit_x;								\
3602    mm_x = _mm_add_pi16 (mm_x, mm_ux);						\
3603} while(0)
3604
3605static force_inline void
3606scaled_bilinear_scanline_mmx_8888_8888_SRC (uint32_t *       dst,
3607					    const uint32_t * mask,
3608					    const uint32_t * src_top,
3609					    const uint32_t * src_bottom,
3610					    int32_t          w,
3611					    int              wt,
3612					    int              wb,
3613					    pixman_fixed_t   vx,
3614					    pixman_fixed_t   unit_x,
3615					    pixman_fixed_t   max_vx,
3616					    pixman_bool_t    zero_src)
3617{
3618    BILINEAR_DECLARE_VARIABLES;
3619    __m64 pix;
3620
3621    while (w--)
3622    {
3623	BILINEAR_INTERPOLATE_ONE_PIXEL (pix);
3624	store (dst, pix);
3625	dst++;
3626    }
3627
3628    _mm_empty ();
3629}
3630
3631FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_cover_SRC,
3632			       scaled_bilinear_scanline_mmx_8888_8888_SRC,
3633			       uint32_t, uint32_t, uint32_t,
3634			       COVER, FLAG_NONE)
3635FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_pad_SRC,
3636			       scaled_bilinear_scanline_mmx_8888_8888_SRC,
3637			       uint32_t, uint32_t, uint32_t,
3638			       PAD, FLAG_NONE)
3639FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_none_SRC,
3640			       scaled_bilinear_scanline_mmx_8888_8888_SRC,
3641			       uint32_t, uint32_t, uint32_t,
3642			       NONE, FLAG_NONE)
3643FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_normal_SRC,
3644			       scaled_bilinear_scanline_mmx_8888_8888_SRC,
3645			       uint32_t, uint32_t, uint32_t,
3646			       NORMAL, FLAG_NONE)
3647
3648static force_inline void
3649scaled_bilinear_scanline_mmx_8888_8888_OVER (uint32_t *       dst,
3650					     const uint32_t * mask,
3651					     const uint32_t * src_top,
3652					     const uint32_t * src_bottom,
3653					     int32_t          w,
3654					     int              wt,
3655					     int              wb,
3656					     pixman_fixed_t   vx,
3657					     pixman_fixed_t   unit_x,
3658					     pixman_fixed_t   max_vx,
3659					     pixman_bool_t    zero_src)
3660{
3661    BILINEAR_DECLARE_VARIABLES;
3662    __m64 pix1, pix2;
3663
3664    while (w)
3665    {
3666	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
3667
3668	if (!is_zero (pix1))
3669	{
3670	    pix2 = load (dst);
3671	    store8888 (dst, core_combine_over_u_pixel_mmx (pix1, pix2));
3672	}
3673
3674	w--;
3675	dst++;
3676    }
3677
3678    _mm_empty ();
3679}
3680
3681FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_cover_OVER,
3682			       scaled_bilinear_scanline_mmx_8888_8888_OVER,
3683			       uint32_t, uint32_t, uint32_t,
3684			       COVER, FLAG_NONE)
3685FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_pad_OVER,
3686			       scaled_bilinear_scanline_mmx_8888_8888_OVER,
3687			       uint32_t, uint32_t, uint32_t,
3688			       PAD, FLAG_NONE)
3689FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_none_OVER,
3690			       scaled_bilinear_scanline_mmx_8888_8888_OVER,
3691			       uint32_t, uint32_t, uint32_t,
3692			       NONE, FLAG_NONE)
3693FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_normal_OVER,
3694			       scaled_bilinear_scanline_mmx_8888_8888_OVER,
3695			       uint32_t, uint32_t, uint32_t,
3696			       NORMAL, FLAG_NONE)
3697
3698static force_inline void
3699scaled_bilinear_scanline_mmx_8888_8_8888_OVER (uint32_t *       dst,
3700					       const uint8_t  * mask,
3701					       const uint32_t * src_top,
3702					       const uint32_t * src_bottom,
3703					       int32_t          w,
3704					       int              wt,
3705					       int              wb,
3706					       pixman_fixed_t   vx,
3707					       pixman_fixed_t   unit_x,
3708					       pixman_fixed_t   max_vx,
3709					       pixman_bool_t    zero_src)
3710{
3711    BILINEAR_DECLARE_VARIABLES;
3712    __m64 pix1, pix2;
3713    uint32_t m;
3714
3715    while (w)
3716    {
3717	m = (uint32_t) *mask++;
3718
3719	if (m)
3720	{
3721	    BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
3722
3723	    if (m == 0xff && is_opaque (pix1))
3724	    {
3725		store (dst, pix1);
3726	    }
3727	    else
3728	    {
3729		__m64 ms, md, ma, msa;
3730
3731		pix2 = load (dst);
3732		ma = expand_alpha_rev (to_m64 (m));
3733		ms = _mm_unpacklo_pi8 (pix1, _mm_setzero_si64 ());
3734		md = _mm_unpacklo_pi8 (pix2, _mm_setzero_si64 ());
3735
3736		msa = expand_alpha (ms);
3737
3738		store8888 (dst, (in_over (ms, msa, ma, md)));
3739	    }
3740	}
3741	else
3742	{
3743	    BILINEAR_SKIP_ONE_PIXEL ();
3744	}
3745
3746	w--;
3747	dst++;
3748    }
3749
3750    _mm_empty ();
3751}
3752
3753FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8_8888_cover_OVER,
3754			       scaled_bilinear_scanline_mmx_8888_8_8888_OVER,
3755			       uint32_t, uint8_t, uint32_t,
3756			       COVER, FLAG_HAVE_NON_SOLID_MASK)
3757FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8_8888_pad_OVER,
3758			       scaled_bilinear_scanline_mmx_8888_8_8888_OVER,
3759			       uint32_t, uint8_t, uint32_t,
3760			       PAD, FLAG_HAVE_NON_SOLID_MASK)
3761FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8_8888_none_OVER,
3762			       scaled_bilinear_scanline_mmx_8888_8_8888_OVER,
3763			       uint32_t, uint8_t, uint32_t,
3764			       NONE, FLAG_HAVE_NON_SOLID_MASK)
3765FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8_8888_normal_OVER,
3766			       scaled_bilinear_scanline_mmx_8888_8_8888_OVER,
3767			       uint32_t, uint8_t, uint32_t,
3768			       NORMAL, FLAG_HAVE_NON_SOLID_MASK)
3769
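/* Source scanline fetchers for the iterator interface.  Each one converts a
 * scanline of the image to a8r8g8b8 in iter->buffer and advances iter->bits
 * by one stride: x8r8g8b8 just ORs in an opaque alpha byte, r5g6b5 widens
 * four pixels at a time through expand_4xpacked565(), and a8 moves each byte
 * into the alpha position with the colour channels zeroed.
 */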
3770static uint32_t *
3771mmx_fetch_x8r8g8b8 (pixman_iter_t *iter, const uint32_t *mask)
3772{
3773    int w = iter->width;
3774    uint32_t *dst = iter->buffer;
3775    uint32_t *src = (uint32_t *)iter->bits;
3776
3777    iter->bits += iter->stride;
3778
3779    while (w && ((uintptr_t)dst) & 7)
3780    {
3781	*dst++ = (*src++) | 0xff000000;
3782	w--;
3783    }
3784
3785    while (w >= 8)
3786    {
3787	__m64 vsrc1 = ldq_u ((__m64 *)(src + 0));
3788	__m64 vsrc2 = ldq_u ((__m64 *)(src + 2));
3789	__m64 vsrc3 = ldq_u ((__m64 *)(src + 4));
3790	__m64 vsrc4 = ldq_u ((__m64 *)(src + 6));
3791
3792	*(__m64 *)(dst + 0) = _mm_or_si64 (vsrc1, MC (ff000000));
3793	*(__m64 *)(dst + 2) = _mm_or_si64 (vsrc2, MC (ff000000));
3794	*(__m64 *)(dst + 4) = _mm_or_si64 (vsrc3, MC (ff000000));
3795	*(__m64 *)(dst + 6) = _mm_or_si64 (vsrc4, MC (ff000000));
3796
3797	dst += 8;
3798	src += 8;
3799	w -= 8;
3800    }
3801
3802    while (w)
3803    {
3804	*dst++ = (*src++) | 0xff000000;
3805	w--;
3806    }
3807
3808    _mm_empty ();
3809    return iter->buffer;
3810}
3811
3812static uint32_t *
3813mmx_fetch_r5g6b5 (pixman_iter_t *iter, const uint32_t *mask)
3814{
3815    int w = iter->width;
3816    uint32_t *dst = iter->buffer;
3817    uint16_t *src = (uint16_t *)iter->bits;
3818
3819    iter->bits += iter->stride;
3820
3821    while (w && ((uintptr_t)dst) & 0x0f)
3822    {
3823	uint16_t s = *src++;
3824
3825	*dst++ = convert_0565_to_8888 (s);
3826	w--;
3827    }
3828
3829    while (w >= 4)
3830    {
3831	__m64 vsrc = ldq_u ((__m64 *)src);
3832	__m64 mm0, mm1;
3833
3834	expand_4xpacked565 (vsrc, &mm0, &mm1, 1);
3835
3836	*(__m64 *)(dst + 0) = mm0;
3837	*(__m64 *)(dst + 2) = mm1;
3838
3839	dst += 4;
3840	src += 4;
3841	w -= 4;
3842    }
3843
3844    while (w)
3845    {
3846	uint16_t s = *src++;
3847
3848	*dst++ = convert_0565_to_8888 (s);
3849	w--;
3850    }
3851
3852    _mm_empty ();
3853    return iter->buffer;
3854}
3855
3856static uint32_t *
3857mmx_fetch_a8 (pixman_iter_t *iter, const uint32_t *mask)
3858{
3859    int w = iter->width;
3860    uint32_t *dst = iter->buffer;
3861    uint8_t *src = iter->bits;
3862
3863    iter->bits += iter->stride;
3864
3865    while (w && (((uintptr_t)dst) & 15))
3866    {
3867        *dst++ = *(src++) << 24;
3868        w--;
3869    }
3870
3871    while (w >= 8)
3872    {
3873	__m64 mm0 = ldq_u ((__m64 *)src);
3874
3875	__m64 mm1 = _mm_unpacklo_pi8  (_mm_setzero_si64(), mm0);
3876	__m64 mm2 = _mm_unpackhi_pi8  (_mm_setzero_si64(), mm0);
3877	__m64 mm3 = _mm_unpacklo_pi16 (_mm_setzero_si64(), mm1);
3878	__m64 mm4 = _mm_unpackhi_pi16 (_mm_setzero_si64(), mm1);
3879	__m64 mm5 = _mm_unpacklo_pi16 (_mm_setzero_si64(), mm2);
3880	__m64 mm6 = _mm_unpackhi_pi16 (_mm_setzero_si64(), mm2);
3881
3882	*(__m64 *)(dst + 0) = mm3;
3883	*(__m64 *)(dst + 2) = mm4;
3884	*(__m64 *)(dst + 4) = mm5;
3885	*(__m64 *)(dst + 6) = mm6;
3886
3887	dst += 8;
3888	src += 8;
3889	w -= 8;
3890    }
3891
3892    while (w)
3893    {
3894	*dst++ = *(src++) << 24;
3895	w--;
3896    }
3897
3898    _mm_empty ();
3899    return iter->buffer;
3900}
3901
3902typedef struct
3903{
3904    pixman_format_code_t	format;
3905    pixman_iter_get_scanline_t	get_scanline;
3906} fetcher_info_t;
3907
3908static const fetcher_info_t fetchers[] =
3909{
3910    { PIXMAN_x8r8g8b8,		mmx_fetch_x8r8g8b8 },
3911    { PIXMAN_r5g6b5,		mmx_fetch_r5g6b5 },
3912    { PIXMAN_a8,		mmx_fetch_a8 },
3913    { PIXMAN_null }
3914};
3915
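/* Install one of the fetchers above as the source iterator when a narrow
 * (32-bit) scanline is requested and the image is a plain, untransformed
 * bits image whose samples cover the clip (the FLAGS test below); any other
 * case returns FALSE so that the fallback implementation provides the
 * iterator instead.
 */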
3916static pixman_bool_t
3917mmx_src_iter_init (pixman_implementation_t *imp, pixman_iter_t *iter)
3918{
3919    pixman_image_t *image = iter->image;
3920
3921#define FLAGS								\
3922    (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM |		\
3923     FAST_PATH_BITS_IMAGE | FAST_PATH_SAMPLES_COVER_CLIP_NEAREST)
3924
3925    if ((iter->iter_flags & ITER_NARROW)			&&
3926	(iter->image_flags & FLAGS) == FLAGS)
3927    {
3928	const fetcher_info_t *f;
3929
3930	for (f = &fetchers[0]; f->format != PIXMAN_null; f++)
3931	{
3932	    if (image->common.extended_format_code == f->format)
3933	    {
3934		uint8_t *b = (uint8_t *)image->bits.bits;
3935		int s = image->bits.rowstride * 4;
3936
3937		iter->bits = b + s * iter->y + iter->x * PIXMAN_FORMAT_BPP (f->format) / 8;
3938		iter->stride = s;
3939
3940		iter->get_scanline = f->get_scanline;
3941		return TRUE;
3942	    }
3943	}
3944    }
3945
3946    return FALSE;
3947}
3948
3949static const pixman_fast_path_t mmx_fast_paths[] =
3950{
3951    PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       r5g6b5,   mmx_composite_over_n_8_0565       ),
3952    PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       b5g6r5,   mmx_composite_over_n_8_0565       ),
3953    PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       a8r8g8b8, mmx_composite_over_n_8_8888       ),
3954    PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       x8r8g8b8, mmx_composite_over_n_8_8888       ),
3955    PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       a8b8g8r8, mmx_composite_over_n_8_8888       ),
3956    PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       x8b8g8r8, mmx_composite_over_n_8_8888       ),
3957    PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8r8g8b8, a8r8g8b8, mmx_composite_over_n_8888_8888_ca ),
3958    PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8r8g8b8, x8r8g8b8, mmx_composite_over_n_8888_8888_ca ),
3959    PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8r8g8b8, r5g6b5,   mmx_composite_over_n_8888_0565_ca ),
3960    PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8b8g8r8, a8b8g8r8, mmx_composite_over_n_8888_8888_ca ),
3961    PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8b8g8r8, x8b8g8r8, mmx_composite_over_n_8888_8888_ca ),
3962    PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8b8g8r8, b5g6r5,   mmx_composite_over_n_8888_0565_ca ),
3963    PIXMAN_STD_FAST_PATH    (OVER, pixbuf,   pixbuf,   a8r8g8b8, mmx_composite_over_pixbuf_8888    ),
3964    PIXMAN_STD_FAST_PATH    (OVER, pixbuf,   pixbuf,   x8r8g8b8, mmx_composite_over_pixbuf_8888    ),
3965    PIXMAN_STD_FAST_PATH    (OVER, pixbuf,   pixbuf,   r5g6b5,   mmx_composite_over_pixbuf_0565    ),
3966    PIXMAN_STD_FAST_PATH    (OVER, rpixbuf,  rpixbuf,  a8b8g8r8, mmx_composite_over_pixbuf_8888    ),
3967    PIXMAN_STD_FAST_PATH    (OVER, rpixbuf,  rpixbuf,  x8b8g8r8, mmx_composite_over_pixbuf_8888    ),
3968    PIXMAN_STD_FAST_PATH    (OVER, rpixbuf,  rpixbuf,  b5g6r5,   mmx_composite_over_pixbuf_0565    ),
3969    PIXMAN_STD_FAST_PATH    (OVER, x8r8g8b8, solid,    a8r8g8b8, mmx_composite_over_x888_n_8888    ),
3970    PIXMAN_STD_FAST_PATH    (OVER, x8r8g8b8, solid,    x8r8g8b8, mmx_composite_over_x888_n_8888    ),
3971    PIXMAN_STD_FAST_PATH    (OVER, x8b8g8r8, solid,    a8b8g8r8, mmx_composite_over_x888_n_8888    ),
3972    PIXMAN_STD_FAST_PATH    (OVER, x8b8g8r8, solid,    x8b8g8r8, mmx_composite_over_x888_n_8888    ),
3973    PIXMAN_STD_FAST_PATH    (OVER, a8r8g8b8, solid,    a8r8g8b8, mmx_composite_over_8888_n_8888    ),
3974    PIXMAN_STD_FAST_PATH    (OVER, a8r8g8b8, solid,    x8r8g8b8, mmx_composite_over_8888_n_8888    ),
3975    PIXMAN_STD_FAST_PATH    (OVER, a8b8g8r8, solid,    a8b8g8r8, mmx_composite_over_8888_n_8888    ),
3976    PIXMAN_STD_FAST_PATH    (OVER, a8b8g8r8, solid,    x8b8g8r8, mmx_composite_over_8888_n_8888    ),
3977    PIXMAN_STD_FAST_PATH    (OVER, x8r8g8b8, a8,       x8r8g8b8, mmx_composite_over_x888_8_8888    ),
3978    PIXMAN_STD_FAST_PATH    (OVER, x8r8g8b8, a8,       a8r8g8b8, mmx_composite_over_x888_8_8888    ),
3979    PIXMAN_STD_FAST_PATH    (OVER, x8b8g8r8, a8,       x8b8g8r8, mmx_composite_over_x888_8_8888    ),
3980    PIXMAN_STD_FAST_PATH    (OVER, x8b8g8r8, a8,       a8b8g8r8, mmx_composite_over_x888_8_8888    ),
3981    PIXMAN_STD_FAST_PATH    (OVER, solid,    null,     a8r8g8b8, mmx_composite_over_n_8888         ),
3982    PIXMAN_STD_FAST_PATH    (OVER, solid,    null,     x8r8g8b8, mmx_composite_over_n_8888         ),
3983    PIXMAN_STD_FAST_PATH    (OVER, solid,    null,     r5g6b5,   mmx_composite_over_n_0565         ),
3984    PIXMAN_STD_FAST_PATH    (OVER, solid,    null,     b5g6r5,   mmx_composite_over_n_0565         ),
3985    PIXMAN_STD_FAST_PATH    (OVER, x8r8g8b8, null,     x8r8g8b8, mmx_composite_copy_area           ),
3986    PIXMAN_STD_FAST_PATH    (OVER, x8b8g8r8, null,     x8b8g8r8, mmx_composite_copy_area           ),
3987
3988    PIXMAN_STD_FAST_PATH    (OVER, a8r8g8b8, null,     a8r8g8b8, mmx_composite_over_8888_8888      ),
3989    PIXMAN_STD_FAST_PATH    (OVER, a8r8g8b8, null,     x8r8g8b8, mmx_composite_over_8888_8888      ),
3990    PIXMAN_STD_FAST_PATH    (OVER, a8r8g8b8, null,     r5g6b5,   mmx_composite_over_8888_0565      ),
3991    PIXMAN_STD_FAST_PATH    (OVER, a8b8g8r8, null,     a8b8g8r8, mmx_composite_over_8888_8888      ),
3992    PIXMAN_STD_FAST_PATH    (OVER, a8b8g8r8, null,     x8b8g8r8, mmx_composite_over_8888_8888      ),
3993    PIXMAN_STD_FAST_PATH    (OVER, a8b8g8r8, null,     b5g6r5,   mmx_composite_over_8888_0565      ),
3994
3995    PIXMAN_STD_FAST_PATH    (OVER_REVERSE, solid, null, a8r8g8b8, mmx_composite_over_reverse_n_8888),
3996    PIXMAN_STD_FAST_PATH    (OVER_REVERSE, solid, null, a8b8g8r8, mmx_composite_over_reverse_n_8888),
3997
3998    PIXMAN_STD_FAST_PATH    (ADD,  r5g6b5,   null,     r5g6b5,   mmx_composite_add_0565_0565       ),
3999    PIXMAN_STD_FAST_PATH    (ADD,  b5g6r5,   null,     b5g6r5,   mmx_composite_add_0565_0565       ),
4000    PIXMAN_STD_FAST_PATH    (ADD,  a8r8g8b8, null,     a8r8g8b8, mmx_composite_add_8888_8888       ),
4001    PIXMAN_STD_FAST_PATH    (ADD,  a8b8g8r8, null,     a8b8g8r8, mmx_composite_add_8888_8888       ),
4002    PIXMAN_STD_FAST_PATH    (ADD,  a8,       null,     a8,       mmx_composite_add_8_8		   ),
4003    PIXMAN_STD_FAST_PATH    (ADD,  solid,    a8,       a8,       mmx_composite_add_n_8_8           ),
4004
4005    PIXMAN_STD_FAST_PATH    (SRC,  a8r8g8b8, null,     r5g6b5,   mmx_composite_src_x888_0565       ),
4006    PIXMAN_STD_FAST_PATH    (SRC,  a8b8g8r8, null,     b5g6r5,   mmx_composite_src_x888_0565       ),
4007    PIXMAN_STD_FAST_PATH    (SRC,  x8r8g8b8, null,     r5g6b5,   mmx_composite_src_x888_0565       ),
4008    PIXMAN_STD_FAST_PATH    (SRC,  x8b8g8r8, null,     b5g6r5,   mmx_composite_src_x888_0565       ),
4009    PIXMAN_STD_FAST_PATH    (SRC,  solid,    a8,       a8r8g8b8, mmx_composite_src_n_8_8888        ),
4010    PIXMAN_STD_FAST_PATH    (SRC,  solid,    a8,       x8r8g8b8, mmx_composite_src_n_8_8888        ),
4011    PIXMAN_STD_FAST_PATH    (SRC,  solid,    a8,       a8b8g8r8, mmx_composite_src_n_8_8888        ),
4012    PIXMAN_STD_FAST_PATH    (SRC,  solid,    a8,       x8b8g8r8, mmx_composite_src_n_8_8888        ),
4013    PIXMAN_STD_FAST_PATH    (SRC,  a8r8g8b8, null,     a8r8g8b8, mmx_composite_copy_area           ),
4014    PIXMAN_STD_FAST_PATH    (SRC,  a8b8g8r8, null,     a8b8g8r8, mmx_composite_copy_area           ),
4015    PIXMAN_STD_FAST_PATH    (SRC,  a8r8g8b8, null,     x8r8g8b8, mmx_composite_copy_area           ),
4016    PIXMAN_STD_FAST_PATH    (SRC,  a8b8g8r8, null,     x8b8g8r8, mmx_composite_copy_area           ),
4017    PIXMAN_STD_FAST_PATH    (SRC,  x8r8g8b8, null,     x8r8g8b8, mmx_composite_copy_area           ),
4018    PIXMAN_STD_FAST_PATH    (SRC,  x8b8g8r8, null,     x8b8g8r8, mmx_composite_copy_area           ),
4019    PIXMAN_STD_FAST_PATH    (SRC,  r5g6b5,   null,     r5g6b5,   mmx_composite_copy_area           ),
4020    PIXMAN_STD_FAST_PATH    (SRC,  b5g6r5,   null,     b5g6r5,   mmx_composite_copy_area           ),
4021
4022    PIXMAN_STD_FAST_PATH    (IN,   a8,       null,     a8,       mmx_composite_in_8_8              ),
4023    PIXMAN_STD_FAST_PATH    (IN,   solid,    a8,       a8,       mmx_composite_in_n_8_8            ),
4024
4025    SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8,          a8r8g8b8, mmx_8888_8888                     ),
4026    SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8,          x8r8g8b8, mmx_8888_8888                     ),
4027    SIMPLE_BILINEAR_FAST_PATH (SRC, x8r8g8b8,          x8r8g8b8, mmx_8888_8888                     ),
4028    SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8,          a8b8g8r8, mmx_8888_8888                     ),
4029    SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8,          x8b8g8r8, mmx_8888_8888                     ),
4030    SIMPLE_BILINEAR_FAST_PATH (SRC, x8b8g8r8,          x8b8g8r8, mmx_8888_8888                     ),
4031
4032    SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8,         x8r8g8b8, mmx_8888_8888                     ),
4033    SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8,         x8b8g8r8, mmx_8888_8888                     ),
4034    SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8,         a8r8g8b8, mmx_8888_8888                     ),
4035    SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8,         a8b8g8r8, mmx_8888_8888                     ),
4036
4037    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, mmx_8888_8_8888                   ),
4038    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, mmx_8888_8_8888                   ),
4039    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, mmx_8888_8_8888                   ),
4040    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, mmx_8888_8_8888                   ),
4041
4042    { PIXMAN_OP_NONE },
4043};
4044
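/* Create the MMX implementation.  It layers the fast-path table and the
 * combiner/blt/fill/iterator hooks installed below on top of the given
 * fallback implementation; anything not handled here is expected to be
 * delegated to that fallback by the generic dispatch code.
 */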
4045pixman_implementation_t *
4046_pixman_implementation_create_mmx (pixman_implementation_t *fallback)
4047{
4048    pixman_implementation_t *imp = _pixman_implementation_create (fallback, mmx_fast_paths);
4049
4050    imp->combine_32[PIXMAN_OP_OVER] = mmx_combine_over_u;
4051    imp->combine_32[PIXMAN_OP_OVER_REVERSE] = mmx_combine_over_reverse_u;
4052    imp->combine_32[PIXMAN_OP_IN] = mmx_combine_in_u;
4053    imp->combine_32[PIXMAN_OP_IN_REVERSE] = mmx_combine_in_reverse_u;
4054    imp->combine_32[PIXMAN_OP_OUT] = mmx_combine_out_u;
4055    imp->combine_32[PIXMAN_OP_OUT_REVERSE] = mmx_combine_out_reverse_u;
4056    imp->combine_32[PIXMAN_OP_ATOP] = mmx_combine_atop_u;
4057    imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = mmx_combine_atop_reverse_u;
4058    imp->combine_32[PIXMAN_OP_XOR] = mmx_combine_xor_u;
4059    imp->combine_32[PIXMAN_OP_ADD] = mmx_combine_add_u;
4060    imp->combine_32[PIXMAN_OP_SATURATE] = mmx_combine_saturate_u;
4061
4062    imp->combine_32_ca[PIXMAN_OP_SRC] = mmx_combine_src_ca;
4063    imp->combine_32_ca[PIXMAN_OP_OVER] = mmx_combine_over_ca;
4064    imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = mmx_combine_over_reverse_ca;
4065    imp->combine_32_ca[PIXMAN_OP_IN] = mmx_combine_in_ca;
4066    imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = mmx_combine_in_reverse_ca;
4067    imp->combine_32_ca[PIXMAN_OP_OUT] = mmx_combine_out_ca;
4068    imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = mmx_combine_out_reverse_ca;
4069    imp->combine_32_ca[PIXMAN_OP_ATOP] = mmx_combine_atop_ca;
4070    imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = mmx_combine_atop_reverse_ca;
4071    imp->combine_32_ca[PIXMAN_OP_XOR] = mmx_combine_xor_ca;
4072    imp->combine_32_ca[PIXMAN_OP_ADD] = mmx_combine_add_ca;
4073
4074    imp->blt = mmx_blt;
4075    imp->fill = mmx_fill;
4076
4077    imp->src_iter_init = mmx_src_iter_init;
4078
4079    return imp;
4080}
4081
4082#endif /* USE_X86_MMX || USE_ARM_IWMMXT || USE_LOONGSON_MMI */
4083