/*
 * Copyright © 2007 Luca Barbato
 *
 * Permission to use, copy, modify, distribute, and sell this software and its
 * documentation for any purpose is hereby granted without fee, provided that
 * the above copyright notice appear in all copies and that both that
 * copyright notice and this permission notice appear in supporting
 * documentation, and that the name of Luca Barbato not be used in advertising or
 * publicity pertaining to distribution of the software without specific,
 * written prior permission.  Luca Barbato makes no representations about the
 * suitability of this software for any purpose.  It is provided "as is"
 * without express or implied warranty.
 *
 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
 * SOFTWARE.
 *
 * Author:  Luca Barbato (lu_zero@gentoo.org)
 *
 * Based on fbmmx.c by Owen Taylor, Søren Sandmann and Nicholas Miell
 */

#include <config.h>
#include "pixman-private.h"
#include "pixman-combine32.h"
#include <altivec.h>

#define AVV(x...) {x}

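/* Replicate the alpha byte of each of the four ARGB pixels across all four
 * of its channels.  The permute indices assume big-endian ARGB, where alpha
 * is the most significant byte of every 32-bit pixel. */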
static force_inline vector unsigned int
splat_alpha (vector unsigned int pix)
{
    return vec_perm (pix, pix,
		     (vector unsigned char)AVV (
			 0x00, 0x00, 0x00, 0x00, 0x04, 0x04, 0x04, 0x04,
			 0x08, 0x08, 0x08, 0x08, 0x0C, 0x0C, 0x0C, 0x0C));
}

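/* Component-wise multiply of two vectors of four ARGB pixels, treating each
 * byte as a fraction of 255.  Each 16-bit product t = p * a + 0x80 is folded
 * with the usual (t + (t >> 8)) >> 8 trick, which is an exact rounded
 * division by 255. */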
static force_inline vector unsigned int
pix_multiply (vector unsigned int p, vector unsigned int a)
{
    vector unsigned short hi, lo, mod;

    /* unpack to short */
    hi = (vector unsigned short)
	vec_mergeh ((vector unsigned char)AVV (0),
		    (vector unsigned char)p);

    mod = (vector unsigned short)
	vec_mergeh ((vector unsigned char)AVV (0),
		    (vector unsigned char)a);

    hi = vec_mladd (hi, mod, (vector unsigned short)
                    AVV (0x0080, 0x0080, 0x0080, 0x0080,
                         0x0080, 0x0080, 0x0080, 0x0080));

    hi = vec_adds (hi, vec_sr (hi, vec_splat_u16 (8)));

    hi = vec_sr (hi, vec_splat_u16 (8));

    /* unpack to short */
    lo = (vector unsigned short)
	vec_mergel ((vector unsigned char)AVV (0),
		    (vector unsigned char)p);
    mod = (vector unsigned short)
	vec_mergel ((vector unsigned char)AVV (0),
		    (vector unsigned char)a);

    lo = vec_mladd (lo, mod, (vector unsigned short)
                    AVV (0x0080, 0x0080, 0x0080, 0x0080,
                         0x0080, 0x0080, 0x0080, 0x0080));

    lo = vec_adds (lo, vec_sr (lo, vec_splat_u16 (8)));

    lo = vec_sr (lo, vec_splat_u16 (8));

    return (vector unsigned int)vec_packsu (hi, lo);
}

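/* Saturating per-byte addition of two vectors of four ARGB pixels. */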
static force_inline vector unsigned int
pix_add (vector unsigned int a, vector unsigned int b)
{
    return (vector unsigned int)vec_adds ((vector unsigned char)a,
                                          (vector unsigned char)b);
}

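/* Compute x * a + y * b with per-channel saturation; the workhorse of the
 * ATOP and XOR combiners below. */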
static force_inline vector unsigned int
pix_add_mul (vector unsigned int x,
             vector unsigned int a,
             vector unsigned int y,
             vector unsigned int b)
{
    vector unsigned int t1, t2;

    t1 = pix_multiply (x, a);
    t2 = pix_multiply (y, b);

    return pix_add (t1, t2);
}

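/* Per-byte complement: 255 - x for every channel. */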
static force_inline vector unsigned int
negate (vector unsigned int src)
{
    return vec_nor (src, src);
}

/* dest*~srca + src */
static force_inline vector unsigned int
over (vector unsigned int src,
      vector unsigned int srca,
      vector unsigned int dest)
{
    vector unsigned char tmp = (vector unsigned char)
	pix_multiply (dest, negate (srca));

    tmp = vec_adds ((vector unsigned char)src, tmp);
    return (vector unsigned int)tmp;
}


/* in == pix_multiply, so this computes (src IN mask) OVER dest */
#define in_over(src, srca, mask, dest)					\
    over (pix_multiply (src, mask),					\
          pix_multiply (srca, mask), dest)


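/* AltiVec loads and stores are always 16-byte aligned, so an unaligned span
 * is read by loading the two aligned vectors that cover it and merging the
 * halves with vec_perm.  vec_lvsl computes the permute mask for such a load
 * and vec_lvsr the mask for the matching unaligned store. */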
#define COMPUTE_SHIFT_MASK(source)					\
    source ## _mask = vec_lvsl (0, source);

#define COMPUTE_SHIFT_MASKS(dest, source)				\
    dest ## _mask = vec_lvsl (0, dest);					\
    source ## _mask = vec_lvsl (0, source);				\
    store_mask = vec_lvsr (0, dest);

#define COMPUTE_SHIFT_MASKC(dest, source, mask)				\
    mask ## _mask = vec_lvsl (0, mask);					\
    dest ## _mask = vec_lvsl (0, dest);					\
    source ## _mask = vec_lvsl (0, source);				\
    store_mask = vec_lvsr (0, dest);

/* The macros below expect the caller to declare the temporary variables
 * (tmp1..tmp4, edges, the *_mask vectors and store_mask).
 * Note: tmp3 and tmp4 must remain untouched between LOAD_VECTORS* and
 * STORE_VECTOR, which reuses them to reconstruct the edge bytes!
 */

#define LOAD_VECTORS(dest, source)			  \
    tmp1 = (typeof(tmp1))vec_ld (0, source);		  \
    tmp2 = (typeof(tmp2))vec_ld (15, source);		  \
    tmp3 = (typeof(tmp3))vec_ld (0, dest);		  \
    v ## source = (typeof(v ## source))			  \
	vec_perm (tmp1, tmp2, source ## _mask);		  \
    tmp4 = (typeof(tmp4))vec_ld (15, dest);		  \
    v ## dest = (typeof(v ## dest))			  \
	vec_perm (tmp3, tmp4, dest ## _mask);

#define LOAD_VECTORSC(dest, source, mask)		  \
    tmp1 = (typeof(tmp1))vec_ld (0, source);		  \
    tmp2 = (typeof(tmp2))vec_ld (15, source);		  \
    tmp3 = (typeof(tmp3))vec_ld (0, dest);		  \
    v ## source = (typeof(v ## source))			  \
	vec_perm (tmp1, tmp2, source ## _mask);		  \
    tmp4 = (typeof(tmp4))vec_ld (15, dest);		  \
    tmp1 = (typeof(tmp1))vec_ld (0, mask);		  \
    v ## dest = (typeof(v ## dest))			  \
	vec_perm (tmp3, tmp4, dest ## _mask);		  \
    tmp2 = (typeof(tmp2))vec_ld (15, mask);		  \
    v ## mask = (typeof(v ## mask))			  \
	vec_perm (tmp1, tmp2, mask ## _mask);

#define LOAD_VECTORSM(dest, source, mask)				\
    LOAD_VECTORSC (dest, source, mask)					\
    v ## source = pix_multiply (v ## source,				\
                                splat_alpha (v ## mask));

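/* Write an unaligned vector back: rebuild the edge bytes that fall outside
 * the destination span from tmp3/tmp4, merge them with v##dest through the
 * store permute mask, and store the two covering aligned vectors. */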
#define STORE_VECTOR(dest)						\
    edges = vec_perm (tmp4, tmp3, dest ## _mask);			\
    tmp3 = vec_perm ((vector unsigned char)v ## dest, edges, store_mask); \
    tmp1 = vec_perm (edges, (vector unsigned char)v ## dest, store_mask); \
    vec_st ((vector unsigned int) tmp3, 15, dest);			\
    vec_st ((vector unsigned int) tmp1, 0, dest);

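/* OVER: dest = src + dest * (1 - src.alpha) */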
static void
vmx_combine_over_u_no_mask (uint32_t *      dest,
                            const uint32_t *src,
                            int             width)
{
    int i;
    vector unsigned int vdest, vsrc;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
	dest_mask, src_mask, store_mask;

    COMPUTE_SHIFT_MASKS (dest, src);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
    {
	LOAD_VECTORS (dest, src);

	vdest = over (vsrc, splat_alpha (vsrc), vdest);

	STORE_VECTOR (dest);

	src += 4;
	dest += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
	uint32_t s = src[i];
	uint32_t d = dest[i];
	uint32_t ia = ALPHA_8 (~s);

	UN8x4_MUL_UN8_ADD_UN8x4 (d, ia, s);

	dest[i] = d;
    }
}

static void
vmx_combine_over_u_mask (uint32_t *      dest,
                         const uint32_t *src,
                         const uint32_t *mask,
                         int             width)
{
    int i;
    vector unsigned int vdest, vsrc, vmask;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
	dest_mask, src_mask, mask_mask, store_mask;

    COMPUTE_SHIFT_MASKC (dest, src, mask);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
    {
	LOAD_VECTORSM (dest, src, mask);

	vdest = over (vsrc, splat_alpha (vsrc), vdest);

	STORE_VECTOR (dest);

	src += 4;
	dest += 4;
	mask += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
	uint32_t m = ALPHA_8 (mask[i]);
	uint32_t s = src[i];
	uint32_t d = dest[i];
	uint32_t ia;

	UN8x4_MUL_UN8 (s, m);

	ia = ALPHA_8 (~s);

	UN8x4_MUL_UN8_ADD_UN8x4 (d, ia, s);
	dest[i] = d;
    }
}

static void
vmx_combine_over_u (pixman_implementation_t *imp,
                    pixman_op_t              op,
                    uint32_t *               dest,
                    const uint32_t *         src,
                    const uint32_t *         mask,
                    int                      width)
{
    if (mask)
	vmx_combine_over_u_mask (dest, src, mask, width);
    else
	vmx_combine_over_u_no_mask (dest, src, width);
}

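/* OVER_REVERSE: dest = dest + src * (1 - dest.alpha) */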
static void
vmx_combine_over_reverse_u_no_mask (uint32_t *      dest,
                                    const uint32_t *src,
                                    int             width)
{
    int i;
    vector unsigned int vdest, vsrc;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
	dest_mask, src_mask, store_mask;

    COMPUTE_SHIFT_MASKS (dest, src);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
    {
	LOAD_VECTORS (dest, src);

	vdest = over (vdest, splat_alpha (vdest), vsrc);

	STORE_VECTOR (dest);

	src += 4;
	dest += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
	uint32_t s = src[i];
	uint32_t d = dest[i];
	uint32_t ia = ALPHA_8 (~d);

	UN8x4_MUL_UN8_ADD_UN8x4 (s, ia, d);
	dest[i] = s;
    }
}

static void
vmx_combine_over_reverse_u_mask (uint32_t *      dest,
                                 const uint32_t *src,
                                 const uint32_t *mask,
                                 int             width)
{
    int i;
    vector unsigned int vdest, vsrc, vmask;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
	dest_mask, src_mask, mask_mask, store_mask;

    COMPUTE_SHIFT_MASKC (dest, src, mask);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
    {
	LOAD_VECTORSM (dest, src, mask);

	vdest = over (vdest, splat_alpha (vdest), vsrc);

	STORE_VECTOR (dest);

	src += 4;
	dest += 4;
	mask += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
	uint32_t m = ALPHA_8 (mask[i]);
	uint32_t s = src[i];
	uint32_t d = dest[i];
	uint32_t ia = ALPHA_8 (~d);

	UN8x4_MUL_UN8 (s, m);

	UN8x4_MUL_UN8_ADD_UN8x4 (s, ia, d);
	dest[i] = s;
    }
}

static void
vmx_combine_over_reverse_u (pixman_implementation_t *imp,
                            pixman_op_t              op,
                            uint32_t *               dest,
                            const uint32_t *         src,
                            const uint32_t *         mask,
                            int                      width)
{
    if (mask)
	vmx_combine_over_reverse_u_mask (dest, src, mask, width);
    else
	vmx_combine_over_reverse_u_no_mask (dest, src, width);
}

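/* IN: dest = src * dest.alpha */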
static void
vmx_combine_in_u_no_mask (uint32_t *      dest,
                          const uint32_t *src,
                          int             width)
{
    int i;
    vector unsigned int vdest, vsrc;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
	dest_mask, src_mask, store_mask;

    COMPUTE_SHIFT_MASKS (dest, src);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
    {
	LOAD_VECTORS (dest, src);

	vdest = pix_multiply (vsrc, splat_alpha (vdest));

	STORE_VECTOR (dest);

	src += 4;
	dest += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
	uint32_t s = src[i];
	uint32_t a = ALPHA_8 (dest[i]);

	UN8x4_MUL_UN8 (s, a);
	dest[i] = s;
    }
}

static void
vmx_combine_in_u_mask (uint32_t *      dest,
                       const uint32_t *src,
                       const uint32_t *mask,
                       int             width)
{
    int i;
    vector unsigned int vdest, vsrc, vmask;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
	dest_mask, src_mask, mask_mask, store_mask;

    COMPUTE_SHIFT_MASKC (dest, src, mask);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
    {
	LOAD_VECTORSM (dest, src, mask);

	vdest = pix_multiply (vsrc, splat_alpha (vdest));

	STORE_VECTOR (dest);

	src += 4;
	dest += 4;
	mask += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
	uint32_t m = ALPHA_8 (mask[i]);
	uint32_t s = src[i];
	uint32_t a = ALPHA_8 (dest[i]);

	UN8x4_MUL_UN8 (s, m);
	UN8x4_MUL_UN8 (s, a);

	dest[i] = s;
    }
}

static void
vmx_combine_in_u (pixman_implementation_t *imp,
                  pixman_op_t              op,
                  uint32_t *               dest,
                  const uint32_t *         src,
                  const uint32_t *         mask,
                  int                      width)
{
    if (mask)
	vmx_combine_in_u_mask (dest, src, mask, width);
    else
	vmx_combine_in_u_no_mask (dest, src, width);
}

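/* IN_REVERSE: dest = dest * src.alpha */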
static void
vmx_combine_in_reverse_u_no_mask (uint32_t *      dest,
                                  const uint32_t *src,
                                  int             width)
{
    int i;
    vector unsigned int vdest, vsrc;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
	dest_mask, src_mask, store_mask;

    COMPUTE_SHIFT_MASKS (dest, src);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
    {
	LOAD_VECTORS (dest, src);

	vdest = pix_multiply (vdest, splat_alpha (vsrc));

	STORE_VECTOR (dest);

	src += 4;
	dest += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
	uint32_t d = dest[i];
	uint32_t a = ALPHA_8 (src[i]);

	UN8x4_MUL_UN8 (d, a);

	dest[i] = d;
    }
}

static void
vmx_combine_in_reverse_u_mask (uint32_t *      dest,
                               const uint32_t *src,
                               const uint32_t *mask,
                               int             width)
{
    int i;
    vector unsigned int vdest, vsrc, vmask;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
	dest_mask, src_mask, mask_mask, store_mask;

    COMPUTE_SHIFT_MASKC (dest, src, mask);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
    {
	LOAD_VECTORSM (dest, src, mask);

	vdest = pix_multiply (vdest, splat_alpha (vsrc));

	STORE_VECTOR (dest);

	src += 4;
	dest += 4;
	mask += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
	uint32_t m = ALPHA_8 (mask[i]);
	uint32_t d = dest[i];
	uint32_t a = src[i];

	UN8x4_MUL_UN8 (a, m);
	a = ALPHA_8 (a);
	UN8x4_MUL_UN8 (d, a);

	dest[i] = d;
    }
}

static void
vmx_combine_in_reverse_u (pixman_implementation_t *imp,
                          pixman_op_t              op,
                          uint32_t *               dest,
                          const uint32_t *         src,
                          const uint32_t *         mask,
                          int                      width)
{
    if (mask)
	vmx_combine_in_reverse_u_mask (dest, src, mask, width);
    else
	vmx_combine_in_reverse_u_no_mask (dest, src, width);
}

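/* OUT: dest = src * (1 - dest.alpha) */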
static void
vmx_combine_out_u_no_mask (uint32_t *      dest,
                           const uint32_t *src,
                           int             width)
{
    int i;
    vector unsigned int vdest, vsrc;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
	dest_mask, src_mask, store_mask;

    COMPUTE_SHIFT_MASKS (dest, src);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
    {
	LOAD_VECTORS (dest, src);

	vdest = pix_multiply (vsrc, splat_alpha (negate (vdest)));

	STORE_VECTOR (dest);

	src += 4;
	dest += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
	uint32_t s = src[i];
	uint32_t a = ALPHA_8 (~dest[i]);

	UN8x4_MUL_UN8 (s, a);

	dest[i] = s;
    }
}

static void
vmx_combine_out_u_mask (uint32_t *      dest,
                        const uint32_t *src,
                        const uint32_t *mask,
                        int             width)
{
    int i;
    vector unsigned int vdest, vsrc, vmask;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
	dest_mask, src_mask, mask_mask, store_mask;

    COMPUTE_SHIFT_MASKC (dest, src, mask);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
    {
	LOAD_VECTORSM (dest, src, mask);

	vdest = pix_multiply (vsrc, splat_alpha (negate (vdest)));

	STORE_VECTOR (dest);

	src += 4;
	dest += 4;
	mask += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
	uint32_t m = ALPHA_8 (mask[i]);
	uint32_t s = src[i];
	uint32_t a = ALPHA_8 (~dest[i]);

	UN8x4_MUL_UN8 (s, m);
	UN8x4_MUL_UN8 (s, a);

	dest[i] = s;
    }
}

static void
vmx_combine_out_u (pixman_implementation_t *imp,
                   pixman_op_t              op,
                   uint32_t *               dest,
                   const uint32_t *         src,
                   const uint32_t *         mask,
                   int                      width)
{
    if (mask)
	vmx_combine_out_u_mask (dest, src, mask, width);
    else
	vmx_combine_out_u_no_mask (dest, src, width);
}

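/* OUT_REVERSE: dest = dest * (1 - src.alpha) */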
static void
vmx_combine_out_reverse_u_no_mask (uint32_t *      dest,
                                   const uint32_t *src,
                                   int             width)
{
    int i;
    vector unsigned int vdest, vsrc;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
	dest_mask, src_mask, store_mask;

    COMPUTE_SHIFT_MASKS (dest, src);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
    {
	LOAD_VECTORS (dest, src);

	vdest = pix_multiply (vdest, splat_alpha (negate (vsrc)));

	STORE_VECTOR (dest);

	src += 4;
	dest += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
	uint32_t d = dest[i];
	uint32_t a = ALPHA_8 (~src[i]);

	UN8x4_MUL_UN8 (d, a);

	dest[i] = d;
    }
}

static void
vmx_combine_out_reverse_u_mask (uint32_t *      dest,
                                const uint32_t *src,
                                const uint32_t *mask,
                                int             width)
{
    int i;
    vector unsigned int vdest, vsrc, vmask;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
	dest_mask, src_mask, mask_mask, store_mask;

    COMPUTE_SHIFT_MASKC (dest, src, mask);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
    {
	LOAD_VECTORSM (dest, src, mask);

	vdest = pix_multiply (vdest, splat_alpha (negate (vsrc)));

	STORE_VECTOR (dest);

	src += 4;
	dest += 4;
	mask += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
	uint32_t m = ALPHA_8 (mask[i]);
	uint32_t d = dest[i];
	uint32_t a = src[i];

	UN8x4_MUL_UN8 (a, m);
	a = ALPHA_8 (~a);
	UN8x4_MUL_UN8 (d, a);

	dest[i] = d;
    }
}

static void
vmx_combine_out_reverse_u (pixman_implementation_t *imp,
                           pixman_op_t              op,
                           uint32_t *               dest,
                           const uint32_t *         src,
                           const uint32_t *         mask,
                           int                      width)
{
    if (mask)
	vmx_combine_out_reverse_u_mask (dest, src, mask, width);
    else
	vmx_combine_out_reverse_u_no_mask (dest, src, width);
}

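/* ATOP: dest = src * dest.alpha + dest * (1 - src.alpha) */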
static void
vmx_combine_atop_u_no_mask (uint32_t *      dest,
                            const uint32_t *src,
                            int             width)
{
    int i;
    vector unsigned int vdest, vsrc;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
	dest_mask, src_mask, store_mask;

    COMPUTE_SHIFT_MASKS (dest, src);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
    {
	LOAD_VECTORS (dest, src);

	vdest = pix_add_mul (vsrc, splat_alpha (vdest),
			     vdest, splat_alpha (negate (vsrc)));

	STORE_VECTOR (dest);

	src += 4;
	dest += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
	uint32_t s = src[i];
	uint32_t d = dest[i];
	uint32_t dest_a = ALPHA_8 (d);
	uint32_t src_ia = ALPHA_8 (~s);

	UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_a, d, src_ia);

	dest[i] = s;
    }
}

static void
vmx_combine_atop_u_mask (uint32_t *      dest,
                         const uint32_t *src,
                         const uint32_t *mask,
                         int             width)
{
    int i;
    vector unsigned int vdest, vsrc, vmask;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
	dest_mask, src_mask, mask_mask, store_mask;

    COMPUTE_SHIFT_MASKC (dest, src, mask);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
    {
	LOAD_VECTORSM (dest, src, mask);

	vdest = pix_add_mul (vsrc, splat_alpha (vdest),
			     vdest, splat_alpha (negate (vsrc)));

	STORE_VECTOR (dest);

	src += 4;
	dest += 4;
	mask += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
	uint32_t m = ALPHA_8 (mask[i]);
	uint32_t s = src[i];
	uint32_t d = dest[i];
	uint32_t dest_a = ALPHA_8 (d);
	uint32_t src_ia;

	UN8x4_MUL_UN8 (s, m);

	src_ia = ALPHA_8 (~s);

	UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_a, d, src_ia);

	dest[i] = s;
    }
}

static void
vmx_combine_atop_u (pixman_implementation_t *imp,
                    pixman_op_t              op,
                    uint32_t *               dest,
                    const uint32_t *         src,
                    const uint32_t *         mask,
                    int                      width)
{
    if (mask)
	vmx_combine_atop_u_mask (dest, src, mask, width);
    else
	vmx_combine_atop_u_no_mask (dest, src, width);
}

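/* ATOP_REVERSE: dest = dest * src.alpha + src * (1 - dest.alpha) */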
static void
vmx_combine_atop_reverse_u_no_mask (uint32_t *      dest,
                                    const uint32_t *src,
                                    int             width)
{
    int i;
    vector unsigned int vdest, vsrc;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
	dest_mask, src_mask, store_mask;

    COMPUTE_SHIFT_MASKS (dest, src);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
    {
	LOAD_VECTORS (dest, src);

	vdest = pix_add_mul (vdest, splat_alpha (vsrc),
			     vsrc, splat_alpha (negate (vdest)));

	STORE_VECTOR (dest);

	src += 4;
	dest += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
	uint32_t s = src[i];
	uint32_t d = dest[i];
	uint32_t src_a = ALPHA_8 (s);
	uint32_t dest_ia = ALPHA_8 (~d);

	UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_ia, d, src_a);

	dest[i] = s;
    }
}

static void
vmx_combine_atop_reverse_u_mask (uint32_t *      dest,
                                 const uint32_t *src,
                                 const uint32_t *mask,
                                 int             width)
{
    int i;
    vector unsigned int vdest, vsrc, vmask;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
	dest_mask, src_mask, mask_mask, store_mask;

    COMPUTE_SHIFT_MASKC (dest, src, mask);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
    {
	LOAD_VECTORSM (dest, src, mask);

	vdest = pix_add_mul (vdest, splat_alpha (vsrc),
			     vsrc, splat_alpha (negate (vdest)));

	STORE_VECTOR (dest);

	src += 4;
	dest += 4;
	mask += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
	uint32_t m = ALPHA_8 (mask[i]);
	uint32_t s = src[i];
	uint32_t d = dest[i];
	uint32_t src_a;
	uint32_t dest_ia = ALPHA_8 (~d);

	UN8x4_MUL_UN8 (s, m);

	src_a = ALPHA_8 (s);

	UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_ia, d, src_a);

	dest[i] = s;
    }
}

static void
vmx_combine_atop_reverse_u (pixman_implementation_t *imp,
                            pixman_op_t              op,
                            uint32_t *               dest,
                            const uint32_t *         src,
                            const uint32_t *         mask,
                            int                      width)
{
    if (mask)
	vmx_combine_atop_reverse_u_mask (dest, src, mask, width);
    else
	vmx_combine_atop_reverse_u_no_mask (dest, src, width);
}

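/* XOR: dest = src * (1 - dest.alpha) + dest * (1 - src.alpha) */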
static void
vmx_combine_xor_u_no_mask (uint32_t *      dest,
                           const uint32_t *src,
                           int             width)
{
    int i;
    vector unsigned int vdest, vsrc;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
	dest_mask, src_mask, store_mask;

    COMPUTE_SHIFT_MASKS (dest, src);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
    {
	LOAD_VECTORS (dest, src);

	vdest = pix_add_mul (vsrc, splat_alpha (negate (vdest)),
			     vdest, splat_alpha (negate (vsrc)));

	STORE_VECTOR (dest);

	src += 4;
	dest += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
	uint32_t s = src[i];
	uint32_t d = dest[i];
	uint32_t src_ia = ALPHA_8 (~s);
	uint32_t dest_ia = ALPHA_8 (~d);

	UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_ia, d, src_ia);

	dest[i] = s;
    }
}

static void
vmx_combine_xor_u_mask (uint32_t *      dest,
                        const uint32_t *src,
                        const uint32_t *mask,
                        int             width)
{
    int i;
    vector unsigned int vdest, vsrc, vmask;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
	dest_mask, src_mask, mask_mask, store_mask;

    COMPUTE_SHIFT_MASKC (dest, src, mask);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
    {
	LOAD_VECTORSM (dest, src, mask);

	vdest = pix_add_mul (vsrc, splat_alpha (negate (vdest)),
			     vdest, splat_alpha (negate (vsrc)));

	STORE_VECTOR (dest);

	src += 4;
	dest += 4;
	mask += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
	uint32_t m = ALPHA_8 (mask[i]);
	uint32_t s = src[i];
	uint32_t d = dest[i];
	uint32_t src_ia;
	uint32_t dest_ia = ALPHA_8 (~d);

	UN8x4_MUL_UN8 (s, m);

	src_ia = ALPHA_8 (~s);

	UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_ia, d, src_ia);

	dest[i] = s;
    }
}

static void
vmx_combine_xor_u (pixman_implementation_t *imp,
                   pixman_op_t              op,
                   uint32_t *               dest,
                   const uint32_t *         src,
                   const uint32_t *         mask,
                   int                      width)
{
    if (mask)
	vmx_combine_xor_u_mask (dest, src, mask, width);
    else
	vmx_combine_xor_u_no_mask (dest, src, width);
}

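/* ADD: dest = saturate (src + dest) */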
static void
vmx_combine_add_u_no_mask (uint32_t *      dest,
                           const uint32_t *src,
                           int             width)
{
    int i;
    vector unsigned int vdest, vsrc;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
	dest_mask, src_mask, store_mask;

    COMPUTE_SHIFT_MASKS (dest, src);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
    {
	LOAD_VECTORS (dest, src);

	vdest = pix_add (vsrc, vdest);

	STORE_VECTOR (dest);

	src += 4;
	dest += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
	uint32_t s = src[i];
	uint32_t d = dest[i];

	UN8x4_ADD_UN8x4 (d, s);

	dest[i] = d;
    }
}

static void
vmx_combine_add_u_mask (uint32_t *      dest,
                        const uint32_t *src,
                        const uint32_t *mask,
                        int             width)
{
    int i;
    vector unsigned int vdest, vsrc, vmask;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
	dest_mask, src_mask, mask_mask, store_mask;

    COMPUTE_SHIFT_MASKC (dest, src, mask);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
    {
	LOAD_VECTORSM (dest, src, mask);

	vdest = pix_add (vsrc, vdest);

	STORE_VECTOR (dest);

	src += 4;
	dest += 4;
	mask += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
	uint32_t m = ALPHA_8 (mask[i]);
	uint32_t s = src[i];
	uint32_t d = dest[i];

	UN8x4_MUL_UN8 (s, m);
	UN8x4_ADD_UN8x4 (d, s);

	dest[i] = d;
    }
}

static void
vmx_combine_add_u (pixman_implementation_t *imp,
                   pixman_op_t              op,
                   uint32_t *               dest,
                   const uint32_t *         src,
                   const uint32_t *         mask,
                   int                      width)
{
    if (mask)
	vmx_combine_add_u_mask (dest, src, mask, width);
    else
	vmx_combine_add_u_no_mask (dest, src, width);
}

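/* The *_ca ("component alpha") combiners below implement the same operators
 * with a full per-channel mask: each channel of the mask scales the
 * corresponding channel of the source instead of a single alpha value. */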
static void
vmx_combine_src_ca (pixman_implementation_t *imp,
                    pixman_op_t              op,
                    uint32_t *               dest,
                    const uint32_t *         src,
                    const uint32_t *         mask,
                    int                      width)
{
    int i;
    vector unsigned int vdest, vsrc, vmask;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
	dest_mask, mask_mask, src_mask, store_mask;

    COMPUTE_SHIFT_MASKC (dest, src, mask);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
    {
	LOAD_VECTORSC (dest, src, mask);

	vdest = pix_multiply (vsrc, vmask);

	STORE_VECTOR (dest);

	mask += 4;
	src += 4;
	dest += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
	uint32_t a = mask[i];
	uint32_t s = src[i];

	UN8x4_MUL_UN8x4 (s, a);

	dest[i] = s;
    }
}

static void
vmx_combine_over_ca (pixman_implementation_t *imp,
                     pixman_op_t              op,
                     uint32_t *               dest,
                     const uint32_t *         src,
                     const uint32_t *         mask,
                     int                      width)
{
    int i;
    vector unsigned int vdest, vsrc, vmask;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
	dest_mask, mask_mask, src_mask, store_mask;

    COMPUTE_SHIFT_MASKC (dest, src, mask);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
    {
	LOAD_VECTORSC (dest, src, mask);

	vdest = in_over (vsrc, splat_alpha (vsrc), vmask, vdest);

	STORE_VECTOR (dest);

	mask += 4;
	src += 4;
	dest += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
	uint32_t a = mask[i];
	uint32_t s = src[i];
	uint32_t d = dest[i];
	uint32_t sa = ALPHA_8 (s);

	UN8x4_MUL_UN8x4 (s, a);
	UN8x4_MUL_UN8 (a, sa);
	UN8x4_MUL_UN8x4_ADD_UN8x4 (d, ~a, s);

	dest[i] = d;
    }
}

static void
vmx_combine_over_reverse_ca (pixman_implementation_t *imp,
                             pixman_op_t              op,
                             uint32_t *               dest,
                             const uint32_t *         src,
                             const uint32_t *         mask,
                             int                      width)
{
    int i;
    vector unsigned int vdest, vsrc, vmask;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
	dest_mask, mask_mask, src_mask, store_mask;

    COMPUTE_SHIFT_MASKC (dest, src, mask);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
    {
	LOAD_VECTORSC (dest, src, mask);

	vdest = over (vdest, splat_alpha (vdest), pix_multiply (vsrc, vmask));

	STORE_VECTOR (dest);

	mask += 4;
	src += 4;
	dest += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
	uint32_t a = mask[i];
	uint32_t s = src[i];
	uint32_t d = dest[i];
	uint32_t ida = ALPHA_8 (~d);

	UN8x4_MUL_UN8x4 (s, a);
	UN8x4_MUL_UN8_ADD_UN8x4 (s, ida, d);

	dest[i] = s;
    }
}

static void
vmx_combine_in_ca (pixman_implementation_t *imp,
                   pixman_op_t              op,
                   uint32_t *               dest,
                   const uint32_t *         src,
                   const uint32_t *         mask,
                   int                      width)
{
    int i;
    vector unsigned int vdest, vsrc, vmask;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
	dest_mask, mask_mask, src_mask, store_mask;

    COMPUTE_SHIFT_MASKC (dest, src, mask);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
    {
	LOAD_VECTORSC (dest, src, mask);

	vdest = pix_multiply (pix_multiply (vsrc, vmask), splat_alpha (vdest));

	STORE_VECTOR (dest);

	src += 4;
	dest += 4;
	mask += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
	uint32_t a = mask[i];
	uint32_t s = src[i];
	uint32_t da = ALPHA_8 (dest[i]);

	UN8x4_MUL_UN8x4 (s, a);
	UN8x4_MUL_UN8 (s, da);

	dest[i] = s;
    }
}

static void
vmx_combine_in_reverse_ca (pixman_implementation_t *imp,
                           pixman_op_t              op,
                           uint32_t *               dest,
                           const uint32_t *         src,
                           const uint32_t *         mask,
                           int                      width)
{
    int i;
    vector unsigned int vdest, vsrc, vmask;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
	dest_mask, mask_mask, src_mask, store_mask;

    COMPUTE_SHIFT_MASKC (dest, src, mask);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
    {
	LOAD_VECTORSC (dest, src, mask);

	vdest = pix_multiply (vdest, pix_multiply (vmask, splat_alpha (vsrc)));

	STORE_VECTOR (dest);

	src += 4;
	dest += 4;
	mask += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
	uint32_t a = mask[i];
	uint32_t d = dest[i];
	uint32_t sa = ALPHA_8 (src[i]);

	UN8x4_MUL_UN8 (a, sa);
	UN8x4_MUL_UN8x4 (d, a);

	dest[i] = d;
    }
}

static void
vmx_combine_out_ca (pixman_implementation_t *imp,
                    pixman_op_t              op,
                    uint32_t *               dest,
                    const uint32_t *         src,
                    const uint32_t *         mask,
                    int                      width)
{
    int i;
    vector unsigned int vdest, vsrc, vmask;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
	dest_mask, mask_mask, src_mask, store_mask;

    COMPUTE_SHIFT_MASKC (dest, src, mask);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
    {
	LOAD_VECTORSC (dest, src, mask);

	vdest = pix_multiply (
	    pix_multiply (vsrc, vmask), splat_alpha (negate (vdest)));

	STORE_VECTOR (dest);

	src += 4;
	dest += 4;
	mask += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
	uint32_t a = mask[i];
	uint32_t s = src[i];
	uint32_t d = dest[i];
	uint32_t ida = ALPHA_8 (~d);

	UN8x4_MUL_UN8x4 (s, a);
	UN8x4_MUL_UN8 (s, ida);

	dest[i] = s;
    }
}

static void
vmx_combine_out_reverse_ca (pixman_implementation_t *imp,
                            pixman_op_t              op,
                            uint32_t *               dest,
                            const uint32_t *         src,
                            const uint32_t *         mask,
                            int                      width)
{
    int i;
    vector unsigned int vdest, vsrc, vmask;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
	dest_mask, mask_mask, src_mask, store_mask;

    COMPUTE_SHIFT_MASKC (dest, src, mask);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
    {
	LOAD_VECTORSC (dest, src, mask);

	vdest = pix_multiply (
	    vdest, negate (pix_multiply (vmask, splat_alpha (vsrc))));

	STORE_VECTOR (dest);

	src += 4;
	dest += 4;
	mask += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
	uint32_t a = mask[i];
	uint32_t s = src[i];
	uint32_t d = dest[i];
	uint32_t sa = ALPHA_8 (s);

	UN8x4_MUL_UN8 (a, sa);
	UN8x4_MUL_UN8x4 (d, ~a);

	dest[i] = d;
    }
}

static void
vmx_combine_atop_ca (pixman_implementation_t *imp,
                     pixman_op_t              op,
                     uint32_t *               dest,
                     const uint32_t *         src,
                     const uint32_t *         mask,
                     int                      width)
{
    int i;
    vector unsigned int vdest, vsrc, vmask, vsrca;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
	dest_mask, mask_mask, src_mask, store_mask;

    COMPUTE_SHIFT_MASKC (dest, src, mask);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
    {
	LOAD_VECTORSC (dest, src, mask);

	vsrca = splat_alpha (vsrc);

	vsrc = pix_multiply (vsrc, vmask);
	vmask = pix_multiply (vmask, vsrca);

	vdest = pix_add_mul (vsrc, splat_alpha (vdest),
			     negate (vmask), vdest);

	STORE_VECTOR (dest);

	src += 4;
	dest += 4;
	mask += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
	uint32_t a = mask[i];
	uint32_t s = src[i];
	uint32_t d = dest[i];
	uint32_t sa = ALPHA_8 (s);
	uint32_t da = ALPHA_8 (d);

	UN8x4_MUL_UN8x4 (s, a);
	UN8x4_MUL_UN8 (a, sa);
	UN8x4_MUL_UN8x4_ADD_UN8x4_MUL_UN8 (d, ~a, s, da);

	dest[i] = d;
    }
}

static void
vmx_combine_atop_reverse_ca (pixman_implementation_t *imp,
                             pixman_op_t              op,
                             uint32_t *               dest,
                             const uint32_t *         src,
                             const uint32_t *         mask,
                             int                      width)
{
    int i;
    vector unsigned int vdest, vsrc, vmask;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
	dest_mask, mask_mask, src_mask, store_mask;

    COMPUTE_SHIFT_MASKC (dest, src, mask);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
    {
	LOAD_VECTORSC (dest, src, mask);

	vdest = pix_add_mul (vdest,
			     pix_multiply (vmask, splat_alpha (vsrc)),
			     pix_multiply (vsrc, vmask),
			     negate (splat_alpha (vdest)));

	STORE_VECTOR (dest);

	src += 4;
	dest += 4;
	mask += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
	uint32_t a = mask[i];
	uint32_t s = src[i];
	uint32_t d = dest[i];
	uint32_t sa = ALPHA_8 (s);
	uint32_t da = ALPHA_8 (~d);

	UN8x4_MUL_UN8x4 (s, a);
	UN8x4_MUL_UN8 (a, sa);
	UN8x4_MUL_UN8x4_ADD_UN8x4_MUL_UN8 (d, a, s, da);

	dest[i] = d;
    }
}

static void
vmx_combine_xor_ca (pixman_implementation_t *imp,
                    pixman_op_t              op,
                    uint32_t *               dest,
                    const uint32_t *         src,
                    const uint32_t *         mask,
                    int                      width)
{
    int i;
    vector unsigned int vdest, vsrc, vmask;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
	dest_mask, mask_mask, src_mask, store_mask;

    COMPUTE_SHIFT_MASKC (dest, src, mask);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
    {
	LOAD_VECTORSC (dest, src, mask);

	vdest = pix_add_mul (vdest,
			     negate (pix_multiply (vmask, splat_alpha (vsrc))),
			     pix_multiply (vsrc, vmask),
			     negate (splat_alpha (vdest)));

	STORE_VECTOR (dest);

	src += 4;
	dest += 4;
	mask += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
	uint32_t a = mask[i];
	uint32_t s = src[i];
	uint32_t d = dest[i];
	uint32_t sa = ALPHA_8 (s);
	uint32_t da = ALPHA_8 (~d);

	UN8x4_MUL_UN8x4 (s, a);
	UN8x4_MUL_UN8 (a, sa);
	UN8x4_MUL_UN8x4_ADD_UN8x4_MUL_UN8 (d, ~a, s, da);

	dest[i] = d;
    }
}

static void
vmx_combine_add_ca (pixman_implementation_t *imp,
                    pixman_op_t              op,
                    uint32_t *               dest,
                    const uint32_t *         src,
                    const uint32_t *         mask,
                    int                      width)
{
    int i;
    vector unsigned int vdest, vsrc, vmask;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
	dest_mask, mask_mask, src_mask, store_mask;

    COMPUTE_SHIFT_MASKC (dest, src, mask);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
    {
	LOAD_VECTORSC (dest, src, mask);

	vdest = pix_add (pix_multiply (vsrc, vmask), vdest);

	STORE_VECTOR (dest);

	src += 4;
	dest += 4;
	mask += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
	uint32_t a = mask[i];
	uint32_t s = src[i];
	uint32_t d = dest[i];

	UN8x4_MUL_UN8x4 (s, a);
	UN8x4_ADD_UN8x4 (s, d);

	dest[i] = s;
    }
}

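/* No full fast paths are provided; the table holds only the PIXMAN_OP_NONE
 * sentinel, so this implementation contributes combiners only. */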
static const pixman_fast_path_t vmx_fast_paths[] =
{
    {   PIXMAN_OP_NONE	},
};

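/* Create the VMX implementation with `fallback' as the next delegate in the
 * chain; called from pixman's runtime CPU detection when AltiVec/VMX is
 * available. */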
pixman_implementation_t *
_pixman_implementation_create_vmx (pixman_implementation_t *fallback)
{
    pixman_implementation_t *imp = _pixman_implementation_create (fallback, vmx_fast_paths);

    /* Set up function pointers */

    imp->combine_32[PIXMAN_OP_OVER] = vmx_combine_over_u;
    imp->combine_32[PIXMAN_OP_OVER_REVERSE] = vmx_combine_over_reverse_u;
    imp->combine_32[PIXMAN_OP_IN] = vmx_combine_in_u;
    imp->combine_32[PIXMAN_OP_IN_REVERSE] = vmx_combine_in_reverse_u;
    imp->combine_32[PIXMAN_OP_OUT] = vmx_combine_out_u;
    imp->combine_32[PIXMAN_OP_OUT_REVERSE] = vmx_combine_out_reverse_u;
    imp->combine_32[PIXMAN_OP_ATOP] = vmx_combine_atop_u;
    imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = vmx_combine_atop_reverse_u;
    imp->combine_32[PIXMAN_OP_XOR] = vmx_combine_xor_u;
    imp->combine_32[PIXMAN_OP_ADD] = vmx_combine_add_u;

    imp->combine_32_ca[PIXMAN_OP_SRC] = vmx_combine_src_ca;
    imp->combine_32_ca[PIXMAN_OP_OVER] = vmx_combine_over_ca;
    imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = vmx_combine_over_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_IN] = vmx_combine_in_ca;
    imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = vmx_combine_in_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_OUT] = vmx_combine_out_ca;
    imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = vmx_combine_out_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_ATOP] = vmx_combine_atop_ca;
    imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = vmx_combine_atop_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_XOR] = vmx_combine_xor_ca;
    imp->combine_32_ca[PIXMAN_OP_ADD] = vmx_combine_add_ca;

    return imp;
}
