1/* -*- Mode: c; c-basic-offset: 4; tab-width: 8; indent-tabs-mode: t; -*- */
2/*
3 * Copyright © 2000 SuSE, Inc.
4 * Copyright © 2007 Red Hat, Inc.
5 *
6 * Permission to use, copy, modify, distribute, and sell this software and its
7 * documentation for any purpose is hereby granted without fee, provided that
8 * the above copyright notice appear in all copies and that both that
9 * copyright notice and this permission notice appear in supporting
10 * documentation, and that the name of SuSE not be used in advertising or
11 * publicity pertaining to distribution of the software without specific,
12 * written prior permission.  SuSE makes no representations about the
13 * suitability of this software for any purpose.  It is provided "as is"
14 * without express or implied warranty.
15 *
16 * SuSE DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL
17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL SuSE
18 * BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
20 * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
21 * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
22 *
23 * Author:  Keith Packard, SuSE, Inc.
24 */
25
26#ifdef HAVE_CONFIG_H
27#include <config.h>
28#endif
29#include <string.h>
30#include <stdlib.h>
31#include "pixman-private.h"
32#include "pixman-combine32.h"
33#include "pixman-inlines.h"
34
35static force_inline uint32_t
36fetch_24 (uint8_t *a)
37{
38    if (((uintptr_t)a) & 1)
39    {
40#ifdef WORDS_BIGENDIAN
41	return (*a << 16) | (*(uint16_t *)(a + 1));
42#else
43	return *a | (*(uint16_t *)(a + 1) << 8);
44#endif
45    }
46    else
47    {
48#ifdef WORDS_BIGENDIAN
49	return (*(uint16_t *)a << 8) | *(a + 2);
50#else
51	return *(uint16_t *)a | (*(a + 2) << 16);
52#endif
53    }
54}
55
56static force_inline void
57store_24 (uint8_t *a,
58          uint32_t v)
59{
60    if (((uintptr_t)a) & 1)
61    {
62#ifdef WORDS_BIGENDIAN
63	*a = (uint8_t) (v >> 16);
64	*(uint16_t *)(a + 1) = (uint16_t) (v);
65#else
66	*a = (uint8_t) (v);
67	*(uint16_t *)(a + 1) = (uint16_t) (v >> 8);
68#endif
69    }
70    else
71    {
72#ifdef WORDS_BIGENDIAN
73	*(uint16_t *)a = (uint16_t)(v >> 8);
74	*(a + 2) = (uint8_t)v;
75#else
76	*(uint16_t *)a = (uint16_t)v;
77	*(a + 2) = (uint8_t)(v >> 16);
78#endif
79    }
80}
81
82static force_inline uint32_t
83over (uint32_t src,
84      uint32_t dest)
85{
86    uint32_t a = ~src >> 24;
87
88    UN8x4_MUL_UN8_ADD_UN8x4 (dest, a, src);
89
90    return dest;
91}
92
93static force_inline uint32_t
94in (uint32_t x,
95    uint8_t  y)
96{
97    uint16_t a = y;
98
99    UN8x4_MUL_UN8 (x, a);
100
101    return x;
102}
103
104/*
105 * Naming convention:
106 *
107 *  op_src_mask_dest
108 */
/* OVER of an x8r8g8b8 source (alpha forced to opaque) through an a8 mask
 * onto an a8r8g8b8 destination. */
static void
fast_composite_over_x888_8_8888 (pixman_implementation_t *imp,
                                 pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t    *src, *src_line;
    uint32_t    *dst, *dst_line;
    uint8_t     *mask, *mask_line;
    int src_stride, mask_stride, dst_stride;
    uint8_t m;
    uint32_t s, d;
    int32_t w;

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

    while (height--)
    {
	src = src_line;
	src_line += src_stride;
	dst = dst_line;
	dst_line += dst_stride;
	mask = mask_line;
	mask_line += mask_stride;

	w = width;
	while (w--)
	{
	    m = *mask++;
	    if (m)
	    {
		/* Treat the x888 source pixel as fully opaque. */
		s = *src | 0xff000000;

		if (m == 0xff)
		{
		    /* Full coverage and opaque source: plain copy. */
		    *dst = s;
		}
		else
		{
		    /* Partial coverage: scale source by mask, then blend. */
		    d = in (s, m);
		    *dst = over (d, *dst);
		}
	    }
	    /* m == 0: destination pixel is left untouched. */
	    src++;
	    dst++;
	}
    }
}
158
/* IN of a solid source into an a8 destination through an a8 mask:
 * dst = dst * mask * src.alpha.  Only alpha channels are involved. */
static void
fast_composite_in_n_8_8 (pixman_implementation_t *imp,
                         pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t src, srca;
    uint8_t     *dst_line, *dst;
    uint8_t     *mask_line, *mask, m;
    int dst_stride, mask_stride;
    int32_t w;
    uint16_t t;

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

    srca = src >> 24;

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);

    if (srca == 0xff)
    {
	/* Opaque source: dst = dst * mask. */
	while (height--)
	{
	    dst = dst_line;
	    dst_line += dst_stride;
	    mask = mask_line;
	    mask_line += mask_stride;
	    w = width;

	    while (w--)
	    {
		m = *mask++;

		if (m == 0)
		    *dst = 0;	/* zero coverage clears the pixel */
		else if (m != 0xff)
		    *dst = MUL_UN8 (m, *dst, t);
		/* m == 0xff leaves the destination pixel unchanged */

		dst++;
	    }
	}
    }
    else
    {
	/* General case: fold the source alpha into the mask first. */
	while (height--)
	{
	    dst = dst_line;
	    dst_line += dst_stride;
	    mask = mask_line;
	    mask_line += mask_stride;
	    w = width;

	    while (w--)
	    {
		m = *mask++;
		m = MUL_UN8 (m, srca, t);

		if (m == 0)
		    *dst = 0;
		else if (m != 0xff)
		    *dst = MUL_UN8 (m, *dst, t);

		dst++;
	    }
	}
    }
}
226
227static void
228fast_composite_in_8_8 (pixman_implementation_t *imp,
229                       pixman_composite_info_t *info)
230{
231    PIXMAN_COMPOSITE_ARGS (info);
232    uint8_t     *dst_line, *dst;
233    uint8_t     *src_line, *src;
234    int dst_stride, src_stride;
235    int32_t w;
236    uint8_t s;
237    uint16_t t;
238
239    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
240    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
241
242    while (height--)
243    {
244	dst = dst_line;
245	dst_line += dst_stride;
246	src = src_line;
247	src_line += src_stride;
248	w = width;
249
250	while (w--)
251	{
252	    s = *src++;
253
254	    if (s == 0)
255		*dst = 0;
256	    else if (s != 0xff)
257		*dst = MUL_UN8 (s, *dst, t);
258
259	    dst++;
260	}
261    }
262}
263
/* OVER of a solid source through an a8 mask onto an a8r8g8b8 destination. */
static void
fast_composite_over_n_8_8888 (pixman_implementation_t *imp,
                              pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t src, srca;
    uint32_t    *dst_line, *dst, d;
    uint8_t     *mask_line, *mask, m;
    int dst_stride, mask_stride;
    int32_t w;

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

    srca = src >> 24;
    if (src == 0)
	return;		/* fully transparent source: OVER is a no-op */

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);

    while (height--)
    {
	dst = dst_line;
	dst_line += dst_stride;
	mask = mask_line;
	mask_line += mask_stride;
	w = width;

	while (w--)
	{
	    m = *mask++;
	    if (m == 0xff)
	    {
		/* Full coverage: copy if opaque, otherwise blend. */
		if (srca == 0xff)
		    *dst = src;
		else
		    *dst = over (src, *dst);
	    }
	    else if (m)
	    {
		/* Partial coverage: scale source by mask, then blend. */
		d = in (src, m);
		*dst = over (d, *dst);
	    }
	    dst++;
	}
    }
}
311
/* ADD of a solid source through a component-alpha (a8r8g8b8) mask onto an
 * a8r8g8b8 destination: dst = dst + src * mask, per channel, saturating. */
static void
fast_composite_add_n_8888_8888_ca (pixman_implementation_t *imp,
				   pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t src, s;
    uint32_t    *dst_line, *dst, d;
    uint32_t    *mask_line, *mask, ma;
    int dst_stride, mask_stride;
    int32_t w;

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

    if (src == 0)
	return;		/* adding zero leaves the destination unchanged */

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);

    while (height--)
    {
	dst = dst_line;
	dst_line += dst_stride;
	mask = mask_line;
	mask_line += mask_stride;
	w = width;

	while (w--)
	{
	    ma = *mask++;

	    if (ma)
	    {
		d = *dst;
		s = src;

		/* s = src * mask + dst, channel-wise with saturation. */
		UN8x4_MUL_UN8x4_ADD_UN8x4 (s, ma, d);

		*dst = s;
	    }
	    /* ma == 0: nothing to add. */

	    dst++;
	}
    }
}
357
/* OVER of a solid source through a component-alpha (a8r8g8b8) mask onto an
 * a8r8g8b8 destination. */
static void
fast_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
                                    pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t src, srca, s;
    uint32_t    *dst_line, *dst, d;
    uint32_t    *mask_line, *mask, ma;
    int dst_stride, mask_stride;
    int32_t w;

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

    srca = src >> 24;
    if (src == 0)
	return;		/* fully transparent source: OVER is a no-op */

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);

    while (height--)
    {
	dst = dst_line;
	dst_line += dst_stride;
	mask = mask_line;
	mask_line += mask_stride;
	w = width;

	while (w--)
	{
	    ma = *mask++;
	    if (ma == 0xffffffff)
	    {
		/* Full coverage on all channels: behaves like plain OVER. */
		if (srca == 0xff)
		    *dst = src;
		else
		    *dst = over (src, *dst);
	    }
	    else if (ma)
	    {
		d = *dst;
		s = src;

		/* Component-alpha OVER:
		 *   s  = src * mask            (per-channel source)
		 *   ma = mask * src.alpha      (per-channel alpha)
		 *   d  = d * (1 - ma) + s
		 */
		UN8x4_MUL_UN8x4 (s, ma);
		UN8x4_MUL_UN8 (ma, srca);
		ma = ~ma;
		UN8x4_MUL_UN8x4_ADD_UN8x4 (d, ma, s);

		*dst = d;
	    }

	    dst++;
	}
    }
}
413
/* OVER of a solid source through an a8 mask onto a 24bpp (0888) destination.
 * Destination pixels are read/written with fetch_24/store_24. */
static void
fast_composite_over_n_8_0888 (pixman_implementation_t *imp,
                              pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t src, srca;
    uint8_t     *dst_line, *dst;
    uint32_t d;
    uint8_t     *mask_line, *mask, m;
    int dst_stride, mask_stride;
    int32_t w;

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

    srca = src >> 24;
    if (src == 0)
	return;		/* fully transparent source: OVER is a no-op */

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 3);
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);

    while (height--)
    {
	dst = dst_line;
	dst_line += dst_stride;
	mask = mask_line;
	mask_line += mask_stride;
	w = width;

	while (w--)
	{
	    m = *mask++;
	    if (m == 0xff)
	    {
		/* Full coverage: copy if opaque, otherwise blend in place. */
		if (srca == 0xff)
		{
		    d = src;
		}
		else
		{
		    d = fetch_24 (dst);
		    d = over (src, d);
		}
		store_24 (dst, d);
	    }
	    else if (m)
	    {
		/* Partial coverage: scale source by mask, then blend. */
		d = over (in (src, m), fetch_24 (dst));
		store_24 (dst, d);
	    }
	    dst += 3;	/* advance one 24-bit pixel */
	}
    }
}
468
/* OVER of a solid source through an a8 mask onto an r5g6b5 destination.
 * Blending is done in 8888 space; results are converted back to 0565. */
static void
fast_composite_over_n_8_0565 (pixman_implementation_t *imp,
                              pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t src, srca;
    uint16_t    *dst_line, *dst;
    uint32_t d;
    uint8_t     *mask_line, *mask, m;
    int dst_stride, mask_stride;
    int32_t w;

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

    srca = src >> 24;
    if (src == 0)
	return;		/* fully transparent source: OVER is a no-op */

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);

    while (height--)
    {
	dst = dst_line;
	dst_line += dst_stride;
	mask = mask_line;
	mask_line += mask_stride;
	w = width;

	while (w--)
	{
	    m = *mask++;
	    if (m == 0xff)
	    {
		/* Full coverage: copy if opaque, otherwise widen and blend. */
		if (srca == 0xff)
		{
		    d = src;
		}
		else
		{
		    d = *dst;
		    d = over (src, convert_0565_to_0888 (d));
		}
		*dst = convert_8888_to_0565 (d);
	    }
	    else if (m)
	    {
		/* Partial coverage: scale source by mask, then blend. */
		d = *dst;
		d = over (in (src, m), convert_0565_to_0888 (d));
		*dst = convert_8888_to_0565 (d);
	    }
	    dst++;
	}
    }
}
524
/* OVER of a solid source through a component-alpha (a8r8g8b8) mask onto an
 * r5g6b5 destination.  Blending is done in 8888 space. */
static void
fast_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
                                    pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t  src, srca, s;
    uint16_t  src16;
    uint16_t *dst_line, *dst;
    uint32_t  d;
    uint32_t *mask_line, *mask, ma;
    int dst_stride, mask_stride;
    int32_t w;

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

    srca = src >> 24;
    if (src == 0)
	return;		/* fully transparent source: OVER is a no-op */

    /* Precompute the 0565 form of the solid for the opaque fast case. */
    src16 = convert_8888_to_0565 (src);

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);

    while (height--)
    {
	dst = dst_line;
	dst_line += dst_stride;
	mask = mask_line;
	mask_line += mask_stride;
	w = width;

	while (w--)
	{
	    ma = *mask++;
	    if (ma == 0xffffffff)
	    {
		/* Full coverage on all channels. */
		if (srca == 0xff)
		{
		    *dst = src16;
		}
		else
		{
		    d = *dst;
		    d = over (src, convert_0565_to_0888 (d));
		    *dst = convert_8888_to_0565 (d);
		}
	    }
	    else if (ma)
	    {
		d = *dst;
		d = convert_0565_to_0888 (d);

		s = src;

		/* Component-alpha OVER (see fast_composite_over_n_8888_8888_ca). */
		UN8x4_MUL_UN8x4 (s, ma);
		UN8x4_MUL_UN8 (ma, srca);
		ma = ~ma;
		UN8x4_MUL_UN8x4_ADD_UN8x4 (d, ma, s);

		*dst = convert_8888_to_0565 (d);
	    }
	    dst++;
	}
    }
}
591
592static void
593fast_composite_over_8888_8888 (pixman_implementation_t *imp,
594                               pixman_composite_info_t *info)
595{
596    PIXMAN_COMPOSITE_ARGS (info);
597    uint32_t    *dst_line, *dst;
598    uint32_t    *src_line, *src, s;
599    int dst_stride, src_stride;
600    uint8_t a;
601    int32_t w;
602
603    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
604    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
605
606    while (height--)
607    {
608	dst = dst_line;
609	dst_line += dst_stride;
610	src = src_line;
611	src_line += src_stride;
612	w = width;
613
614	while (w--)
615	{
616	    s = *src++;
617	    a = s >> 24;
618	    if (a == 0xff)
619		*dst = s;
620	    else if (s)
621		*dst = over (s, *dst);
622	    dst++;
623	}
624    }
625}
626
627static void
628fast_composite_src_x888_8888 (pixman_implementation_t *imp,
629			      pixman_composite_info_t *info)
630{
631    PIXMAN_COMPOSITE_ARGS (info);
632    uint32_t    *dst_line, *dst;
633    uint32_t    *src_line, *src;
634    int dst_stride, src_stride;
635    int32_t w;
636
637    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
638    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
639
640    while (height--)
641    {
642	dst = dst_line;
643	dst_line += dst_stride;
644	src = src_line;
645	src_line += src_stride;
646	w = width;
647
648	while (w--)
649	    *dst++ = (*src++) | 0xff000000;
650    }
651}
652
/* NOTE(review): this OVER fast path for 24bpp (0888) destinations is
 * compiled out.  It is kept for reference; if re-enabled it must also be
 * registered in the fast path table. */
#if 0
static void
fast_composite_over_8888_0888 (pixman_implementation_t *imp,
			       pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint8_t     *dst_line, *dst;
    uint32_t d;
    uint32_t    *src_line, *src, s;
    uint8_t a;
    int dst_stride, src_stride;
    int32_t w;

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 3);
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

    while (height--)
    {
	dst = dst_line;
	dst_line += dst_stride;
	src = src_line;
	src_line += src_stride;
	w = width;

	while (w--)
	{
	    s = *src++;
	    a = s >> 24;
	    if (a)
	    {
		if (a == 0xff)
		    d = s;
		else
		    d = over (s, fetch_24 (dst));

		store_24 (dst, d);
	    }
	    dst += 3;
	}
    }
}
#endif
695
/* OVER of an a8r8g8b8 source onto an r5g6b5 destination.  Blending happens
 * in 8888 space; the result is converted back to 0565. */
static void
fast_composite_over_8888_0565 (pixman_implementation_t *imp,
                               pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint16_t    *dst_line, *dst;
    uint32_t d;
    uint32_t    *src_line, *src, s;
    uint8_t a;
    int dst_stride, src_stride;
    int32_t w;

    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);

    while (height--)
    {
	dst = dst_line;
	dst_line += dst_stride;
	src = src_line;
	src_line += src_stride;
	w = width;

	while (w--)
	{
	    s = *src++;
	    a = s >> 24;
	    /* NOTE(review): this tests the whole pixel rather than the alpha
	     * byte; for valid premultiplied data a == 0 implies s == 0, so
	     * the result is the same. */
	    if (s)
	    {
		if (a == 0xff)
		{
		    /* Opaque source: replace outright. */
		    d = s;
		}
		else
		{
		    d = *dst;
		    d = over (s, convert_0565_to_0888 (d));
		}
		*dst = convert_8888_to_0565 (d);
	    }
	    dst++;
	}
    }
}
740
741static void
742fast_composite_add_8_8 (pixman_implementation_t *imp,
743			pixman_composite_info_t *info)
744{
745    PIXMAN_COMPOSITE_ARGS (info);
746    uint8_t     *dst_line, *dst;
747    uint8_t     *src_line, *src;
748    int dst_stride, src_stride;
749    int32_t w;
750    uint8_t s, d;
751    uint16_t t;
752
753    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
754    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
755
756    while (height--)
757    {
758	dst = dst_line;
759	dst_line += dst_stride;
760	src = src_line;
761	src_line += src_stride;
762	w = width;
763
764	while (w--)
765	{
766	    s = *src++;
767	    if (s)
768	    {
769		if (s != 0xff)
770		{
771		    d = *dst;
772		    t = d + s;
773		    s = t | (0 - (t >> 8));
774		}
775		*dst = s;
776	    }
777	    dst++;
778	}
779    }
780}
781
/* ADD of two r5g6b5 images: both pixels are widened to 8888, added with
 * per-channel saturation, and the sum converted back to 0565. */
static void
fast_composite_add_0565_0565 (pixman_implementation_t *imp,
                              pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint16_t    *dst_line, *dst;
    uint32_t	d;
    uint16_t    *src_line, *src;
    uint32_t	s;
    int dst_stride, src_stride;
    int32_t w;

    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint16_t, src_stride, src_line, 1);
    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);

    while (height--)
    {
	dst = dst_line;
	dst_line += dst_stride;
	src = src_line;
	src_line += src_stride;
	w = width;

	while (w--)
	{
	    s = *src++;
	    if (s)
	    {
		d = *dst;
		s = convert_0565_to_8888 (s);
		if (d)
		{
		    /* Saturating per-channel add in 8888 space. */
		    d = convert_0565_to_8888 (d);
		    UN8x4_ADD_UN8x4 (s, d);
		}
		*dst = convert_8888_to_0565 (s);
	    }
	    /* s == 0: adding zero leaves the destination unchanged. */
	    dst++;
	}
    }
}
823
824static void
825fast_composite_add_8888_8888 (pixman_implementation_t *imp,
826                              pixman_composite_info_t *info)
827{
828    PIXMAN_COMPOSITE_ARGS (info);
829    uint32_t    *dst_line, *dst;
830    uint32_t    *src_line, *src;
831    int dst_stride, src_stride;
832    int32_t w;
833    uint32_t s, d;
834
835    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
836    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
837
838    while (height--)
839    {
840	dst = dst_line;
841	dst_line += dst_stride;
842	src = src_line;
843	src_line += src_stride;
844	w = width;
845
846	while (w--)
847	{
848	    s = *src++;
849	    if (s)
850	    {
851		if (s != 0xffffffff)
852		{
853		    d = *dst;
854		    if (d)
855			UN8x4_ADD_UN8x4 (s, d);
856		}
857		*dst = s;
858	    }
859	    dst++;
860	}
861    }
862}
863
/* ADD of a solid source through an a8 mask onto an a8 destination:
 * dst = clamp(dst + src.alpha * mask). */
static void
fast_composite_add_n_8_8 (pixman_implementation_t *imp,
			  pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint8_t     *dst_line, *dst;
    uint8_t     *mask_line, *mask;
    int dst_stride, mask_stride;
    int32_t w;
    uint32_t src;
    uint8_t sa;

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
    sa = (src >> 24);	/* only the source alpha participates */

    while (height--)
    {
	dst = dst_line;
	dst_line += dst_stride;
	mask = mask_line;
	mask_line += mask_stride;
	w = width;

	while (w--)
	{
	    uint16_t tmp;
	    uint16_t a;
	    uint32_t m, d;
	    uint32_t r;

	    a = *mask++;
	    d = *dst;

	    /* m = sa * mask; r = saturate(m + dst). */
	    m = MUL_UN8 (sa, a, tmp);
	    r = ADD_UN8 (m, d, tmp);

	    *dst++ = r;
	}
    }
}
906
/* Helpers for addressing individual bits in an a1 image stored as an array
 * of 32-bit words.  The in-word bit order depends on endianness:
 * CREATE_BITMASK(n) selects bit n of a word, UPDATE_BITMASK advances the
 * mask by one bit position in image order. */
#ifdef WORDS_BIGENDIAN
#define CREATE_BITMASK(n) (0x80000000 >> (n))
#define UPDATE_BITMASK(n) ((n) >> 1)
#else
#define CREATE_BITMASK(n) (1 << (n))
#define UPDATE_BITMASK(n) ((n) << 1)
#endif

#define TEST_BIT(p, n)					\
    (*((p) + ((n) >> 5)) & CREATE_BITMASK ((n) & 31))
/* do { } while (0) with NO trailing semicolon: the previous definition
 * ended in ';', which made "if (c) SET_BIT (p, n); else ..." a syntax
 * error (the extra semicolon detaches the else). */
#define SET_BIT(p, n)							\
    do { *((p) + ((n) >> 5)) |= CREATE_BITMASK ((n) & 31); } while (0)
919
/* ADD of two a1 images: for one-bit alpha, ADD degenerates to a bitwise OR
 * of source bits into the destination. */
static void
fast_composite_add_1_1 (pixman_implementation_t *imp,
			pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t     *dst_line, *dst;
    uint32_t     *src_line, *src;
    int           dst_stride, src_stride;
    int32_t       w;

    /* x is passed as 0: the horizontal offsets src_x/dest_x are applied
     * per-bit via TEST_BIT/SET_BIT below. */
    PIXMAN_IMAGE_GET_LINE (src_image, 0, src_y, uint32_t,
                           src_stride, src_line, 1);
    PIXMAN_IMAGE_GET_LINE (dest_image, 0, dest_y, uint32_t,
                           dst_stride, dst_line, 1);

    while (height--)
    {
	dst = dst_line;
	dst_line += dst_stride;
	src = src_line;
	src_line += src_stride;
	w = width;

	/* Walks the row right-to-left as w counts down. */
	while (w--)
	{
	    /*
	     * TODO: improve performance by processing uint32_t data instead
	     *       of individual bits
	     */
	    if (TEST_BIT (src, src_x + w))
		SET_BIT (dst, dest_x + w);
	}
    }
}
954
/* OVER of a solid source through an a1 mask onto an a8r8g8b8 destination.
 * Mask bits are read 32 at a time into 'bitcache'; 'bitmask' selects the
 * current bit and is advanced with UPDATE_BITMASK. */
static void
fast_composite_over_n_1_8888 (pixman_implementation_t *imp,
                              pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t     src, srca;
    uint32_t    *dst, *dst_line;
    uint32_t    *mask, *mask_line;
    int          mask_stride, dst_stride;
    uint32_t     bitcache, bitmask;
    int32_t      w;

    if (width <= 0)
	return;		/* also protects the unconditional first *mask++ read */

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
    srca = src >> 24;
    if (src == 0)
	return;		/* fully transparent source: OVER is a no-op */

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t,
                           dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (mask_image, 0, mask_y, uint32_t,
                           mask_stride, mask_line, 1);
    mask_line += mask_x >> 5;	/* word containing the first mask bit */

    if (srca == 0xff)
    {
	/* Opaque source: store it wherever the mask bit is set. */
	while (height--)
	{
	    dst = dst_line;
	    dst_line += dst_stride;
	    mask = mask_line;
	    mask_line += mask_stride;
	    w = width;

	    bitcache = *mask++;
	    bitmask = CREATE_BITMASK (mask_x & 31);

	    while (w--)
	    {
		if (bitmask == 0)
		{
		    /* Cache exhausted: load the next 32 mask bits. */
		    bitcache = *mask++;
		    bitmask = CREATE_BITMASK (0);
		}
		if (bitcache & bitmask)
		    *dst = src;
		bitmask = UPDATE_BITMASK (bitmask);
		dst++;
	    }
	}
    }
    else
    {
	/* Translucent source: blend wherever the mask bit is set. */
	while (height--)
	{
	    dst = dst_line;
	    dst_line += dst_stride;
	    mask = mask_line;
	    mask_line += mask_stride;
	    w = width;

	    bitcache = *mask++;
	    bitmask = CREATE_BITMASK (mask_x & 31);

	    while (w--)
	    {
		if (bitmask == 0)
		{
		    bitcache = *mask++;
		    bitmask = CREATE_BITMASK (0);
		}
		if (bitcache & bitmask)
		    *dst = over (src, *dst);
		bitmask = UPDATE_BITMASK (bitmask);
		dst++;
	    }
	}
    }
}
1036
/* OVER of a solid source through an a1 mask onto an r5g6b5 destination.
 * Same bitcache/bitmask scheme as fast_composite_over_n_1_8888, with
 * 0565 <-> 0888 conversion around the blend. */
static void
fast_composite_over_n_1_0565 (pixman_implementation_t *imp,
                              pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t     src, srca;
    uint16_t    *dst, *dst_line;
    uint32_t    *mask, *mask_line;
    int          mask_stride, dst_stride;
    uint32_t     bitcache, bitmask;
    int32_t      w;
    uint32_t     d;
    uint16_t     src565;

    if (width <= 0)
	return;		/* also protects the unconditional first *mask++ read */

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
    srca = src >> 24;
    if (src == 0)
	return;		/* fully transparent source: OVER is a no-op */

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t,
                           dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (mask_image, 0, mask_y, uint32_t,
                           mask_stride, mask_line, 1);
    mask_line += mask_x >> 5;	/* word containing the first mask bit */

    if (srca == 0xff)
    {
	/* Opaque source: precompute its 0565 form and store it directly. */
	src565 = convert_8888_to_0565 (src);
	while (height--)
	{
	    dst = dst_line;
	    dst_line += dst_stride;
	    mask = mask_line;
	    mask_line += mask_stride;
	    w = width;

	    bitcache = *mask++;
	    bitmask = CREATE_BITMASK (mask_x & 31);

	    while (w--)
	    {
		if (bitmask == 0)
		{
		    /* Cache exhausted: load the next 32 mask bits. */
		    bitcache = *mask++;
		    bitmask = CREATE_BITMASK (0);
		}
		if (bitcache & bitmask)
		    *dst = src565;
		bitmask = UPDATE_BITMASK (bitmask);
		dst++;
	    }
	}
    }
    else
    {
	/* Translucent source: widen, blend, and convert back. */
	while (height--)
	{
	    dst = dst_line;
	    dst_line += dst_stride;
	    mask = mask_line;
	    mask_line += mask_stride;
	    w = width;

	    bitcache = *mask++;
	    bitmask = CREATE_BITMASK (mask_x & 31);

	    while (w--)
	    {
		if (bitmask == 0)
		{
		    bitcache = *mask++;
		    bitmask = CREATE_BITMASK (0);
		}
		if (bitcache & bitmask)
		{
		    d = over (src, convert_0565_to_0888 (*dst));
		    *dst = convert_8888_to_0565 (d);
		}
		bitmask = UPDATE_BITMASK (bitmask);
		dst++;
	    }
	}
    }
}
1124
1125/*
1126 * Simple bitblt
1127 */
1128
1129static void
1130fast_composite_solid_fill (pixman_implementation_t *imp,
1131                           pixman_composite_info_t *info)
1132{
1133    PIXMAN_COMPOSITE_ARGS (info);
1134    uint32_t src;
1135
1136    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
1137
1138    if (dest_image->bits.format == PIXMAN_a1)
1139    {
1140	src = src >> 31;
1141    }
1142    else if (dest_image->bits.format == PIXMAN_a8)
1143    {
1144	src = src >> 24;
1145    }
1146    else if (dest_image->bits.format == PIXMAN_r5g6b5 ||
1147             dest_image->bits.format == PIXMAN_b5g6r5)
1148    {
1149	src = convert_8888_to_0565 (src);
1150    }
1151
1152    pixman_fill (dest_image->bits.bits, dest_image->bits.rowstride,
1153                 PIXMAN_FORMAT_BPP (dest_image->bits.format),
1154                 dest_x, dest_y,
1155                 width, height,
1156                 src);
1157}
1158
1159static void
1160fast_composite_src_memcpy (pixman_implementation_t *imp,
1161			   pixman_composite_info_t *info)
1162{
1163    PIXMAN_COMPOSITE_ARGS (info);
1164    int bpp = PIXMAN_FORMAT_BPP (dest_image->bits.format) / 8;
1165    uint32_t n_bytes = width * bpp;
1166    int dst_stride, src_stride;
1167    uint8_t    *dst;
1168    uint8_t    *src;
1169
1170    src_stride = src_image->bits.rowstride * 4;
1171    dst_stride = dest_image->bits.rowstride * 4;
1172
1173    src = (uint8_t *)src_image->bits.bits + src_y * src_stride + src_x * bpp;
1174    dst = (uint8_t *)dest_image->bits.bits + dest_y * dst_stride + dest_x * bpp;
1175
1176    while (height--)
1177    {
1178	memcpy (dst, src, n_bytes);
1179
1180	dst += dst_stride;
1181	src += src_stride;
1182    }
1183}
1184
/* Instantiate nearest-neighbour scaling fast paths (FAST_NEAREST is defined
 * in pixman-inlines.h): one scanline function per combination of source
 * format, destination format, operator, and repeat mode. */
FAST_NEAREST (8888_8888_cover, 8888, 8888, uint32_t, uint32_t, SRC, COVER)
FAST_NEAREST (8888_8888_none, 8888, 8888, uint32_t, uint32_t, SRC, NONE)
FAST_NEAREST (8888_8888_pad, 8888, 8888, uint32_t, uint32_t, SRC, PAD)
FAST_NEAREST (8888_8888_normal, 8888, 8888, uint32_t, uint32_t, SRC, NORMAL)
FAST_NEAREST (x888_8888_cover, x888, 8888, uint32_t, uint32_t, SRC, COVER)
FAST_NEAREST (x888_8888_pad, x888, 8888, uint32_t, uint32_t, SRC, PAD)
FAST_NEAREST (x888_8888_normal, x888, 8888, uint32_t, uint32_t, SRC, NORMAL)
FAST_NEAREST (8888_8888_cover, 8888, 8888, uint32_t, uint32_t, OVER, COVER)
FAST_NEAREST (8888_8888_none, 8888, 8888, uint32_t, uint32_t, OVER, NONE)
FAST_NEAREST (8888_8888_pad, 8888, 8888, uint32_t, uint32_t, OVER, PAD)
FAST_NEAREST (8888_8888_normal, 8888, 8888, uint32_t, uint32_t, OVER, NORMAL)
FAST_NEAREST (8888_565_cover, 8888, 0565, uint32_t, uint16_t, SRC, COVER)
FAST_NEAREST (8888_565_none, 8888, 0565, uint32_t, uint16_t, SRC, NONE)
FAST_NEAREST (8888_565_pad, 8888, 0565, uint32_t, uint16_t, SRC, PAD)
FAST_NEAREST (8888_565_normal, 8888, 0565, uint32_t, uint16_t, SRC, NORMAL)
FAST_NEAREST (565_565_normal, 0565, 0565, uint16_t, uint16_t, SRC, NORMAL)
FAST_NEAREST (8888_565_cover, 8888, 0565, uint32_t, uint16_t, OVER, COVER)
FAST_NEAREST (8888_565_none, 8888, 0565, uint32_t, uint16_t, OVER, NONE)
FAST_NEAREST (8888_565_pad, 8888, 0565, uint32_t, uint16_t, OVER, PAD)
FAST_NEAREST (8888_565_normal, 8888, 0565, uint32_t, uint16_t, OVER, NORMAL)
1205
/* Sources narrower than this are first replicated into a temporary
 * scanline so the inner composite runs over reasonably wide spans
 * (see fast_composite_tiled_repeat below). */
#define REPEAT_MIN_WIDTH    32
1207
1208static void
1209fast_composite_tiled_repeat (pixman_implementation_t *imp,
1210			     pixman_composite_info_t *info)
1211{
1212    PIXMAN_COMPOSITE_ARGS (info);
1213    pixman_composite_func_t func;
1214    pixman_format_code_t mask_format;
1215    uint32_t src_flags, mask_flags;
1216    int32_t sx, sy;
1217    int32_t width_remain;
1218    int32_t num_pixels;
1219    int32_t src_width;
1220    int32_t i, j;
1221    pixman_image_t extended_src_image;
1222    uint32_t extended_src[REPEAT_MIN_WIDTH * 2];
1223    pixman_bool_t need_src_extension;
1224    uint32_t *src_line;
1225    int32_t src_stride;
1226    int32_t src_bpp;
1227    pixman_composite_info_t info2 = *info;
1228
1229    src_flags = (info->src_flags & ~FAST_PATH_NORMAL_REPEAT) |
1230		    FAST_PATH_SAMPLES_COVER_CLIP_NEAREST;
1231
1232    if (mask_image)
1233    {
1234	mask_format = mask_image->common.extended_format_code;
1235	mask_flags = info->mask_flags;
1236    }
1237    else
1238    {
1239	mask_format = PIXMAN_null;
1240	mask_flags = FAST_PATH_IS_OPAQUE;
1241    }
1242
1243    _pixman_implementation_lookup_composite (
1244	imp->toplevel, info->op,
1245	src_image->common.extended_format_code, src_flags,
1246	mask_format, mask_flags,
1247	dest_image->common.extended_format_code, info->dest_flags,
1248	&imp, &func);
1249
1250    src_bpp = PIXMAN_FORMAT_BPP (src_image->bits.format);
1251
1252    if (src_image->bits.width < REPEAT_MIN_WIDTH		&&
1253	(src_bpp == 32 || src_bpp == 16 || src_bpp == 8)	&&
1254	!src_image->bits.indexed)
1255    {
1256	sx = src_x;
1257	sx = MOD (sx, src_image->bits.width);
1258	sx += width;
1259	src_width = 0;
1260
1261	while (src_width < REPEAT_MIN_WIDTH && src_width <= sx)
1262	    src_width += src_image->bits.width;
1263
1264	src_stride = (src_width * (src_bpp >> 3) + 3) / (int) sizeof (uint32_t);
1265
1266	/* Initialize/validate stack-allocated temporary image */
1267	_pixman_bits_image_init (&extended_src_image, src_image->bits.format,
1268				 src_width, 1, &extended_src[0], src_stride,
1269				 FALSE);
1270	_pixman_image_validate (&extended_src_image);
1271
1272	info2.src_image = &extended_src_image;
1273	need_src_extension = TRUE;
1274    }
1275    else
1276    {
1277	src_width = src_image->bits.width;
1278	need_src_extension = FALSE;
1279    }
1280
1281    sx = src_x;
1282    sy = src_y;
1283
1284    while (--height >= 0)
1285    {
1286	sx = MOD (sx, src_width);
1287	sy = MOD (sy, src_image->bits.height);
1288
1289	if (need_src_extension)
1290	{
1291	    if (src_bpp == 32)
1292	    {
1293		PIXMAN_IMAGE_GET_LINE (src_image, 0, sy, uint32_t, src_stride, src_line, 1);
1294
1295		for (i = 0; i < src_width; )
1296		{
1297		    for (j = 0; j < src_image->bits.width; j++, i++)
1298			extended_src[i] = src_line[j];
1299		}
1300	    }
1301	    else if (src_bpp == 16)
1302	    {
1303		uint16_t *src_line_16;
1304
1305		PIXMAN_IMAGE_GET_LINE (src_image, 0, sy, uint16_t, src_stride,
1306				       src_line_16, 1);
1307		src_line = (uint32_t*)src_line_16;
1308
1309		for (i = 0; i < src_width; )
1310		{
1311		    for (j = 0; j < src_image->bits.width; j++, i++)
1312			((uint16_t*)extended_src)[i] = ((uint16_t*)src_line)[j];
1313		}
1314	    }
1315	    else if (src_bpp == 8)
1316	    {
1317		uint8_t *src_line_8;
1318
1319		PIXMAN_IMAGE_GET_LINE (src_image, 0, sy, uint8_t, src_stride,
1320				       src_line_8, 1);
1321		src_line = (uint32_t*)src_line_8;
1322
1323		for (i = 0; i < src_width; )
1324		{
1325		    for (j = 0; j < src_image->bits.width; j++, i++)
1326			((uint8_t*)extended_src)[i] = ((uint8_t*)src_line)[j];
1327		}
1328	    }
1329
1330	    info2.src_y = 0;
1331	}
1332	else
1333	{
1334	    info2.src_y = sy;
1335	}
1336
1337	width_remain = width;
1338
1339	while (width_remain > 0)
1340	{
1341	    num_pixels = src_width - sx;
1342
1343	    if (num_pixels > width_remain)
1344		num_pixels = width_remain;
1345
1346	    info2.src_x = sx;
1347	    info2.width = num_pixels;
1348	    info2.height = 1;
1349
1350	    func (imp, &info2);
1351
1352	    width_remain -= num_pixels;
1353	    info2.mask_x += num_pixels;
1354	    info2.dest_x += num_pixels;
1355	    sx = 0;
1356	}
1357
1358	sx = src_x;
1359	sy++;
1360	info2.mask_x = info->mask_x;
1361	info2.mask_y++;
1362	info2.dest_x = info->dest_x;
1363	info2.dest_y++;
1364    }
1365
1366    if (need_src_extension)
1367	_pixman_image_fini (&extended_src_image);
1368}
1369
1370/* Use more unrolling for src_0565_0565 because it is typically CPU bound */
1371static force_inline void
1372scaled_nearest_scanline_565_565_SRC (uint16_t *       dst,
1373				     const uint16_t * src,
1374				     int32_t          w,
1375				     pixman_fixed_t   vx,
1376				     pixman_fixed_t   unit_x,
1377				     pixman_fixed_t   max_vx,
1378				     pixman_bool_t    fully_transparent_src)
1379{
1380    uint16_t tmp1, tmp2, tmp3, tmp4;
1381    while ((w -= 4) >= 0)
1382    {
1383	tmp1 = *(src + pixman_fixed_to_int (vx));
1384	vx += unit_x;
1385	tmp2 = *(src + pixman_fixed_to_int (vx));
1386	vx += unit_x;
1387	tmp3 = *(src + pixman_fixed_to_int (vx));
1388	vx += unit_x;
1389	tmp4 = *(src + pixman_fixed_to_int (vx));
1390	vx += unit_x;
1391	*dst++ = tmp1;
1392	*dst++ = tmp2;
1393	*dst++ = tmp3;
1394	*dst++ = tmp4;
1395    }
1396    if (w & 2)
1397    {
1398	tmp1 = *(src + pixman_fixed_to_int (vx));
1399	vx += unit_x;
1400	tmp2 = *(src + pixman_fixed_to_int (vx));
1401	vx += unit_x;
1402	*dst++ = tmp1;
1403	*dst++ = tmp2;
1404    }
1405    if (w & 1)
1406	*dst = *(src + pixman_fixed_to_int (vx));
1407}
1408
/* Instantiate the SRC 0565 -> 0565 nearest-scaling main loops around the
 * scanline function above, for the COVER, NONE and PAD repeat modes. */
FAST_NEAREST_MAINLOOP (565_565_cover_SRC,
		       scaled_nearest_scanline_565_565_SRC,
		       uint16_t, uint16_t, COVER)
FAST_NEAREST_MAINLOOP (565_565_none_SRC,
		       scaled_nearest_scanline_565_565_SRC,
		       uint16_t, uint16_t, NONE)
FAST_NEAREST_MAINLOOP (565_565_pad_SRC,
		       scaled_nearest_scanline_565_565_SRC,
		       uint16_t, uint16_t, PAD)
1418
1419static force_inline uint32_t
1420fetch_nearest (pixman_repeat_t src_repeat,
1421	       pixman_format_code_t format,
1422	       uint32_t *src, int x, int src_width)
1423{
1424    if (repeat (src_repeat, &x, src_width))
1425    {
1426	if (format == PIXMAN_x8r8g8b8 || format == PIXMAN_x8b8g8r8)
1427	    return *(src + x) | 0xff000000;
1428	else
1429	    return *(src + x);
1430    }
1431    else
1432    {
1433	return 0;
1434    }
1435}
1436
1437static force_inline void
1438combine_over (uint32_t s, uint32_t *dst)
1439{
1440    if (s)
1441    {
1442	uint8_t ia = 0xff - (s >> 24);
1443
1444	if (ia)
1445	    UN8x4_MUL_UN8_ADD_UN8x4 (*dst, ia, s);
1446	else
1447	    *dst = s;
1448    }
1449}
1450
/* SRC operator: the source pixel simply replaces the destination. */
static force_inline void
combine_src (uint32_t s, uint32_t *dst)
{
    *dst = s;
}
1456
/*
 * Generic nearest-neighbour scaled compositing for 32 bpp sources and
 * destinations with the SRC or OVER operator.  Only the diagonal entries
 * of the transform matrix are used as per-pixel increments, i.e. this
 * assumes a pure scale + translate transform (no rotation/shear) --
 * presumably guaranteed by the fast-path flags of whoever registers this
 * function; confirm against the path table.
 */
static void
fast_composite_scaled_nearest (pixman_implementation_t *imp,
			       pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t       *dst_line;
    uint32_t       *src_line;
    int             dst_stride, src_stride;
    int		    src_width, src_height;
    pixman_repeat_t src_repeat;
    pixman_fixed_t unit_x, unit_y;
    pixman_format_code_t src_format;
    pixman_vector_t v;
    pixman_fixed_t vy;

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    /* pass in 0 instead of src_x and src_y because src_x and src_y need to be
     * transformed from destination space to source space
     */
    PIXMAN_IMAGE_GET_LINE (src_image, 0, 0, uint32_t, src_stride, src_line, 1);

    /* reference point is the center of the pixel */
    v.vector[0] = pixman_int_to_fixed (src_x) + pixman_fixed_1 / 2;
    v.vector[1] = pixman_int_to_fixed (src_y) + pixman_fixed_1 / 2;
    v.vector[2] = pixman_fixed_1;

    /* Nothing to composite if the transform cannot be applied. */
    if (!pixman_transform_point_3d (src_image->common.transform, &v))
	return;

    /* Per-destination-pixel steps in source space (diagonal only). */
    unit_x = src_image->common.transform->matrix[0][0];
    unit_y = src_image->common.transform->matrix[1][1];

    /* Round down to closest integer, ensuring that 0.5 rounds to 0, not 1 */
    v.vector[0] -= pixman_fixed_e;
    v.vector[1] -= pixman_fixed_e;

    src_height = src_image->bits.height;
    src_width = src_image->bits.width;
    src_repeat = src_image->common.repeat;
    src_format = src_image->bits.format;

    vy = v.vector[1];
    while (height--)
    {
        pixman_fixed_t vx = v.vector[0];
	int y = pixman_fixed_to_int (vy);
	uint32_t *dst = dst_line;

	dst_line += dst_stride;

        /* adjust the y location by a unit vector in the y direction
         * this is equivalent to transforming y+1 of the destination point to source space */
        vy += unit_y;

	/* Row maps outside the source: SRC clears the destination row to
	 * transparent black, OVER leaves it untouched. */
	if (!repeat (src_repeat, &y, src_height))
	{
	    if (op == PIXMAN_OP_SRC)
		memset (dst, 0, sizeof (*dst) * width);
	}
	else
	{
	    int w = width;

	    uint32_t *src = src_line + y * src_stride;

	    /* Two pixels per iteration; the operator check is hoisted out
	     * of the per-pixel work by the compiler (op is loop-invariant). */
	    while (w >= 2)
	    {
		uint32_t s1, s2;
		int x1, x2;

		x1 = pixman_fixed_to_int (vx);
		vx += unit_x;

		x2 = pixman_fixed_to_int (vx);
		vx += unit_x;

		w -= 2;

		s1 = fetch_nearest (src_repeat, src_format, src, x1, src_width);
		s2 = fetch_nearest (src_repeat, src_format, src, x2, src_width);

		if (op == PIXMAN_OP_OVER)
		{
		    combine_over (s1, dst++);
		    combine_over (s2, dst++);
		}
		else
		{
		    combine_src (s1, dst++);
		    combine_src (s2, dst++);
		}
	    }

	    /* Remaining odd pixel, if any. */
	    while (w--)
	    {
		uint32_t s;
		int x;

		x = pixman_fixed_to_int (vx);
		vx += unit_x;

		s = fetch_nearest (src_repeat, src_format, src, x, src_width);

		if (op == PIXMAN_OP_OVER)
		    combine_over (s, dst++);
		else
		    combine_src (s, dst++);
	    }
	}
    }
}
1568
/* Assumed CPU cache line size in bytes; used below to split rotated blits
 * into destination-cache-line-aligned vertical stripes. */
#define CACHE_LINE_SIZE 64

/*
 * FAST_SIMPLE_ROTATE instantiates, for one pixel type:
 *
 *  - blt_rotated_90_trivial_<suffix> / blt_rotated_270_trivial_<suffix>:
 *    straightforward pixel-at-a-time 90/270 degree rotated copies;
 *
 *  - blt_rotated_90_<suffix> / blt_rotated_270_<suffix>: the same
 *    operation split into cache-line-aligned vertical stripes of the
 *    destination (unaligned leading part, TILE_SIZE-wide aligned middle
 *    parts, unaligned trailing part), which helps when the destination
 *    stride is a multiple of the cache line size;
 *
 *  - fast_composite_rotate_90_<suffix> / fast_composite_rotate_270_<suffix>:
 *    composite entry points that derive the source top-left coordinate
 *    from the transform's translation column and call the blitters above.
 */
#define FAST_SIMPLE_ROTATE(suffix, pix_type)                                  \
                                                                              \
static void                                                                   \
blt_rotated_90_trivial_##suffix (pix_type       *dst,                         \
				 int             dst_stride,                  \
				 const pix_type *src,                         \
				 int             src_stride,                  \
				 int             w,                           \
				 int             h)                           \
{                                                                             \
    int x, y;                                                                 \
    for (y = 0; y < h; y++)                                                   \
    {                                                                         \
	const pix_type *s = src + (h - y - 1);                                \
	pix_type *d = dst + dst_stride * y;                                   \
	for (x = 0; x < w; x++)                                               \
	{                                                                     \
	    *d++ = *s;                                                        \
	    s += src_stride;                                                  \
	}                                                                     \
    }                                                                         \
}                                                                             \
                                                                              \
static void                                                                   \
blt_rotated_270_trivial_##suffix (pix_type       *dst,                        \
				  int             dst_stride,                 \
				  const pix_type *src,                        \
				  int             src_stride,                 \
				  int             w,                          \
				  int             h)                          \
{                                                                             \
    int x, y;                                                                 \
    for (y = 0; y < h; y++)                                                   \
    {                                                                         \
	const pix_type *s = src + src_stride * (w - 1) + y;                   \
	pix_type *d = dst + dst_stride * y;                                   \
	for (x = 0; x < w; x++)                                               \
	{                                                                     \
	    *d++ = *s;                                                        \
	    s -= src_stride;                                                  \
	}                                                                     \
    }                                                                         \
}                                                                             \
                                                                              \
static void                                                                   \
blt_rotated_90_##suffix (pix_type       *dst,                                 \
			 int             dst_stride,                          \
			 const pix_type *src,                                 \
			 int             src_stride,                          \
			 int             W,                                   \
			 int             H)                                   \
{                                                                             \
    int x;                                                                    \
    int leading_pixels = 0, trailing_pixels = 0;                              \
    const int TILE_SIZE = CACHE_LINE_SIZE / sizeof(pix_type);                 \
                                                                              \
    /*                                                                        \
     * split processing into handling destination as TILE_SIZExH cache line   \
     * aligned vertical stripes (optimistically assuming that destination     \
     * stride is a multiple of cache line, if not - it will be just a bit     \
     * slower)                                                                \
     */                                                                       \
                                                                              \
    if ((uintptr_t)dst & (CACHE_LINE_SIZE - 1))                               \
    {                                                                         \
	leading_pixels = TILE_SIZE - (((uintptr_t)dst &                       \
			    (CACHE_LINE_SIZE - 1)) / sizeof(pix_type));       \
	if (leading_pixels > W)                                               \
	    leading_pixels = W;                                               \
                                                                              \
	/* unaligned leading part NxH (where N < TILE_SIZE) */                \
	blt_rotated_90_trivial_##suffix (                                     \
	    dst,                                                              \
	    dst_stride,                                                       \
	    src,                                                              \
	    src_stride,                                                       \
	    leading_pixels,                                                   \
	    H);                                                               \
	                                                                      \
	dst += leading_pixels;                                                \
	src += leading_pixels * src_stride;                                   \
	W -= leading_pixels;                                                  \
    }                                                                         \
                                                                              \
    if ((uintptr_t)(dst + W) & (CACHE_LINE_SIZE - 1))                         \
    {                                                                         \
	trailing_pixels = (((uintptr_t)(dst + W) &                            \
			    (CACHE_LINE_SIZE - 1)) / sizeof(pix_type));       \
	if (trailing_pixels > W)                                              \
	    trailing_pixels = W;                                              \
	W -= trailing_pixels;                                                 \
    }                                                                         \
                                                                              \
    for (x = 0; x < W; x += TILE_SIZE)                                        \
    {                                                                         \
	/* aligned middle part TILE_SIZExH */                                 \
	blt_rotated_90_trivial_##suffix (                                     \
	    dst + x,                                                          \
	    dst_stride,                                                       \
	    src + src_stride * x,                                             \
	    src_stride,                                                       \
	    TILE_SIZE,                                                        \
	    H);                                                               \
    }                                                                         \
                                                                              \
    if (trailing_pixels)                                                      \
    {                                                                         \
	/* unaligned trailing part NxH (where N < TILE_SIZE) */               \
	blt_rotated_90_trivial_##suffix (                                     \
	    dst + W,                                                          \
	    dst_stride,                                                       \
	    src + W * src_stride,                                             \
	    src_stride,                                                       \
	    trailing_pixels,                                                  \
	    H);                                                               \
    }                                                                         \
}                                                                             \
                                                                              \
static void                                                                   \
blt_rotated_270_##suffix (pix_type       *dst,                                \
			  int             dst_stride,                         \
			  const pix_type *src,                                \
			  int             src_stride,                         \
			  int             W,                                  \
			  int             H)                                  \
{                                                                             \
    int x;                                                                    \
    int leading_pixels = 0, trailing_pixels = 0;                              \
    const int TILE_SIZE = CACHE_LINE_SIZE / sizeof(pix_type);                 \
                                                                              \
    /*                                                                        \
     * split processing into handling destination as TILE_SIZExH cache line   \
     * aligned vertical stripes (optimistically assuming that destination     \
     * stride is a multiple of cache line, if not - it will be just a bit     \
     * slower)                                                                \
     */                                                                       \
                                                                              \
    if ((uintptr_t)dst & (CACHE_LINE_SIZE - 1))                               \
    {                                                                         \
	leading_pixels = TILE_SIZE - (((uintptr_t)dst &                       \
			    (CACHE_LINE_SIZE - 1)) / sizeof(pix_type));       \
	if (leading_pixels > W)                                               \
	    leading_pixels = W;                                               \
                                                                              \
	/* unaligned leading part NxH (where N < TILE_SIZE) */                \
	blt_rotated_270_trivial_##suffix (                                    \
	    dst,                                                              \
	    dst_stride,                                                       \
	    src + src_stride * (W - leading_pixels),                          \
	    src_stride,                                                       \
	    leading_pixels,                                                   \
	    H);                                                               \
	                                                                      \
	dst += leading_pixels;                                                \
	W -= leading_pixels;                                                  \
    }                                                                         \
                                                                              \
    if ((uintptr_t)(dst + W) & (CACHE_LINE_SIZE - 1))                         \
    {                                                                         \
	trailing_pixels = (((uintptr_t)(dst + W) &                            \
			    (CACHE_LINE_SIZE - 1)) / sizeof(pix_type));       \
	if (trailing_pixels > W)                                              \
	    trailing_pixels = W;                                              \
	W -= trailing_pixels;                                                 \
	src += trailing_pixels * src_stride;                                  \
    }                                                                         \
                                                                              \
    for (x = 0; x < W; x += TILE_SIZE)                                        \
    {                                                                         \
	/* aligned middle part TILE_SIZExH */                                 \
	blt_rotated_270_trivial_##suffix (                                    \
	    dst + x,                                                          \
	    dst_stride,                                                       \
	    src + src_stride * (W - x - TILE_SIZE),                           \
	    src_stride,                                                       \
	    TILE_SIZE,                                                        \
	    H);                                                               \
    }                                                                         \
                                                                              \
    if (trailing_pixels)                                                      \
    {                                                                         \
	/* unaligned trailing part NxH (where N < TILE_SIZE) */               \
	blt_rotated_270_trivial_##suffix (                                    \
	    dst + W,                                                          \
	    dst_stride,                                                       \
	    src - trailing_pixels * src_stride,                               \
	    src_stride,                                                       \
	    trailing_pixels,                                                  \
	    H);                                                               \
    }                                                                         \
}                                                                             \
                                                                              \
static void                                                                   \
fast_composite_rotate_90_##suffix (pixman_implementation_t *imp,              \
				   pixman_composite_info_t *info)	      \
{									      \
    PIXMAN_COMPOSITE_ARGS (info);					      \
    pix_type       *dst_line;						      \
    pix_type       *src_line;                                                 \
    int             dst_stride, src_stride;                                   \
    int             src_x_t, src_y_t;                                         \
                                                                              \
    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, pix_type,              \
			   dst_stride, dst_line, 1);                          \
    src_x_t = -src_y + pixman_fixed_to_int (                                  \
				src_image->common.transform->matrix[0][2] +   \
				pixman_fixed_1 / 2 - pixman_fixed_e) - height;\
    src_y_t = src_x + pixman_fixed_to_int (                                   \
				src_image->common.transform->matrix[1][2] +   \
				pixman_fixed_1 / 2 - pixman_fixed_e);         \
    PIXMAN_IMAGE_GET_LINE (src_image, src_x_t, src_y_t, pix_type,             \
			   src_stride, src_line, 1);                          \
    blt_rotated_90_##suffix (dst_line, dst_stride, src_line, src_stride,      \
			     width, height);                                  \
}                                                                             \
                                                                              \
static void                                                                   \
fast_composite_rotate_270_##suffix (pixman_implementation_t *imp,             \
				    pixman_composite_info_t *info)            \
{                                                                             \
    PIXMAN_COMPOSITE_ARGS (info);					      \
    pix_type       *dst_line;						      \
    pix_type       *src_line;                                                 \
    int             dst_stride, src_stride;                                   \
    int             src_x_t, src_y_t;                                         \
                                                                              \
    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, pix_type,              \
			   dst_stride, dst_line, 1);                          \
    src_x_t = src_y + pixman_fixed_to_int (                                   \
				src_image->common.transform->matrix[0][2] +   \
				pixman_fixed_1 / 2 - pixman_fixed_e);         \
    src_y_t = -src_x + pixman_fixed_to_int (                                  \
				src_image->common.transform->matrix[1][2] +   \
				pixman_fixed_1 / 2 - pixman_fixed_e) - width; \
    PIXMAN_IMAGE_GET_LINE (src_image, src_x_t, src_y_t, pix_type,             \
			   src_stride, src_line, 1);                          \
    blt_rotated_270_##suffix (dst_line, dst_stride, src_line, src_stride,     \
			      width, height);                                 \
}
1810
/* Instantiate the 90/270 degree rotation fast paths for 8, 16 and 32 bpp
 * pixels. */
FAST_SIMPLE_ROTATE (8, uint8_t)
FAST_SIMPLE_ROTATE (565, uint16_t)
FAST_SIMPLE_ROTATE (8888, uint32_t)
1814
1815static const pixman_fast_path_t c_fast_paths[] =
1816{
1817    PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, fast_composite_over_n_8_0565),
1818    PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, fast_composite_over_n_8_0565),
1819    PIXMAN_STD_FAST_PATH (OVER, solid, a8, r8g8b8, fast_composite_over_n_8_0888),
1820    PIXMAN_STD_FAST_PATH (OVER, solid, a8, b8g8r8, fast_composite_over_n_8_0888),
1821    PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, fast_composite_over_n_8_8888),
1822    PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, fast_composite_over_n_8_8888),
1823    PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, fast_composite_over_n_8_8888),
1824    PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, fast_composite_over_n_8_8888),
1825    PIXMAN_STD_FAST_PATH (OVER, solid, a1, a8r8g8b8, fast_composite_over_n_1_8888),
1826    PIXMAN_STD_FAST_PATH (OVER, solid, a1, x8r8g8b8, fast_composite_over_n_1_8888),
1827    PIXMAN_STD_FAST_PATH (OVER, solid, a1, a8b8g8r8, fast_composite_over_n_1_8888),
1828    PIXMAN_STD_FAST_PATH (OVER, solid, a1, x8b8g8r8, fast_composite_over_n_1_8888),
1829    PIXMAN_STD_FAST_PATH (OVER, solid, a1, r5g6b5,   fast_composite_over_n_1_0565),
1830    PIXMAN_STD_FAST_PATH (OVER, solid, a1, b5g6r5,   fast_composite_over_n_1_0565),
1831    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, fast_composite_over_n_8888_8888_ca),
1832    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, fast_composite_over_n_8888_8888_ca),
1833    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, r5g6b5, fast_composite_over_n_8888_0565_ca),
1834    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, fast_composite_over_n_8888_8888_ca),
1835    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, fast_composite_over_n_8888_8888_ca),
1836    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, b5g6r5, fast_composite_over_n_8888_0565_ca),
1837    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, x8r8g8b8, fast_composite_over_x888_8_8888),
1838    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, a8r8g8b8, fast_composite_over_x888_8_8888),
1839    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, x8b8g8r8, fast_composite_over_x888_8_8888),
1840    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, a8b8g8r8, fast_composite_over_x888_8_8888),
1841    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, fast_composite_over_8888_8888),
1842    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, fast_composite_over_8888_8888),
1843    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, r5g6b5, fast_composite_over_8888_0565),
1844    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, fast_composite_over_8888_8888),
1845    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, fast_composite_over_8888_8888),
1846    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, b5g6r5, fast_composite_over_8888_0565),
1847    PIXMAN_STD_FAST_PATH (ADD, r5g6b5, null, r5g6b5, fast_composite_add_0565_0565),
1848    PIXMAN_STD_FAST_PATH (ADD, b5g6r5, null, b5g6r5, fast_composite_add_0565_0565),
1849    PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, fast_composite_add_8888_8888),
1850    PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, fast_composite_add_8888_8888),
1851    PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, fast_composite_add_8_8),
1852    PIXMAN_STD_FAST_PATH (ADD, a1, null, a1, fast_composite_add_1_1),
1853    PIXMAN_STD_FAST_PATH_CA (ADD, solid, a8r8g8b8, a8r8g8b8, fast_composite_add_n_8888_8888_ca),
1854    PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, fast_composite_add_n_8_8),
1855    PIXMAN_STD_FAST_PATH (SRC, solid, null, a8r8g8b8, fast_composite_solid_fill),
1856    PIXMAN_STD_FAST_PATH (SRC, solid, null, x8r8g8b8, fast_composite_solid_fill),
1857    PIXMAN_STD_FAST_PATH (SRC, solid, null, a8b8g8r8, fast_composite_solid_fill),
1858    PIXMAN_STD_FAST_PATH (SRC, solid, null, x8b8g8r8, fast_composite_solid_fill),
1859    PIXMAN_STD_FAST_PATH (SRC, solid, null, a1, fast_composite_solid_fill),
1860    PIXMAN_STD_FAST_PATH (SRC, solid, null, a8, fast_composite_solid_fill),
1861    PIXMAN_STD_FAST_PATH (SRC, solid, null, r5g6b5, fast_composite_solid_fill),
1862    PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, a8r8g8b8, fast_composite_src_x888_8888),
1863    PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, a8b8g8r8, fast_composite_src_x888_8888),
1864    PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, x8r8g8b8, fast_composite_src_memcpy),
1865    PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, a8r8g8b8, fast_composite_src_memcpy),
1866    PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, x8r8g8b8, fast_composite_src_memcpy),
1867    PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, x8b8g8r8, fast_composite_src_memcpy),
1868    PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, a8b8g8r8, fast_composite_src_memcpy),
1869    PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, x8b8g8r8, fast_composite_src_memcpy),
1870    PIXMAN_STD_FAST_PATH (SRC, b8g8r8a8, null, b8g8r8x8, fast_composite_src_memcpy),
1871    PIXMAN_STD_FAST_PATH (SRC, b8g8r8a8, null, b8g8r8a8, fast_composite_src_memcpy),
1872    PIXMAN_STD_FAST_PATH (SRC, b8g8r8x8, null, b8g8r8x8, fast_composite_src_memcpy),
1873    PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, fast_composite_src_memcpy),
1874    PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, b5g6r5, fast_composite_src_memcpy),
1875    PIXMAN_STD_FAST_PATH (SRC, r8g8b8, null, r8g8b8, fast_composite_src_memcpy),
1876    PIXMAN_STD_FAST_PATH (SRC, b8g8r8, null, b8g8r8, fast_composite_src_memcpy),
1877    PIXMAN_STD_FAST_PATH (SRC, x1r5g5b5, null, x1r5g5b5, fast_composite_src_memcpy),
1878    PIXMAN_STD_FAST_PATH (SRC, a1r5g5b5, null, x1r5g5b5, fast_composite_src_memcpy),
1879    PIXMAN_STD_FAST_PATH (SRC, a8, null, a8, fast_composite_src_memcpy),
1880    PIXMAN_STD_FAST_PATH (IN, a8, null, a8, fast_composite_in_8_8),
1881    PIXMAN_STD_FAST_PATH (IN, solid, a8, a8, fast_composite_in_n_8_8),
1882
1883    SIMPLE_NEAREST_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, 8888_8888),
1884    SIMPLE_NEAREST_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, 8888_8888),
1885    SIMPLE_NEAREST_FAST_PATH (SRC, x8b8g8r8, x8b8g8r8, 8888_8888),
1886    SIMPLE_NEAREST_FAST_PATH (SRC, a8b8g8r8, x8b8g8r8, 8888_8888),
1887
1888    SIMPLE_NEAREST_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, 8888_8888),
1889    SIMPLE_NEAREST_FAST_PATH (SRC, a8b8g8r8, a8b8g8r8, 8888_8888),
1890
1891    SIMPLE_NEAREST_FAST_PATH (SRC, x8r8g8b8, r5g6b5, 8888_565),
1892    SIMPLE_NEAREST_FAST_PATH (SRC, a8r8g8b8, r5g6b5, 8888_565),
1893
1894    SIMPLE_NEAREST_FAST_PATH (SRC, r5g6b5, r5g6b5, 565_565),
1895
1896    SIMPLE_NEAREST_FAST_PATH_COVER (SRC, x8r8g8b8, a8r8g8b8, x888_8888),
1897    SIMPLE_NEAREST_FAST_PATH_COVER (SRC, x8b8g8r8, a8b8g8r8, x888_8888),
1898    SIMPLE_NEAREST_FAST_PATH_PAD (SRC, x8r8g8b8, a8r8g8b8, x888_8888),
1899    SIMPLE_NEAREST_FAST_PATH_PAD (SRC, x8b8g8r8, a8b8g8r8, x888_8888),
1900    SIMPLE_NEAREST_FAST_PATH_NORMAL (SRC, x8r8g8b8, a8r8g8b8, x888_8888),
1901    SIMPLE_NEAREST_FAST_PATH_NORMAL (SRC, x8b8g8r8, a8b8g8r8, x888_8888),
1902
1903    SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, 8888_8888),
1904    SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, 8888_8888),
1905    SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, 8888_8888),
1906    SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, 8888_8888),
1907
1908    SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, r5g6b5, 8888_565),
1909
1910#define NEAREST_FAST_PATH(op,s,d)		\
1911    {   PIXMAN_OP_ ## op,			\
1912	PIXMAN_ ## s, SCALED_NEAREST_FLAGS,	\
1913	PIXMAN_null, 0,				\
1914	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,	\
1915	fast_composite_scaled_nearest,		\
1916    }
1917
1918    NEAREST_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8),
1919    NEAREST_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8),
1920    NEAREST_FAST_PATH (SRC, x8b8g8r8, x8b8g8r8),
1921    NEAREST_FAST_PATH (SRC, a8b8g8r8, x8b8g8r8),
1922
1923    NEAREST_FAST_PATH (SRC, x8r8g8b8, a8r8g8b8),
1924    NEAREST_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8),
1925    NEAREST_FAST_PATH (SRC, x8b8g8r8, a8b8g8r8),
1926    NEAREST_FAST_PATH (SRC, a8b8g8r8, a8b8g8r8),
1927
1928    NEAREST_FAST_PATH (OVER, x8r8g8b8, x8r8g8b8),
1929    NEAREST_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8),
1930    NEAREST_FAST_PATH (OVER, x8b8g8r8, x8b8g8r8),
1931    NEAREST_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8),
1932
1933    NEAREST_FAST_PATH (OVER, x8r8g8b8, a8r8g8b8),
1934    NEAREST_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8),
1935    NEAREST_FAST_PATH (OVER, x8b8g8r8, a8b8g8r8),
1936    NEAREST_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8),
1937
1938#define SIMPLE_ROTATE_FLAGS(angle)					  \
1939    (FAST_PATH_ROTATE_ ## angle ## _TRANSFORM	|			  \
1940     FAST_PATH_NEAREST_FILTER			|			  \
1941     FAST_PATH_SAMPLES_COVER_CLIP_NEAREST	|			  \
1942     FAST_PATH_STANDARD_FLAGS)
1943
1944#define SIMPLE_ROTATE_FAST_PATH(op,s,d,suffix)				  \
1945    {   PIXMAN_OP_ ## op,						  \
1946	PIXMAN_ ## s, SIMPLE_ROTATE_FLAGS (90),				  \
1947	PIXMAN_null, 0,							  \
1948	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				  \
1949	fast_composite_rotate_90_##suffix,				  \
1950    },									  \
1951    {   PIXMAN_OP_ ## op,						  \
1952	PIXMAN_ ## s, SIMPLE_ROTATE_FLAGS (270),			  \
1953	PIXMAN_null, 0,							  \
1954	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				  \
1955	fast_composite_rotate_270_##suffix,				  \
1956    }
1957
1958    SIMPLE_ROTATE_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, 8888),
1959    SIMPLE_ROTATE_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, 8888),
1960    SIMPLE_ROTATE_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, 8888),
1961    SIMPLE_ROTATE_FAST_PATH (SRC, r5g6b5, r5g6b5, 565),
1962    SIMPLE_ROTATE_FAST_PATH (SRC, a8, a8, 8),
1963
1964    /* Simple repeat fast path entry. */
1965    {	PIXMAN_OP_any,
1966	PIXMAN_any,
1967	(FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM | FAST_PATH_BITS_IMAGE |
1968	 FAST_PATH_NORMAL_REPEAT),
1969	PIXMAN_any, 0,
1970	PIXMAN_any, FAST_PATH_STD_DEST_FLAGS,
1971	fast_composite_tiled_repeat
1972    },
1973
1974    {   PIXMAN_OP_NONE	},
1975};
1976
/* A1_FILL_MASK(n, offs): 32-bit mask covering 'n' consecutive a1 pixels
 * starting at pixel offset 'offs' within a single 32-bit word.  On
 * big-endian hosts pixel 0 lives in the most significant bit, so the run
 * is placed at the top of the word; on little-endian hosts pixel 0 is
 * bit 0.  Requires 0 <= n <= 31: n == 32 would shift by the full width
 * of the type, which is undefined behavior in C.
 */
#ifdef WORDS_BIGENDIAN
#define A1_FILL_MASK(n, offs) (((1U << (n)) - 1) << (32 - (offs) - (n)))
#else
#define A1_FILL_MASK(n, offs) (((1U << (n)) - 1) << (offs))
#endif
1982
1983static force_inline void
1984pixman_fill1_line (uint32_t *dst, int offs, int width, int v)
1985{
1986    if (offs)
1987    {
1988	int leading_pixels = 32 - offs;
1989	if (leading_pixels >= width)
1990	{
1991	    if (v)
1992		*dst |= A1_FILL_MASK (width, offs);
1993	    else
1994		*dst &= ~A1_FILL_MASK (width, offs);
1995	    return;
1996	}
1997	else
1998	{
1999	    if (v)
2000		*dst++ |= A1_FILL_MASK (leading_pixels, offs);
2001	    else
2002		*dst++ &= ~A1_FILL_MASK (leading_pixels, offs);
2003	    width -= leading_pixels;
2004	}
2005    }
2006    while (width >= 32)
2007    {
2008	if (v)
2009	    *dst++ = 0xFFFFFFFF;
2010	else
2011	    *dst++ = 0;
2012	width -= 32;
2013    }
2014    if (width > 0)
2015    {
2016	if (v)
2017	    *dst |= A1_FILL_MASK (width, 0);
2018	else
2019	    *dst &= ~A1_FILL_MASK (width, 0);
2020    }
2021}
2022
/* Fill a width x height rectangle of a 1 bpp image with the low bit of
 * 'filler'.  'stride' is in 32-bit words; 'x' and 'width' are in pixels.
 */
static void
pixman_fill1 (uint32_t *bits,
              int       stride,
              int       x,
              int       y,
              int       width,
              int       height,
              uint32_t  filler)
{
    uint32_t *row = bits + y * stride + (x >> 5);
    int offs = x & 31;

    /* Test the bit value once, outside the loop, so the force_inline
     * line filler is always expanded with a constant 'v'.
     */
    if (filler & 1)
    {
	for (; height > 0; height--, row += stride)
	    pixman_fill1_line (row, offs, width, 1);
    }
    else
    {
	for (; height > 0; height--, row += stride)
	    pixman_fill1_line (row, offs, width, 0);
    }
}
2052
/* Fill a width x height rectangle of an 8 bpp image with the low byte of
 * 'filler'.  'stride' is in 32-bit words; 'x' and 'width' are in pixels
 * (bytes).  Uses memset per row instead of a hand-rolled byte loop.
 */
static void
pixman_fill8 (uint32_t *bits,
              int       stride,
              int       x,
              int       y,
              int       width,
              int       height,
              uint32_t  filler)
{
    int byte_stride = stride * (int) sizeof (uint32_t);
    uint8_t *dst = (uint8_t *) bits + y * byte_stride + x;
    uint8_t v = filler & 0xff;

    while (height--)
    {
	memset (dst, v, (size_t) width);
	dst += byte_stride;
    }
}
2077
/* Fill a width x height rectangle of a 16 bpp image with the low 16 bits
 * of 'filler'.  'stride' is in 32-bit words; 'x' and 'width' are in
 * pixels (16-bit units).
 */
static void
pixman_fill16 (uint32_t *bits,
               int       stride,
               int       x,
               int       y,
               int       width,
               int       height,
               uint32_t  filler)
{
    /* Convert the stride from 32-bit words to 16-bit units. */
    int short_stride =
	(stride * (int) sizeof (uint32_t)) / (int) sizeof (uint16_t);
    uint16_t *row = (uint16_t *) bits + y * short_stride + x;
    uint16_t v = (uint16_t) (filler & 0xffff);

    while (height--)
    {
	uint16_t *p = row;
	uint16_t *end = row + width;

	while (p < end)
	    *p++ = v;

	row += short_stride;
    }
}
2103
/* Fill a width x height rectangle of a 32 bpp image with 'filler'.
 * 'stride' is in 32-bit words; 'x' and 'width' are in pixels.
 */
static void
pixman_fill32 (uint32_t *bits,
               int       stride,
               int       x,
               int       y,
               int       width,
               int       height,
               uint32_t  filler)
{
    uint32_t *row = bits + y * stride + x;

    while (height--)
    {
	uint32_t *p = row;
	uint32_t *end = row + width;

	while (p < end)
	    *p++ = filler;

	row += stride;
    }
}
2125
2126static pixman_bool_t
2127fast_path_fill (pixman_implementation_t *imp,
2128                uint32_t *               bits,
2129                int                      stride,
2130                int                      bpp,
2131                int                      x,
2132                int                      y,
2133                int                      width,
2134                int                      height,
2135                uint32_t		 filler)
2136{
2137    switch (bpp)
2138    {
2139    case 1:
2140	pixman_fill1 (bits, stride, x, y, width, height, filler);
2141	break;
2142
2143    case 8:
2144	pixman_fill8 (bits, stride, x, y, width, height, filler);
2145	break;
2146
2147    case 16:
2148	pixman_fill16 (bits, stride, x, y, width, height, filler);
2149	break;
2150
2151    case 32:
2152	pixman_fill32 (bits, stride, x, y, width, height, filler);
2153	break;
2154
2155    default:
2156	return FALSE;
2157    }
2158
2159    return TRUE;
2160}
2161
2162/*****************************************************************************/
2163
/* Fetch one scanline of r5g6b5 pixels into iter->buffer, expanding each
 * pixel to 32 bpp: the 5/6/5 color fields are widened to 8 bits by bit
 * replication and the alpha byte is forced to 0xFF (opaque).  Returns
 * the filled buffer; 'mask' is unused.
 */
static uint32_t *
fast_fetch_r5g6b5 (pixman_iter_t *iter, const uint32_t *mask)
{
    int32_t w = iter->width;
    uint32_t *dst = iter->buffer;
    const uint16_t *src = (const uint16_t *)iter->bits;

    /* Advance to the next scanline for the following call. */
    iter->bits += iter->stride;

    /* Align the source buffer at 4 bytes boundary */
    if (w > 0 && ((uintptr_t)src & 3))
    {
	*dst++ = convert_0565_to_8888 (*src++);
	w--;
    }
    /* Process two pixels per iteration */
    while ((w -= 2) >= 0)
    {
	uint32_t sr, sb, sg, t0, t1;
	/* Load two 16-bit pixels with a single aligned 32-bit read. */
	uint32_t s = *(const uint32_t *)src;
	src += 2;
	/* Extract the red/blue/green fields of both pixels at once; the
	 * "x |= x >> k" steps replicate the high bits into the low bits
	 * to widen the 5- and 6-bit values to full 8-bit channels.
	 */
	sr = (s >> 8) & 0x00F800F8;
	sb = (s << 3) & 0x00F800F8;
	sg = (s >> 3) & 0x00FC00FC;
	sr |= sr >> 5;
	sb |= sb >> 5;
	sg |= sg >> 6;
	/* Reassemble the two output pixels with opaque alpha; the pixel
	 * order within the 32-bit load depends on host byte order.
	 */
	t0 = ((sr << 16) & 0x00FF0000) | ((sg << 8) & 0x0000FF00) |
	     (sb & 0xFF) | 0xFF000000;
	t1 = (sr & 0x00FF0000) | ((sg >> 8) & 0x0000FF00) |
	     (sb >> 16) | 0xFF000000;
#ifdef WORDS_BIGENDIAN
	*dst++ = t1;
	*dst++ = t0;
#else
	*dst++ = t0;
	*dst++ = t1;
#endif
    }
    /* After the loop w is -1 (one pixel left over) or -2 (none), so the
     * low bit tells whether a final odd pixel remains.
     */
    if (w & 1)
    {
	*dst = convert_0565_to_8888 (*src);
    }

    return iter->buffer;
}
2210
/* Dest "fetch" used when the existing destination pixels are ignored
 * (both ITER_IGNORE_RGB and ITER_IGNORE_ALPHA set): skip reading and
 * just advance the scanline pointer, returning the buffer unmodified.
 */
static uint32_t *
fast_dest_fetch_noop (pixman_iter_t *iter, const uint32_t *mask)
{
    iter->bits += iter->stride;
    return iter->buffer;
}
2217
/* Helper function for a workaround, which tries to ensure that 0x1F001F
 * constant is always allocated in a register on RISC architectures.
 */
static force_inline uint32_t
convert_8888_to_0565_workaround (uint32_t s, uint32_t x1F001F)
{
    /* Convert one x8r8g8b8 pixel to r5g6b5; the caller truncates the
     * result to 16 bits.  'x1F001F' must hold the constant 0x1F001F.
     */
    uint32_t a, b;
    a = (s >> 3) & x1F001F;	/* top 5 bits of red (16-20) and blue (0-4) */
    b = s & 0xFC00;		/* top 6 bits of green, still at bits 10-15 */
    a |= a >> 5;		/* copy red down into its final bits 11-15 */
    a |= b >> 5;		/* merge green into bits 5-10 */
    return a;
}
2231
/* Write one scanline back from iter->buffer (32 bpp) to the r5g6b5
 * destination.  iter->bits was already advanced by the fetch, so step
 * back one stride to address the row that was just processed.
 */
static void
fast_write_back_r5g6b5 (pixman_iter_t *iter)
{
    int32_t w = iter->width;
    uint16_t *dst = (uint16_t *)(iter->bits - iter->stride);
    const uint32_t *src = iter->buffer;
    /* Workaround to ensure that x1F001F variable is allocated in a register */
    static volatile uint32_t volatile_x1F001F = 0x1F001F;
    uint32_t x1F001F = volatile_x1F001F;

    /* Convert four pixels per iteration. */
    while ((w -= 4) >= 0)
    {
	uint32_t s1 = *src++;
	uint32_t s2 = *src++;
	uint32_t s3 = *src++;
	uint32_t s4 = *src++;
	*dst++ = convert_8888_to_0565_workaround (s1, x1F001F);
	*dst++ = convert_8888_to_0565_workaround (s2, x1F001F);
	*dst++ = convert_8888_to_0565_workaround (s3, x1F001F);
	*dst++ = convert_8888_to_0565_workaround (s4, x1F001F);
    }
    /* After the loop w is between -4 and -1, i.e. (w & 3) pixels remain:
     * bit 1 of w covers a leftover pair, bit 0 a final single pixel.
     */
    if (w & 2)
    {
	*dst++ = convert_8888_to_0565_workaround (*src++, x1F001F);
	*dst++ = convert_8888_to_0565_workaround (*src++, x1F001F);
    }
    if (w & 1)
    {
	*dst = convert_8888_to_0565_workaround (*src, x1F001F);
    }
}
2263
/* Maps a pixel format to its specialized scanline fetcher and (for dest
 * iterators) write-back routine.
 */
typedef struct
{
    pixman_format_code_t	format;
    pixman_iter_get_scanline_t	get_scanline;
    pixman_iter_write_back_t	write_back;
} fetcher_info_t;

/* Table of supported formats; PIXMAN_null terminates it. */
static const fetcher_info_t fetchers[] =
{
    { PIXMAN_r5g6b5, fast_fetch_r5g6b5, fast_write_back_r5g6b5 },
    { PIXMAN_null }
};
2276
2277static pixman_bool_t
2278fast_src_iter_init (pixman_implementation_t *imp, pixman_iter_t *iter)
2279{
2280    pixman_image_t *image = iter->image;
2281
2282#define FLAGS								\
2283    (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM |		\
2284     FAST_PATH_BITS_IMAGE | FAST_PATH_SAMPLES_COVER_CLIP_NEAREST)
2285
2286    if ((iter->iter_flags & ITER_NARROW)			&&
2287	(iter->image_flags & FLAGS) == FLAGS)
2288    {
2289	const fetcher_info_t *f;
2290
2291	for (f = &fetchers[0]; f->format != PIXMAN_null; f++)
2292	{
2293	    if (image->common.extended_format_code == f->format)
2294	    {
2295		uint8_t *b = (uint8_t *)image->bits.bits;
2296		int s = image->bits.rowstride * 4;
2297
2298		iter->bits = b + s * iter->y + iter->x * PIXMAN_FORMAT_BPP (f->format) / 8;
2299		iter->stride = s;
2300
2301		iter->get_scanline = f->get_scanline;
2302		return TRUE;
2303	    }
2304	}
2305    }
2306
2307    return FALSE;
2308}
2309
2310static pixman_bool_t
2311fast_dest_iter_init (pixman_implementation_t *imp, pixman_iter_t *iter)
2312{
2313    pixman_image_t *image = iter->image;
2314
2315    if ((iter->iter_flags & ITER_NARROW)		&&
2316	(iter->image_flags & FAST_PATH_STD_DEST_FLAGS) == FAST_PATH_STD_DEST_FLAGS)
2317    {
2318	const fetcher_info_t *f;
2319
2320	for (f = &fetchers[0]; f->format != PIXMAN_null; f++)
2321	{
2322	    if (image->common.extended_format_code == f->format)
2323	    {
2324		uint8_t *b = (uint8_t *)image->bits.bits;
2325		int s = image->bits.rowstride * 4;
2326
2327		iter->bits = b + s * iter->y + iter->x * PIXMAN_FORMAT_BPP (f->format) / 8;
2328		iter->stride = s;
2329
2330		if ((iter->iter_flags & (ITER_IGNORE_RGB | ITER_IGNORE_ALPHA)) ==
2331		    (ITER_IGNORE_RGB | ITER_IGNORE_ALPHA))
2332		{
2333		    iter->get_scanline = fast_dest_fetch_noop;
2334		}
2335		else
2336		{
2337		    iter->get_scanline = f->get_scanline;
2338		}
2339		iter->write_back = f->write_back;
2340		return TRUE;
2341	    }
2342	}
2343    }
2344    return FALSE;
2345}
2346
2347
2348pixman_implementation_t *
2349_pixman_implementation_create_fast_path (pixman_implementation_t *fallback)
2350{
2351    pixman_implementation_t *imp = _pixman_implementation_create (fallback, c_fast_paths);
2352
2353    imp->fill = fast_path_fill;
2354    imp->src_iter_init = fast_src_iter_init;
2355    imp->dest_iter_init = fast_dest_iter_init;
2356
2357    return imp;
2358}
2359