/*
 * (C) Copyright IBM Corporation 2004
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
 * IBM AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

/**
 * \file read_rgba_span_x86.S
 * Optimized routines to transfer pixel data from the framebuffer to a
 * buffer in main memory.
 *
 * \author Ian Romanick <idr@us.ibm.com>
 */
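
/* All of the routines in this file use the cdecl calling convention and
 * take the same three arguments.  A sketch of the prototypes as a caller
 * sees them (the authoritative declarations live in Mesa's
 * read_rgba_span_x86.h header):
 *
 *    void _generic_read_RGBA_span_BGRA8888_REV_MMX(const unsigned char *src,
 *                                                  unsigned char *dst,
 *                                                  unsigned count);
 *
 * and likewise for the _SSE, _SSE2, and _RGB565_MMX variants.
 */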

	.file	"read_rgba_span_x86.S"
#if !defined(__DJGPP__) && !defined(__MINGW32__) /* this one cries for assyntax.h */
/* Kevin F. Quinn 2nd July 2006
 * Replace data segment constants with text-segment instructions
	.section	.rodata
	.align 16
	.type	mask, @object
	.size	mask, 32
mask:
	.long	0xff00ff00
	.long	0xff00ff00
	.long	0xff00ff00
	.long	0xff00ff00
	.long	0x00ff0000
	.long	0x00ff0000
	.long	0x00ff0000
	.long	0x00ff0000
 */
#define	LOAD_MASK(mvins,m1,m2) \
   	pushl	$0xff00ff00 ;\
   	pushl	$0xff00ff00 ;\
   	pushl	$0xff00ff00 ;\
   	pushl	$0xff00ff00 ;\
	mvins	(%esp), m1	;\
   	pushl	$0x00ff0000 ;\
   	pushl	$0x00ff0000 ;\
   	pushl	$0x00ff0000 ;\
   	pushl	$0x00ff0000 ;\
	mvins	(%esp), m2	;\
	addl	$32, %esp
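
/* After LOAD_MASK(movq,%mm1,%mm2), the registers hold the same constants
 * the commented-out .rodata block above provided:
 *
 *    m1 = 0xff00ff00ff00ff00  (the bytes that stay in place: A and G)
 *    m2 = 0x00ff000000ff0000  (the bytes that trade places: R and B)
 *
 * The movdqa form reads all 16 pushed bytes, so each 128-bit register gets
 * the same 64-bit pattern twice.
 */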


/* I implemented these as macros because they appear in quite a few places,
 * and I've tweaked them a number of times.  I got tired of changing every
 * place they appear. :)
 */

#define DO_ONE_PIXEL() \
	movl	(%ebx), %eax ; \
	addl	$4, %ebx ; \
	bswap	%eax          /* ARGB -> BGRA */ ; \
	rorl	$8, %eax      /* BGRA -> ABGR */ ; \
	movl	%eax, (%ecx)  /* ABGR -> R, G, B, A */ ; \
	addl	$4, %ecx

#define DO_ONE_LAST_PIXEL() \
	movl	(%ebx), %eax ; \
	bswap	%eax          /* ARGB -> BGRA */ ; \
	rorl	$8, %eax      /* BGRA -> ABGR */ ; \
	movl	%eax, (%ecx)  /* ABGR -> R, G, B, A */

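
/* What the two macros above compute, as a C sketch (illustration only):
 * a BGRA8888_REV pixel loaded into a register reads as 0xAARRGGBB;
 * byte-swapping and rotating leaves 0xAABBGGRR, which is stored to memory
 * as the bytes R, G, B, A.
 *
 *    #include <stdint.h>
 *    static uint32_t bgra_rev_to_rgba(uint32_t p)
 *    {
 *        p = __builtin_bswap32(p);      // bswap: ARGB -> BGRA
 *        return (p >> 8) | (p << 24);   // rorl $8: BGRA -> ABGR
 *    }
 */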

/**
 * MMX optimized version of the BGRA8888_REV to RGBA copy routine.
 *
 * \warning
 * This function assumes that the caller will issue the EMMS instruction
 * at the correct places.
 */

.globl _generic_read_RGBA_span_BGRA8888_REV_MMX
.hidden _generic_read_RGBA_span_BGRA8888_REV_MMX
	.type	_generic_read_RGBA_span_BGRA8888_REV_MMX, @function
_generic_read_RGBA_span_BGRA8888_REV_MMX:
	pushl	%ebx

#ifdef USE_INNER_EMMS
	emms
#endif
/* Kevin F. Quinn 2nd July 2006
 * Replace data segment constants with text-segment instructions
	movq	mask, %mm1
	movq	mask+16, %mm2
 */
	LOAD_MASK(movq,%mm1,%mm2)

	movl	8(%esp), %ebx	/* source pointer */
	movl	16(%esp), %edx	/* number of pixels to copy */
	movl	12(%esp), %ecx	/* destination pointer */

	testl	%edx, %edx
	jle	.L20		/* Bail if there's nothing to do. */

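	/* If the source pointer is 4-byte but not 8-byte aligned, convert
	 * one pixel the slow way so that the movq reads in the main loop
	 * below are 8-byte aligned.
	 */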
	movl	%ebx, %eax

	negl	%eax
	sarl	$2, %eax
	andl	$1, %eax
	je	.L17

	subl	%eax, %edx
	DO_ONE_PIXEL()
.L17:

	/* Would it be faster to unroll this loop once and process 4 pixels
	 * per pass, instead of just two?
	 */

	movl	%edx, %eax
	shrl	%eax
	jmp	.L18
.L19:
	movq	(%ebx), %mm0
	addl	$8, %ebx

	/* These 9 instructions do what PSHUFB (if there were such an
	 * instruction) could do in 1. :(
	 */
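	/* A C sketch of that swizzle on one 64-bit chunk (two pixels),
	 * using the masks loaded by LOAD_MASK (illustration only):
	 *
	 *    static uint64_t swizzle2(uint64_t q)   // two BGRA pixels
	 *    {
	 *        uint64_t keep = q & 0xff00ff00ff00ff00ull;       // A, G stay
	 *        uint64_t r = (q & 0x00ff000000ff0000ull) >> 16;  // R: byte 2 -> 0
	 *        uint64_t b = (q << 16) & 0x00ff000000ff0000ull;  // B: byte 0 -> 2
	 *        return keep | r | b;
	 *    }
	 */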

	movq	%mm0, %mm3
	movq	%mm0, %mm4

	pand	%mm2, %mm3
	psllq	$16, %mm4
	psrlq	$16, %mm3
	pand	%mm2, %mm4

	pand	%mm1, %mm0
	por	%mm4, %mm3
	por	%mm3, %mm0

	movq	%mm0, (%ecx)
	addl	$8, %ecx
	subl	$1, %eax
.L18:
	jne	.L19

#ifdef USE_INNER_EMMS
	emms
#endif

	/* At this point there are either 1 or 0 pixels remaining to be
	 * converted.  Convert the last pixel, if needed.
	 */

	testl	$1, %edx
	je	.L20

	DO_ONE_LAST_PIXEL()

.L20:
	popl	%ebx
	ret
	.size	_generic_read_RGBA_span_BGRA8888_REV_MMX, .-_generic_read_RGBA_span_BGRA8888_REV_MMX


/**
 * SSE optimized version of the BGRA8888_REV to RGBA copy routine.  SSE
 * instructions are only actually used to read data from the framebuffer.
 * In practice, the speed-up is pretty small.
 *
 * \todo
 * Do some more testing and determine if there's any reason to have this
 * function in addition to the MMX version.
 *
 * \warning
 * This function assumes that the caller will issue the EMMS instruction
 * at the correct places.
 */

.globl _generic_read_RGBA_span_BGRA8888_REV_SSE
.hidden _generic_read_RGBA_span_BGRA8888_REV_SSE
	.type	_generic_read_RGBA_span_BGRA8888_REV_SSE, @function
_generic_read_RGBA_span_BGRA8888_REV_SSE:
	pushl	%esi
	pushl	%ebx
	pushl	%ebp

#ifdef USE_INNER_EMMS
	emms
#endif
/* Kevin F. Quinn 2nd July 2006
 * Replace data segment constants with text-segment instructions
	movq	mask, %mm1
	movq	mask+16, %mm2
 */
	LOAD_MASK(movq,%mm1,%mm2)

	movl	16(%esp), %ebx	/* source pointer */
	movl	24(%esp), %edx	/* number of pixels to copy */
	movl	20(%esp), %ecx	/* destination pointer */

	testl	%edx, %edx
	jle	.L35		/* Bail if there's nothing to do. */

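	/* Carve out a 16-byte aligned scratch area on the stack for the
	 * SSE-to-MMX transfer below; %ebp preserves the original stack
	 * pointer so it can be restored after the loop.
	 */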
	movl	%esp, %ebp
	subl	$16, %esp
	andl	$0xfffffff0, %esp

	movl	%ebx, %eax
	movl	%edx, %esi

	negl	%eax
	andl	$15, %eax
	sarl	$2, %eax
	cmpl	%edx, %eax
	cmovle	%eax, %esi

	subl	%esi, %edx

	testl	$1, %esi
	je	.L32

	DO_ONE_PIXEL()
.L32:

	testl	$2, %esi
	je	.L31

	movq	(%ebx), %mm0
	addl	$8, %ebx

	movq	%mm0, %mm3
	movq	%mm0, %mm4

	pand	%mm2, %mm3
	psllq	$16, %mm4
	psrlq	$16, %mm3
	pand	%mm2, %mm4

	pand	%mm1, %mm0
	por	%mm4, %mm3
	por	%mm3, %mm0

	movq	%mm0, (%ecx)
	addl	$8, %ecx
.L31:

	movl	%edx, %eax
	shrl	$2, %eax
	jmp	.L33
.L34:
	movaps	(%ebx), %xmm0
	addl	$16, %ebx

	/* This would be so much better if we could just move directly from
	 * an SSE register to an MMX register.  Unfortunately, that
	 * functionality wasn't introduced until SSE2 with the MOVDQ2Q
	 * instruction.
	 */

	movaps	%xmm0, (%esp)
	movq	(%esp), %mm0
	movq	8(%esp), %mm5

	movq	%mm0, %mm3
	movq	%mm0, %mm4
	movq	%mm5, %mm6
	movq	%mm5, %mm7

	pand	%mm2, %mm3
	pand	%mm2, %mm6

	psllq	$16, %mm4
	psllq	$16, %mm7

	psrlq	$16, %mm3
	psrlq	$16, %mm6

	pand	%mm2, %mm4
	pand	%mm2, %mm7

	pand	%mm1, %mm0
	pand	%mm1, %mm5

	por	%mm4, %mm3
	por	%mm7, %mm6

	por	%mm3, %mm0
	por	%mm6, %mm5

	movq	%mm0, (%ecx)
	movq	%mm5, 8(%ecx)
	addl	$16, %ecx

	subl	$1, %eax
.L33:
	jne	.L34

#ifdef USE_INNER_EMMS
	emms
#endif
	movl	%ebp, %esp

	/* At this point there are either [0, 3] pixels remaining to be
	 * converted.
	 */

	testl	$2, %edx
	je	.L36

	movq	(%ebx), %mm0
	addl	$8, %ebx

	movq	%mm0, %mm3
	movq	%mm0, %mm4

	pand	%mm2, %mm3
	psllq	$16, %mm4
	psrlq	$16, %mm3
	pand	%mm2, %mm4

	pand	%mm1, %mm0
	por	%mm4, %mm3
	por	%mm3, %mm0

	movq	%mm0, (%ecx)
	addl	$8, %ecx
.L36:

	testl	$1, %edx
	je	.L35

	DO_ONE_LAST_PIXEL()
.L35:
	popl	%ebp
	popl	%ebx
	popl	%esi
	ret
	.size	_generic_read_RGBA_span_BGRA8888_REV_SSE, .-_generic_read_RGBA_span_BGRA8888_REV_SSE


/**
 * SSE2 optimized version of the BGRA8888_REV to RGBA copy routine.
 */

	.text
.globl _generic_read_RGBA_span_BGRA8888_REV_SSE2
.hidden _generic_read_RGBA_span_BGRA8888_REV_SSE2
	.type	_generic_read_RGBA_span_BGRA8888_REV_SSE2, @function
_generic_read_RGBA_span_BGRA8888_REV_SSE2:
	pushl	%esi
	pushl	%ebx

/* Kevin F. Quinn 2nd July 2006
 * Replace data segment constants with text-segment instructions
	movdqa	mask, %xmm1
	movdqa	mask+16, %xmm2
 */
	LOAD_MASK(movdqa,%xmm1,%xmm2)

	movl	12(%esp), %ebx	/* source pointer */
	movl	20(%esp), %edx	/* number of pixels to copy */
	movl	16(%esp), %ecx	/* destination pointer */

	movl	%ebx, %eax
	movl	%edx, %esi

	testl	%edx, %edx
	jle	.L46		/* Bail if there's nothing to do. */

	/* If the source pointer isn't a multiple of 16, we have to process
	 * a few pixels the "slow" way to get the address aligned for
	 * the SSE fetch instructions.
	 */
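	/* A C sketch of the fix-up count computed below (illustration
	 * only): the number of leading pixels needed to reach a 16-byte
	 * boundary, clamped to the total pixel count.
	 *
	 *    unsigned lead = ((0u - (uintptr_t)src) & 15) >> 2;
	 *    if (lead > count) lead = count;   // the cmovbe
	 *    count -= lead;                    // main loop handles the rest
	 */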

	negl	%eax
	andl	$15, %eax
	sarl	$2, %eax

	cmpl	%edx, %eax
	cmovbe	%eax, %esi
	subl	%esi, %edx

	testl	$1, %esi
	je	.L41

	DO_ONE_PIXEL()
.L41:
	testl	$2, %esi
	je	.L40

	movq	(%ebx), %xmm0
	addl	$8, %ebx

	movdqa	%xmm0, %xmm3
	movdqa	%xmm0, %xmm4
	andps	%xmm1, %xmm0

	andps	%xmm2, %xmm3
	pslldq	$2, %xmm4
	psrldq	$2, %xmm3
	andps	%xmm2, %xmm4

	orps	%xmm4, %xmm3
	orps	%xmm3, %xmm0

	movq	%xmm0, (%ecx)
	addl	$8, %ecx
.L40:

	/* Would it be worth having a specialized version of this loop for
	 * the case where the destination is 16-byte aligned?  That version
	 * would be identical except that it could use movdqa instead of
	 * movdqu.
	 */

	movl	%edx, %eax
	shrl	$2, %eax
	jmp	.L42
.L43:
	movdqa	(%ebx), %xmm0
	addl	$16, %ebx

	movdqa	%xmm0, %xmm3
	movdqa	%xmm0, %xmm4
	andps	%xmm1, %xmm0

	andps	%xmm2, %xmm3
	pslldq	$2, %xmm4
	psrldq	$2, %xmm3
	andps	%xmm2, %xmm4

	orps	%xmm4, %xmm3
	orps	%xmm3, %xmm0

	movdqu	%xmm0, (%ecx)
	addl	$16, %ecx
	subl	$1, %eax
.L42:
	jne	.L43


	/* There may be up to 3 pixels remaining to be copied.  Take care
	 * of them now.  We do the 2 pixel case first because the data
	 * will be aligned.
	 */

	testl	$2, %edx
	je	.L47

	movq	(%ebx), %xmm0

	movdqa	%xmm0, %xmm3
	movdqa	%xmm0, %xmm4
	andps	%xmm1, %xmm0

	andps	%xmm2, %xmm3
	pslldq	$2, %xmm4
	psrldq	$2, %xmm3
	andps	%xmm2, %xmm4

	orps	%xmm4, %xmm3
	orps	%xmm3, %xmm0

	movq	%xmm0, (%ecx)
.L47:

	testl	$1, %edx
	je	.L46

	DO_ONE_LAST_PIXEL()
.L46:

	popl	%ebx
	popl	%esi
	ret
	.size	_generic_read_RGBA_span_BGRA8888_REV_SSE2, .-_generic_read_RGBA_span_BGRA8888_REV_SSE2



/* Kevin F. Quinn 2nd July 2006
 * Replace data segment constants with text-segment instructions
 */
#if 0
	.section	.rodata

	.align	16
mask_565:
	.word	0xf800
	.word	0x07e0
	.word	0x001f
	.word	0x0000

/* Setting SCALE_ADJUST to 5 gives a perfect match with the classic C
 * implementation in Mesa.  Setting SCALE_ADJUST to 0 is slightly faster but
 * at a small cost to accuracy.
 */

#define SCALE_ADJUST	5
#if SCALE_ADJUST == 5
prescale:
	.word	0x0001
	.word	0x0010
	.word	0x0200
	.word	0x0000

scale:
	.word	0x20e8		/* (0x00ff0000 / 0x000007c0) + 1 */
	.word	0x40c5		/* (0x00ff0000 / 0x000003f0) + 1 */
	.word	0x839d		/* (0x00ff0000 / 0x000001f0) + 1 */
	.word	0x0000
#elif SCALE_ADJUST == 0
prescale:
	.word	0x0001
	.word	0x0020
	.word	0x0800
	.word	0x0000

scale:
	.word	0x0108		/* (0x00ff0000 / 0x0000f800) + 1 */
	.word	0x0104		/* (0x00ff0000 / 0x0000fc00) + 1 */
	.word	0x0108		/* (0x00ff0000 / 0x0000f800) + 1 */
	.word	0x0000
#else
#error SCALE_ADJUST must either be 5 or 0.
#endif


alpha:	.long	0x00000000
	.long	0x00ff0000
#endif

#define MASK_565_L	0x07e0f800
#define MASK_565_H	0x0000001f
#define SCALE_ADJUST	5
#if SCALE_ADJUST == 5
#define PRESCALE_L 0x00100001
#define PRESCALE_H 0x00000200
#define SCALE_L 0x40C620E8
#define SCALE_H 0x0000839d
#elif SCALE_ADJUST == 0
#define PRESCALE_L 0x00200001
#define PRESCALE_H 0x00000800
#define SCALE_L 0x01040108
#define SCALE_H 0x00000108
#else
#error SCALE_ADJUST must either be 5 or 0.
#endif
#define ALPHA_L 0x00000000
#define ALPHA_H 0x00ff0000
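
/* A C sketch of the per-channel math the routine below performs with
 * pmullw/psrlw/pmulhuw (illustration only; the real code handles all three
 * components of a pixel at once, one word each per MMX register).  With
 * SCALE_ADJUST == 5, a full-intensity green channel works out as
 * (0x07e0 * 0x0010) >> 5 = 0x03f0, then (0x03f0 * 0x40c6) >> 16 = 0xff.
 *
 *    #include <stdint.h>
 *    static uint8_t expand_green_565(uint16_t p)
 *    {
 *        uint32_t g = ((p & 0x07e0u) * 0x0010u) >> 5;   // prescale
 *        return (uint8_t)((g * 0x40c6u) >> 16);         // pmulhuw scale
 *    }
 */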

/**
 * MMX optimized version of the RGB565 to RGBA copy routine.
 */

	.text
	.globl	_generic_read_RGBA_span_RGB565_MMX
	.hidden	_generic_read_RGBA_span_RGB565_MMX
	.type	_generic_read_RGBA_span_RGB565_MMX, @function

_generic_read_RGBA_span_RGB565_MMX:

#ifdef USE_INNER_EMMS
	emms
#endif

	movl	4(%esp), %eax	/* source pointer */
	movl	8(%esp), %edx	/* destination pointer */
	movl	12(%esp), %ecx	/* number of pixels to copy */

/* Kevin F. Quinn 2nd July 2006
 * Replace data segment constants with text-segment instructions
	movq	mask_565, %mm5
	movq	prescale, %mm6
	movq	scale, %mm7
 */
	pushl	$MASK_565_H
	pushl	$MASK_565_L
	movq	(%esp), %mm5
	pushl	$PRESCALE_H
	pushl	$PRESCALE_L
	movq	(%esp), %mm6
	pushl	$SCALE_H
	pushl	$SCALE_L
	movq	(%esp), %mm7
	pushl	$ALPHA_H
	pushl	$ALPHA_L
	movq	(%esp), %mm3
	addl	$32,%esp

	sarl	$2, %ecx
	js	.L01		/* Bail early if the count is negative; counts
				 * of 1-3 fall through to the fix-up code. */
	jmp	.L02

.L03:
	/* Fetch 4 RGB565 pixels into %mm4.  Distribute the first and
	 * second pixels into the four words of %mm0 and %mm2.
	 */

	movq	(%eax), %mm4
	addl	$8, %eax

	pshufw	$0x00, %mm4, %mm0
	pshufw	$0x55, %mm4, %mm2
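
	/* The pshufw immediate selects source words: 0x00 replicates word 0
	 * (pixel 0) into all four words of %mm0, and 0x55 replicates word 1
	 * (pixel 1) into %mm2.  The 0xaa and 0xff shuffles below do the same
	 * for pixels 2 and 3.
	 */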


	/* Mask the pixels so that each word of each register contains only
	 * one color component.
	 */

	pand	%mm5, %mm0
	pand	%mm5, %mm2


	/* Adjust the component values so that they are as small as possible,
	 * but large enough so that we can multiply them by an unsigned 16-bit
	 * number and get a value as large as 0x00ff0000.
	 */

	pmullw	%mm6, %mm0
	pmullw	%mm6, %mm2
#if SCALE_ADJUST > 0
	psrlw	$SCALE_ADJUST, %mm0
	psrlw	$SCALE_ADJUST, %mm2
#endif

	/* Scale the input component values to be in the range
	 * [0, 0x00ff0000].  This is the real magic of the whole routine.
	 */

	pmulhuw	%mm7, %mm0
	pmulhuw	%mm7, %mm2


	/* Always set the alpha value to 0xff.
	 */

/* Kevin F. Quinn 2nd July 2006
 * Replace data segment constants with text-segment instructions
	por	alpha, %mm0
	por	alpha, %mm2
 */
	por	%mm3, %mm0
	por	%mm3, %mm2


	/* Pack the 16-bit values to 8-bit values and store the converted
	 * pixel data.
	 */

	packuswb	%mm2, %mm0
	movq	%mm0, (%edx)
	addl	$8, %edx



	pshufw	$0xaa, %mm4, %mm0
	pshufw	$0xff, %mm4, %mm2

	pand	%mm5, %mm0
	pand	%mm5, %mm2
	pmullw	%mm6, %mm0
	pmullw	%mm6, %mm2
#if SCALE_ADJUST > 0
	psrlw	$SCALE_ADJUST, %mm0
	psrlw	$SCALE_ADJUST, %mm2
#endif
	pmulhuw	%mm7, %mm0
	pmulhuw	%mm7, %mm2

/* Kevin F. Quinn 2nd July 2006
 * Replace data segment constants with text-segment instructions
	por	alpha, %mm0
	por	alpha, %mm2
 */
	por	%mm3, %mm0
	por	%mm3, %mm2

	packuswb	%mm2, %mm0

	movq	%mm0, (%edx)
	addl	$8, %edx

	subl	$1, %ecx
.L02:
	jne	.L03


	/* At this point there can be at most 3 pixels left to process.  If
	 * there is either 2 or 3 left, process 2.
	 */

	movl	12(%esp), %ecx
	testl	$0x02, %ecx
	je	.L04

	movd	(%eax), %mm4
	addl	$4, %eax

	pshufw	$0x00, %mm4, %mm0
	pshufw	$0x55, %mm4, %mm2

	pand	%mm5, %mm0
	pand	%mm5, %mm2
	pmullw	%mm6, %mm0
	pmullw	%mm6, %mm2
#if SCALE_ADJUST > 0
	psrlw	$SCALE_ADJUST, %mm0
	psrlw	$SCALE_ADJUST, %mm2
#endif
	pmulhuw	%mm7, %mm0
	pmulhuw	%mm7, %mm2

/* Kevin F. Quinn 2nd July 2006
 * Replace data segment constants with text-segment instructions
	por	alpha, %mm0
	por	alpha, %mm2
 */
	por	%mm3, %mm0
	por	%mm3, %mm2

	packuswb	%mm2, %mm0

	movq	%mm0, (%edx)
	addl	$8, %edx

.L04:
	/* At this point there can be at most 1 pixel left to process.
	 * Process it if needed.
	 */

	testl	$0x01, %ecx
	je	.L01

	movzwl	(%eax), %ecx
	movd	%ecx, %mm4

	pshufw	$0x00, %mm4, %mm0

	pand	%mm5, %mm0
	pmullw	%mm6, %mm0
#if SCALE_ADJUST > 0
	psrlw	$SCALE_ADJUST, %mm0
#endif
	pmulhuw	%mm7, %mm0

/* Kevin F. Quinn 2nd July 2006
 * Replace data segment constants with text-segment instructions
	por	alpha, %mm0
 */
	por	%mm3, %mm0

	packuswb	%mm0, %mm0

	movd	%mm0, (%edx)

.L01:
#ifdef USE_INNER_EMMS
	emms
#endif
	ret
#endif /* !defined(__DJGPP__) && !defined(__MINGW32__) */

#if defined (__ELF__) && defined (__linux__)
	.section .note.GNU-stack,"",%progbits
#endif