read_rgba_span_x86.S revision ea3885812704645944752887d892c38a46710956
/*
 * (C) Copyright IBM Corporation 2004
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
 * IBM AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

/**
 * \file read_rgba_span_x86.S
 * Optimized routines to transfer pixel data from the framebuffer to a
 * buffer in main memory.
 *
 * \author Ian Romanick <idr@us.ibm.com>
 */

	.file	"read_rgba_span_x86.S"
#if !defined(__DJGPP__) && !defined(__MINGW32__) /* this one cries for assyntax.h */
	.section	.rodata
	.align 16
	.type	mask, @object
	.size	mask, 32
mask:
	.long	0xff00ff00
	.long	0xff00ff00
	.long	0xff00ff00
	.long	0xff00ff00
	.long	0x00ff0000
	.long	0x00ff0000
	.long	0x00ff0000
	.long	0x00ff0000
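
/* The first 16 bytes (0xff00ff00 repeated) keep the alpha and green bytes
 * of each 32-bit pixel in place.  The second 16 bytes (0x00ff0000 repeated)
 * isolate the red byte in an unshifted copy of the pixel and the blue byte
 * in a copy shifted left by 16 bits, so that red and blue can be swapped.
 */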


/* I implemented these as macros because they appear in quite a few places,
 * and I've tweaked them a number of times.  I got tired of changing every
 * place they appear. :)
 */

#define DO_ONE_PIXEL() \
	movl	(%ebx), %eax ; \
	addl	$4, %ebx ; \
	bswap	%eax          /* ARGB -> BGRA */ ; \
	rorl	$8, %eax      /* BGRA -> ABGR */ ; \
	movl	%eax, (%ecx)  /* ABGR -> R, G, B, A */ ; \
	addl	$4, %ecx

#define DO_ONE_LAST_PIXEL() \
	movl	(%ebx), %eax ; \
	bswap	%eax          /* ARGB -> BGRA */ ; \
	rorl	$8, %eax      /* BGRA -> ABGR */ ; \
	movl	%eax, (%ecx)  /* ABGR -> R, G, B, A */

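/* Worked example of the swizzle above: a BGRA8888_REV pixel sits in memory
 * as the bytes B, G, R, A, so a little-endian 32-bit load yields 0xAARRGGBB.
 * bswap gives 0xBBGGRRAA, rorl $8 gives 0xAABBGGRR, and the store writes the
 * bytes R, G, B, A -- the RGBA layout the caller wants.  Only red and blue
 * actually move; green and alpha stay where they were.
 */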

/**
 * MMX optimized version of the BGRA8888_REV to RGBA copy routine.
 *
 * \warning
 * This function assumes that the caller will issue the EMMS instruction
 * at the correct places.
 */

.globl _generic_read_RGBA_span_BGRA8888_REV_MMX
	.type	_generic_read_RGBA_span_BGRA8888_REV_MMX, @function
_generic_read_RGBA_span_BGRA8888_REV_MMX:
	pushl	%ebx

#ifdef USE_INNER_EMMS
	emms
#endif
	movq	mask, %mm1
	movq	mask+16, %mm2

	movl	8(%esp), %ebx	/* source pointer */
	movl	16(%esp), %edx	/* number of pixels to copy */
	movl	12(%esp), %ecx	/* destination pointer */

	testl	%edx, %edx
	je	.L20		/* Bail if there's nothing to do. */

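	/* If the source pointer is only 4-byte aligned, convert one pixel
	 * with the scalar macro so that the movq reads in the main loop are
	 * 8-byte aligned.
	 */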
	movl	%ebx, %eax

	negl	%eax
	sarl	$2, %eax
	andl	$1, %eax
	je	.L17

	subl	%eax, %edx
	DO_ONE_PIXEL()
.L17:

	/* Would it be faster to unroll this loop once and process 4 pixels
	 * per pass, instead of just two?
	 */

	movl	%edx, %eax
	shrl	%eax
	jmp	.L18
.L19:
	movq	(%ebx), %mm0
	addl	$8, %ebx

	/* These 9 instructions do what PSHUFB (if there were such an
	 * instruction) could do in 1. :(
	 */

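	/* Per 32-bit pixel: the copy masked with %mm2 and shifted right by 16
	 * bits moves red into the blue position, the copy shifted left by 16
	 * bits and masked with %mm2 moves blue into the red position, and
	 * %mm1 keeps green and alpha in place.  The ORs merge the pieces, so
	 * each dword goes from 0xAARRGGBB to 0xAABBGGRR.
	 */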
	movq	%mm0, %mm3
	movq	%mm0, %mm4

	pand	%mm2, %mm3
	psllq	$16, %mm4
	psrlq	$16, %mm3
	pand	%mm2, %mm4

	pand	%mm1, %mm0
	por	%mm4, %mm3
	por	%mm3, %mm0

	movq	%mm0, (%ecx)
	addl	$8, %ecx
	subl	$1, %eax
.L18:
	jne	.L19

#ifdef USE_INNER_EMMS
	emms
#endif

	/* At this point there are either 1 or 0 pixels remaining to be
	 * converted.  Convert the last pixel, if needed.
	 */

	testl	$1, %edx
	je	.L20

	DO_ONE_LAST_PIXEL()

.L20:
	popl	%ebx
	ret
	.size	_generic_read_RGBA_span_BGRA8888_REV_MMX, .-_generic_read_RGBA_span_BGRA8888_REV_MMX


/**
 * SSE optimized version of the BGRA8888_REV to RGBA copy routine.  SSE
 * instructions are only actually used to read data from the framebuffer.
 * In practice, the speed-up is pretty small.
 *
 * \todo
 * Do some more testing and determine if there's any reason to have this
 * function in addition to the MMX version.
 *
 * \warning
 * This function assumes that the caller will issue the EMMS instruction
 * at the correct places.
 */

.globl _generic_read_RGBA_span_BGRA8888_REV_SSE
	.type	_generic_read_RGBA_span_BGRA8888_REV_SSE, @function
_generic_read_RGBA_span_BGRA8888_REV_SSE:
	pushl	%esi
	pushl	%ebx
	pushl	%ebp

#ifdef USE_INNER_EMMS
	emms
#endif
	movq	mask, %mm1
	movq	mask+16, %mm2

	movl	16(%esp), %ebx	/* source pointer */
	movl	24(%esp), %edx	/* number of pixels to copy */
	movl	20(%esp), %ecx	/* destination pointer */

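	/* Save the stack pointer in %ebp and carve out a 16-byte-aligned
	 * scratch area on the stack.  The main loop below uses it to move
	 * data from an SSE register to a pair of MMX registers through
	 * memory.
	 */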
	movl	%esp, %ebp
	subl	$16, %esp
	andl	$0xfffffff0, %esp

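	/* If the source pointer isn't a multiple of 16, process up to three
	 * pixels the "slow" way so that the movaps reads in the main loop are
	 * 16-byte aligned.  %esi receives the number of leading pixels to
	 * handle separately.
	 */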
	movl	%ebx, %eax
	movl	%edx, %esi

	negl	%eax
	andl	$15, %eax
	sarl	$2, %eax
	cmpl	%edx, %eax
	cmovle	%eax, %esi

	subl	%esi, %edx

	testl	$1, %esi
	je	.L32

	DO_ONE_PIXEL()
.L32:

	testl	$2, %esi
	je	.L31

	movq	(%ebx), %mm0
	addl	$8, %ebx

	movq	%mm0, %mm3
	movq	%mm0, %mm4

	pand	%mm2, %mm3
	psllq	$16, %mm4
	psrlq	$16, %mm3
	pand	%mm2, %mm4

	pand	%mm1, %mm0
	por	%mm4, %mm3
	por	%mm3, %mm0

	movq	%mm0, (%ecx)
	addl	$8, %ecx
.L31:

	movl	%edx, %eax
	shrl	$2, %eax
	jmp	.L33
.L34:
	movaps	(%ebx), %xmm0
	addl	$16, %ebx

	/* This would be so much better if we could just move directly from
	 * an SSE register to an MMX register.  Unfortunately, that
	 * functionality wasn't introduced until SSE2 with the MOVDQ2Q
	 * instruction.
	 */
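	/* Instead, bounce the 16 bytes through the aligned scratch area that
	 * was set up on the stack above.
	 */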

	movaps	%xmm0, (%esp)
	movq	(%esp), %mm0
	movq	8(%esp), %mm5

	movq	%mm0, %mm3
	movq	%mm0, %mm4
	movq	%mm5, %mm6
	movq	%mm5, %mm7

	pand	%mm2, %mm3
	pand	%mm2, %mm6

	psllq	$16, %mm4
	psllq	$16, %mm7

	psrlq	$16, %mm3
	psrlq	$16, %mm6

	pand	%mm2, %mm4
	pand	%mm2, %mm7

	pand	%mm1, %mm0
	pand	%mm1, %mm5

	por	%mm4, %mm3
	por	%mm7, %mm6

	por	%mm3, %mm0
	por	%mm6, %mm5

	movq	%mm0, (%ecx)
	movq	%mm5, 8(%ecx)
	addl	$16, %ecx

	subl	$1, %eax
.L33:
	jne	.L34

#ifdef USE_INNER_EMMS
	emms
#endif
	movl	%ebp, %esp

	/* At this point there are either [0, 3] pixels remaining to be
	 * converted.
	 */

	testl	$2, %edx
	je	.L36

	movq	(%ebx), %mm0
	addl	$8, %ebx

	movq	%mm0, %mm3
	movq	%mm0, %mm4

	pand	%mm2, %mm3
	psllq	$16, %mm4
	psrlq	$16, %mm3
	pand	%mm2, %mm4

	pand	%mm1, %mm0
	por	%mm4, %mm3
	por	%mm3, %mm0

	movq	%mm0, (%ecx)
	addl	$8, %ecx
.L36:

	testl	$1, %edx
	je	.L35

	DO_ONE_LAST_PIXEL()
.L35:
	popl	%ebp
	popl	%ebx
	popl	%esi
	ret
	.size	_generic_read_RGBA_span_BGRA8888_REV_SSE, .-_generic_read_RGBA_span_BGRA8888_REV_SSE


/**
 * SSE2 optimized version of the BGRA8888_REV to RGBA copy routine.
 */

	.text
.globl _generic_read_RGBA_span_BGRA8888_REV_SSE2
	.type	_generic_read_RGBA_span_BGRA8888_REV_SSE2, @function
_generic_read_RGBA_span_BGRA8888_REV_SSE2:
	pushl	%esi
	pushl	%ebx

	movdqa	mask, %xmm1
	movdqa	mask+16, %xmm2

	movl	12(%esp), %ebx	/* source pointer */
	movl	20(%esp), %edx	/* number of pixels to copy */
	movl	16(%esp), %ecx	/* destination pointer */

	movl	%ebx, %eax
	movl	%edx, %esi

	/* If the source pointer isn't a multiple of 16, we have to process
	 * a few pixels the "slow" way to get the address aligned for
	 * the SSE fetch instructions.
	 */

	negl	%eax
	andl	$15, %eax
	sarl	$2, %eax

	cmpl	%edx, %eax
	cmovbe	%eax, %esi
	subl	%esi, %edx

	testl	$1, %esi
	je	.L41

	DO_ONE_PIXEL()
.L41:
	testl	$2, %esi
	je	.L40

	movq	(%ebx), %xmm0
	addl	$8, %ebx

	movdqa	%xmm0, %xmm3
	movdqa	%xmm0, %xmm4
	andps	%xmm1, %xmm0

	andps	%xmm2, %xmm3
	pslldq	$2, %xmm4
	psrldq	$2, %xmm3
	andps	%xmm2, %xmm4

	orps	%xmm4, %xmm3
	orps	%xmm3, %xmm0

	movq	%xmm0, (%ecx)
	addl	$8, %ecx
.L40:

	/* Would it be worth having a specialized version of this loop for
	 * the case where the destination is 16-byte aligned?  That version
	 * would be identical except that it could use movdqa instead of
	 * movdqu.
	 */

	movl	%edx, %eax
	shrl	$2, %eax
	jmp	.L42
.L43:
	movdqa	(%ebx), %xmm0
	addl	$16, %ebx

	movdqa	%xmm0, %xmm3
	movdqa	%xmm0, %xmm4
	andps	%xmm1, %xmm0

	andps	%xmm2, %xmm3
	pslldq	$2, %xmm4
	psrldq	$2, %xmm3
	andps	%xmm2, %xmm4

	orps	%xmm4, %xmm3
	orps	%xmm3, %xmm0

	movdqu	%xmm0, (%ecx)
	addl	$16, %ecx
	subl	$1, %eax
.L42:
	jne	.L43


	/* There may be up to 3 pixels remaining to be copied.  Take care
	 * of them now.  We do the 2 pixel case first because the data
	 * will be aligned.
	 */

	testl	$2, %edx
	je	.L47

	movq	(%ebx), %xmm0

	movdqa	%xmm0, %xmm3
	movdqa	%xmm0, %xmm4
	andps	%xmm1, %xmm0

	andps	%xmm2, %xmm3
	pslldq	$2, %xmm4
	psrldq	$2, %xmm3
	andps	%xmm2, %xmm4

	orps	%xmm4, %xmm3
	orps	%xmm3, %xmm0

	movq	%xmm0, (%ecx)
.L47:

	testl	$1, %edx
	je	.L46

	DO_ONE_LAST_PIXEL()
.L46:

	popl	%ebx
	popl	%esi
	ret
	.size	_generic_read_RGBA_span_BGRA8888_REV_SSE2, .-_generic_read_RGBA_span_BGRA8888_REV_SSE2



	.section	.rodata

	.align	16
mask_565:
	.word	0xf800
	.word	0x07e0
	.word	0x001f
	.word	0x0000
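
/* Each word selects one component of an RGB565 pixel once that pixel has
 * been broadcast into all four words of an MMX register: word 0 keeps the
 * 5 red bits, word 1 the 6 green bits, word 2 the 5 blue bits, and word 3
 * (the future alpha) is cleared.
 */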

/* Setting SCALE_ADJUST to 5 gives a perfect match with the classic C
 * implementation in Mesa.  Setting SCALE_ADJUST to 0 is slightly faster but
 * at a small cost to accuracy.
 */

#define SCALE_ADJUST	5
#if SCALE_ADJUST == 5
prescale:
	.word	0x0001
	.word	0x0010
	.word	0x0200
	.word	0x0000

scale:
	.word	0x20e8		/* (0x00ff0000 / 0x000007c0) + 1 */
	.word	0x40c5		/* (0x00ff0000 / 0x000003f0) + 1 */
	.word	0x839d		/* (0x00ff0000 / 0x000001f0) + 1 */
	.word	0x0000
#elif SCALE_ADJUST == 0
prescale:
	.word	0x0001
	.word	0x0020
	.word	0x0800
	.word	0x0000

scale:
	.word	0x0108		/* (0x00ff0000 / 0x0000f800) + 1 */
	.word	0x0104		/* (0x00ff0000 / 0x0000fc00) + 1 */
	.word	0x0108		/* (0x00ff0000 / 0x0000f800) + 1 */
	.word	0x0000
#else
#error SCALE_ADJUST must be either 5 or 0.
#endif
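
/* Worked example for the red channel with SCALE_ADJUST == 5: the largest
 * masked value is 0xf800.  pmullw by the prescale of 1 and psrlw by 5 leave
 * 0x07c0, which is why the scale factor is (0x00ff0000 / 0x000007c0) + 1 =
 * 0x20e8.  pmulhuw then gives (0x07c0 * 0x20e8) >> 16 = 0xff, so the maximum
 * 5-bit red maps to a full 8-bit 255.
 */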


alpha:	.long	0x00000000
	.long	0x00ff0000
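
/* As a 64-bit MMX value this is 0x00ff000000000000: only word 3 is non-zero,
 * so ORing it into a converted pixel sets the future alpha word to 0x00ff,
 * which packuswb turns into an alpha byte of 0xff.
 */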

/**
 * MMX optimized version of the RGB565 to RGBA copy routine.
 */

	.text
	.globl	_generic_read_RGBA_span_RGB565_MMX
	.type	_generic_read_RGBA_span_RGB565_MMX, @function

_generic_read_RGBA_span_RGB565_MMX:

#ifdef USE_INNER_EMMS
	emms
#endif

	movl	4(%esp), %eax	/* source pointer */
	movl	8(%esp), %edx	/* destination pointer */
	movl	12(%esp), %ecx	/* number of pixels to copy */

	movq	mask_565, %mm5
	movq	prescale, %mm6
	movq	scale, %mm7

	shrl	$2, %ecx
	jmp	.L02

.L03:
	/* Fetch 4 RGB565 pixels into %mm4.  Distribute the first and
	 * second pixels into the four words of %mm0 and %mm2.
	 */

	movq	(%eax), %mm4
	addl	$8, %eax

	pshufw	$0x00, %mm4, %mm0
	pshufw	$0x55, %mm4, %mm2
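	/* The selector 0x00 broadcasts word 0 (the first pixel) into every
	 * word of %mm0; 0x55 broadcasts word 1 (the second pixel) into every
	 * word of %mm2.
	 */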


	/* Mask the pixels so that each word of each register contains only
	 * one color component.
	 */

	pand	%mm5, %mm0
	pand	%mm5, %mm2


	/* Adjust the component values so that they are as small as possible,
	 * but large enough so that we can multiply them by an unsigned 16-bit
	 * number and get a value as large as 0x00ff0000.
	 */

	pmullw	%mm6, %mm0
	pmullw	%mm6, %mm2
#if SCALE_ADJUST > 0
	psrlw	$SCALE_ADJUST, %mm0
	psrlw	$SCALE_ADJUST, %mm2
#endif

	/* Scale the input component values to be on the range
	 * [0, 0x00ff0000].  This is the real magic of the whole routine.
	 */

	pmulhuw	%mm7, %mm0
	pmulhuw	%mm7, %mm2
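	/* pmulhuw keeps only the high 16 bits of each 32-bit product, so each
	 * word now holds an 8-bit component value in [0, 0xff].
	 */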


	/* Always set the alpha value to 0xff.
	 */

	por	alpha, %mm0
	por	alpha, %mm2


	/* Pack the 16-bit values to 8-bit values and store the converted
	 * pixel data.
	 */

	packuswb	%mm2, %mm0
	movq	%mm0, (%edx)
	addl	$8, %edx



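	/* Repeat the same conversion for the third and fourth pixels (words 2
	 * and 3 of %mm4).
	 */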
	pshufw	$0xaa, %mm4, %mm0
	pshufw	$0xff, %mm4, %mm2

	pand	%mm5, %mm0
	pand	%mm5, %mm2
	pmullw	%mm6, %mm0
	pmullw	%mm6, %mm2
#if SCALE_ADJUST > 0
	psrlw	$SCALE_ADJUST, %mm0
	psrlw	$SCALE_ADJUST, %mm2
#endif
	pmulhuw	%mm7, %mm0
	pmulhuw	%mm7, %mm2

	por	alpha, %mm0
	por	alpha, %mm2

	packuswb	%mm2, %mm0

	movq	%mm0, (%edx)
	addl	$8, %edx

	subl	$1, %ecx
.L02:
	jne	.L03


	/* At this point there can be at most 3 pixels left to process.  If
	 * there are either 2 or 3 left, process 2.
	 */

	movl	12(%esp), %ecx
	testl	$0x02, %ecx
	je	.L04

	movd	(%eax), %mm4
	addl	$4, %eax

	pshufw	$0x00, %mm4, %mm0
	pshufw	$0x55, %mm4, %mm2

	pand	%mm5, %mm0
	pand	%mm5, %mm2
	pmullw	%mm6, %mm0
	pmullw	%mm6, %mm2
#if SCALE_ADJUST > 0
	psrlw	$SCALE_ADJUST, %mm0
	psrlw	$SCALE_ADJUST, %mm2
#endif
	pmulhuw	%mm7, %mm0
	pmulhuw	%mm7, %mm2

	por	alpha, %mm0
	por	alpha, %mm2

	packuswb	%mm2, %mm0

	movq	%mm0, (%edx)
	addl	$8, %edx

.L04:
	/* At this point there can be at most 1 pixel left to process.
	 * Process it if needed.
	 */

	testl	$0x01, %ecx
	je	.L01

	movzwl	(%eax), %ecx
	movd	%ecx, %mm4

	pshufw	$0x00, %mm4, %mm0

	pand	%mm5, %mm0
	pmullw	%mm6, %mm0
#if SCALE_ADJUST > 0
	psrlw	$SCALE_ADJUST, %mm0
#endif
	pmulhuw	%mm7, %mm0

	por	alpha, %mm0

	packuswb	%mm0, %mm0

	movd	%mm0, (%edx)

.L01:
#ifdef USE_INNER_EMMS
	emms
#endif
	ret
#endif /* !defined(__DJGPP__) && !defined(__MINGW32__) */
