106487945186f546094b78cc7021a2bc1e695c17bIan Romanick/*
206487945186f546094b78cc7021a2bc1e695c17bIan Romanick * (C) Copyright IBM Corporation 2004
306487945186f546094b78cc7021a2bc1e695c17bIan Romanick * All Rights Reserved.
406487945186f546094b78cc7021a2bc1e695c17bIan Romanick *
506487945186f546094b78cc7021a2bc1e695c17bIan Romanick * Permission is hereby granted, free of charge, to any person obtaining a
606487945186f546094b78cc7021a2bc1e695c17bIan Romanick * copy of this software and associated documentation files (the "Software"),
706487945186f546094b78cc7021a2bc1e695c17bIan Romanick * to deal in the Software without restriction, including without limitation
806487945186f546094b78cc7021a2bc1e695c17bIan Romanick * on the rights to use, copy, modify, merge, publish, distribute, sub
906487945186f546094b78cc7021a2bc1e695c17bIan Romanick * license, and/or sell copies of the Software, and to permit persons to whom
1006487945186f546094b78cc7021a2bc1e695c17bIan Romanick * the Software is furnished to do so, subject to the following conditions:
1106487945186f546094b78cc7021a2bc1e695c17bIan Romanick *
1206487945186f546094b78cc7021a2bc1e695c17bIan Romanick * The above copyright notice and this permission notice (including the next
1306487945186f546094b78cc7021a2bc1e695c17bIan Romanick * paragraph) shall be included in all copies or substantial portions of the
1406487945186f546094b78cc7021a2bc1e695c17bIan Romanick * Software.
1506487945186f546094b78cc7021a2bc1e695c17bIan Romanick *
1606487945186f546094b78cc7021a2bc1e695c17bIan Romanick * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
1706487945186f546094b78cc7021a2bc1e695c17bIan Romanick * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
1806487945186f546094b78cc7021a2bc1e695c17bIan Romanick * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
1906487945186f546094b78cc7021a2bc1e695c17bIan Romanick * IBM AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
2006487945186f546094b78cc7021a2bc1e695c17bIan Romanick * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
2106487945186f546094b78cc7021a2bc1e695c17bIan Romanick * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
2206487945186f546094b78cc7021a2bc1e695c17bIan Romanick * USE OR OTHER DEALINGS IN THE SOFTWARE.
2306487945186f546094b78cc7021a2bc1e695c17bIan Romanick */
2406487945186f546094b78cc7021a2bc1e695c17bIan Romanick
2506487945186f546094b78cc7021a2bc1e695c17bIan Romanick/**
2606487945186f546094b78cc7021a2bc1e695c17bIan Romanick * \file read_rgba_span_x86.S
2706487945186f546094b78cc7021a2bc1e695c17bIan Romanick * Optimized routines to transfer pixel data from the framebuffer to a
2806487945186f546094b78cc7021a2bc1e695c17bIan Romanick * buffer in main memory.
2906487945186f546094b78cc7021a2bc1e695c17bIan Romanick *
3006487945186f546094b78cc7021a2bc1e695c17bIan Romanick * \author Ian Romanick <idr@us.ibm.com>
3106487945186f546094b78cc7021a2bc1e695c17bIan Romanick */
3206487945186f546094b78cc7021a2bc1e695c17bIan Romanick
3306487945186f546094b78cc7021a2bc1e695c17bIan Romanick	.file	"read_rgba_span_x86.S"
34e02dc139520fab9f7189e0ae390f72ed674bb7d7Vinson Lee#if !defined(__DJGPP__) && !defined(__MINGW32__) && !defined(__APPLE__) /* this one cries for assyntax.h */
/* Kevin F. Quinn 2nd July 2006
 * Replaced data segment constants with text-segment instructions.
 */

/* Per-dword masks used by the red/blue swap sequences.  A source pixel,
 * read as a little-endian dword, is 0xAARRGGBB.
 */
#define	SPAN_MASK_KEEP_AG	0xff00ff00	/* bytes 1 and 3 stay in place */
#define	SPAN_MASK_PICK_R	0x00ff0000	/* byte 2 only */

/**
 * Build both swizzle masks in registers using only immediates and the
 * stack, so no data-segment constant is referenced.
 *
 * Each mask dword is pushed four times, which leaves 16 bytes of it at
 * (%esp) for the load; an 8-byte load simply reads the first half.
 *
 * \param load_insn  move instruction used to fetch the mask from (%esp)
 * \param keep_reg   register that receives the replicated 0xff00ff00
 * \param pick_reg   register that receives the replicated 0x00ff0000
 *
 * %esp is back at its entry value afterwards; EFLAGS are overwritten by
 * the closing addl.
 */
#define	LOAD_MASK(load_insn,keep_reg,pick_reg) \
	pushl	$SPAN_MASK_KEEP_AG ;\
	pushl	$SPAN_MASK_KEEP_AG ;\
	pushl	$SPAN_MASK_KEEP_AG ;\
	pushl	$SPAN_MASK_KEEP_AG ;\
	load_insn	(%esp), keep_reg ;\
	pushl	$SPAN_MASK_PICK_R ;\
	pushl	$SPAN_MASK_PICK_R ;\
	pushl	$SPAN_MASK_PICK_R ;\
	pushl	$SPAN_MASK_PICK_R ;\
	load_insn	(%esp), pick_reg ;\
	addl	$32, %esp	/* drop all eight pushed dwords */
5006487945186f546094b78cc7021a2bc1e695c17bIan Romanick
51d540e8e9dfc18063f98a31cb3d078d183cf8fce6Benno Schulenberg/* I implemented these as macros because they appear in several places,
5206487945186f546094b78cc7021a2bc1e695c17bIan Romanick * and I've tweaked them a number of times.  I got tired of changing every
5306487945186f546094b78cc7021a2bc1e695c17bIan Romanick * place they appear. :)
5406487945186f546094b78cc7021a2bc1e695c17bIan Romanick */
5506487945186f546094b78cc7021a2bc1e695c17bIan Romanick
/**
 * Convert the pixel at (%ebx) and store it at (%ecx), then step both
 * pointers forward by one 4-byte pixel.
 *
 * Clobbers %eax and EFLAGS.
 */
#define DO_ONE_PIXEL() \
	movl	(%ebx), %eax  /* fetch one ARGB dword */ ; \
	bswap	%eax          /* ARGB -> BGRA */ ; \
	rorl	$8, %eax      /* BGRA -> ABGR */ ; \
	addl	$4, %ebx      /* next source pixel */ ; \
	movl	%eax, (%ecx)  /* ABGR lands in memory as R, G, B, A */ ; \
	addl	$4, %ecx      /* next destination pixel */
6306487945186f546094b78cc7021a2bc1e695c17bIan Romanick
/**
 * Convert the pixel at (%ebx) and store it at (%ecx) without advancing
 * either pointer; meant for the final pixel of a span.
 *
 * Clobbers %eax and EFLAGS.
 *
 * The definition deliberately ends on the store: a trailing line
 * continuation would make whatever follows the macro part of its body.
 */
#define DO_ONE_LAST_PIXEL() \
	movl	(%ebx), %eax ; \
	bswap	%eax          /* ARGB -> BGRA */ ; \
	rorl	$8, %eax      /* BGRA -> ABGR */ ; \
	movl	%eax, (%ecx)  /* ABGR -> R, G, B, A */
7006487945186f546094b78cc7021a2bc1e695c17bIan Romanick
/**
 * MMX optimized version of the BGRA8888_REV to RGBA copy routine.
 *
 * Arguments are read from the stack; the function returns with a plain
 * `ret`, so the caller removes them.  Offsets relative to %esp on entry:
 *    4(%esp)  source pointer
 *    8(%esp)  destination pointer
 *   12(%esp)  number of pixels to copy (tested as signed; <= 0 copies
 *             nothing)
 *
 * Register roles:
 *   %ebx  source cursor (saved and restored here)
 *   %ecx  destination cursor
 *   %edx  pixels still to convert
 *   %eax  scratch, then pixel-pair loop counter
 *   %mm1  0xff00ff00 mask, %mm2  0x00ff0000 mask
 *   %mm0, %mm3, %mm4  scratch
 *
 * \warning
 * Unless USE_INNER_EMMS is defined, this function assumes that the caller
 * will issue the EMMS instruction at the correct places.
 */

.globl _generic_read_RGBA_span_BGRA8888_REV_MMX
#ifndef USE_DRICORE
.hidden _generic_read_RGBA_span_BGRA8888_REV_MMX
#endif
	.type	_generic_read_RGBA_span_BGRA8888_REV_MMX, @function
_generic_read_RGBA_span_BGRA8888_REV_MMX:
	pushl	%ebx

#ifdef USE_INNER_EMMS
	emms
#endif

	movl	8(%esp), %ebx	/* source pointer */
	movl	16(%esp), %edx	/* number of pixels to copy */
	movl	12(%esp), %ecx	/* destination pointer */

	/* Bail if there's nothing to do.  This is checked before the masks
	 * are loaded so that the early exit never leaves MMX registers
	 * written without a matching EMMS.
	 */
	testl	%edx, %edx
	jle	.Lmmx_done

	LOAD_MASK(movq,%mm1,%mm2)

	/* (-source >> 2) & 1 is set when a 4-byte aligned source pointer is
	 * not 8-byte aligned.  Convert a single pixel first in that case so
	 * the 8-byte loads below start on an 8-byte boundary.
	 */
	movl	%ebx, %eax
	negl	%eax
	sarl	$2, %eax
	andl	$1, %eax
	je	.Lmmx_pairs

	subl	%eax, %edx	/* %eax is 1 here */
	DO_ONE_PIXEL()
.Lmmx_pairs:

	/* Would it be faster to unroll this loop once and process 4 pixels
	 * per pass, instead of just two?
	 */

	movl	%edx, %eax
	shrl	%eax		/* pair count; ZF set if it is zero */
	jmp	.Lmmx_pair_test
.Lmmx_pair_loop:
	movq	(%ebx), %mm0
	addl	$8, %ebx

	/* These 9 instructions do what a byte shuffle such as PSHUFB
	 * (not available in plain MMX) could do in 1. :(
	 *
	 *   %mm3 = (pixel & 0x00ff0000) >> 16   byte 2 moved down to byte 0
	 *   %mm4 = (pixel << 16) & 0x00ff0000   byte 0 moved up to byte 2
	 *   %mm0 =  pixel & 0xff00ff00          bytes 1 and 3 unchanged
	 *
	 * The shifts are 64 bits wide, but the masks discard anything that
	 * crosses from one pixel into the other.
	 */

	movq	%mm0, %mm3
	movq	%mm0, %mm4

	pand	%mm2, %mm3
	psllq	$16, %mm4
	psrlq	$16, %mm3
	pand	%mm2, %mm4

	pand	%mm1, %mm0
	por	%mm4, %mm3
	por	%mm3, %mm0

	movq	%mm0, (%ecx)
	addl	$8, %ecx
	subl	$1, %eax
.Lmmx_pair_test:
	jne	.Lmmx_pair_loop

#ifdef USE_INNER_EMMS
	emms
#endif

	/* At this point there is either 1 or 0 pixels remaining to be
	 * converted.  Convert the last pixel, if needed.  Only integer
	 * registers are used from here on.
	 */

	testl	$1, %edx
	je	.Lmmx_done

	DO_ONE_LAST_PIXEL()

.Lmmx_done:
	popl	%ebx
	ret
	.size	_generic_read_RGBA_span_BGRA8888_REV_MMX, .-_generic_read_RGBA_span_BGRA8888_REV_MMX
16006487945186f546094b78cc7021a2bc1e695c17bIan Romanick
16106487945186f546094b78cc7021a2bc1e695c17bIan Romanick
/**
 * SSE optimized version of the BGRA8888_REV to RGBA copy routine.  SSE
 * instructions are only actually used to read data from the framebuffer.
 * In practice, the speed-up is pretty small.
 *
 * Arguments are read from the stack; the function returns with a plain
 * `ret`, so the caller removes them.  Offsets relative to %esp on entry:
 *    4(%esp)  source pointer
 *    8(%esp)  destination pointer
 *   12(%esp)  number of pixels to copy (tested as signed; <= 0 copies
 *             nothing)
 *
 * Register roles:
 *   %ebx  source cursor            %ecx  destination cursor
 *   %edx  pixels left after the alignment prologue
 *   %esi  pixels handled by the alignment prologue (0..3)
 *   %ebp  saved %esp while a 16-byte aligned scratch slot is in use
 *   %eax  scratch, then 4-pixel loop counter
 *   %mm1  0xff00ff00 mask, %mm2  0x00ff0000 mask
 *   %esi, %ebx and %ebp are saved and restored here.
 *
 * NOTE(review): the aligned 16-byte loads rely on the source pointer
 * being a multiple of 4; the prologue only corrects by whole pixels.
 *
 * \todo
 * Do some more testing and determine if there's any reason to have this
 * function in addition to the MMX version.
 *
 * \warning
 * Unless USE_INNER_EMMS is defined, this function assumes that the caller
 * will issue the EMMS instruction at the correct places.
 */

.globl _generic_read_RGBA_span_BGRA8888_REV_SSE
#ifndef USE_DRICORE
.hidden _generic_read_RGBA_span_BGRA8888_REV_SSE
#endif
	.type	_generic_read_RGBA_span_BGRA8888_REV_SSE, @function
_generic_read_RGBA_span_BGRA8888_REV_SSE:
	pushl	%esi
	pushl	%ebx
	pushl	%ebp

#ifdef USE_INNER_EMMS
	emms
#endif

	movl	16(%esp), %ebx	/* source pointer */
	movl	24(%esp), %edx	/* number of pixels to copy */
	movl	20(%esp), %ecx	/* destination pointer */

	/* Bail if there's nothing to do.  This is checked before the masks
	 * are loaded so that the early exit never leaves MMX registers
	 * written without a matching EMMS.
	 */
	testl	%edx, %edx
	jle	.Lsse_done

	LOAD_MASK(movq,%mm1,%mm2)

	/* Reserve a 16-byte aligned scratch slot; %ebp remembers the real
	 * stack pointer until the 4-pixel loop is finished.
	 */
	movl	%esp, %ebp
	subl	$16, %esp
	andl	$0xfffffff0, %esp

	/* %esi = min(pixels needed to reach a 16-byte aligned source,
	 *            pixels requested)
	 */
	movl	%ebx, %eax
	movl	%edx, %esi

	negl	%eax
	andl	$15, %eax
	sarl	$2, %eax
	cmpl	%edx, %eax
	cmovle	%eax, %esi

	subl	%esi, %edx

	testl	$1, %esi
	je	.Lsse_head_pair

	DO_ONE_PIXEL()
.Lsse_head_pair:

	testl	$2, %esi
	je	.Lsse_quads

	movq	(%ebx), %mm0
	addl	$8, %ebx

	movq	%mm0, %mm3
	movq	%mm0, %mm4

	pand	%mm2, %mm3
	psllq	$16, %mm4
	psrlq	$16, %mm3
	pand	%mm2, %mm4

	pand	%mm1, %mm0
	por	%mm4, %mm3
	por	%mm3, %mm0

	movq	%mm0, (%ecx)
	addl	$8, %ecx
.Lsse_quads:

	movl	%edx, %eax
	shrl	$2, %eax	/* 4-pixel group count; ZF set if zero */
	jmp	.Lsse_quad_test
.Lsse_quad_loop:
	movaps	(%ebx), %xmm0
	addl	$16, %ebx

	/* This would be so much better if we could just move directly from
	 * an SSE register to an MMX register.  Unfortunately, that
	 * functionality wasn't introduced until SSE2 with the MOVDQ2Q
	 * instruction.
	 */

	movaps	%xmm0, (%esp)
	movq	(%esp), %mm0
	movq	8(%esp), %mm5

	/* Two interleaved copies of the MMX swizzle: %mm0/%mm3/%mm4 handle
	 * the low pixel pair, %mm5/%mm6/%mm7 the high pair.
	 */
	movq	%mm0, %mm3
	movq	%mm0, %mm4
	movq	%mm5, %mm6
	movq	%mm5, %mm7

	pand	%mm2, %mm3
	pand	%mm2, %mm6

	psllq	$16, %mm4
	psllq	$16, %mm7

	psrlq	$16, %mm3
	psrlq	$16, %mm6

	pand	%mm2, %mm4
	pand	%mm2, %mm7

	pand	%mm1, %mm0
	pand	%mm1, %mm5

	por	%mm4, %mm3
	por	%mm7, %mm6

	por	%mm3, %mm0
	por	%mm6, %mm5

	movq	%mm0, (%ecx)
	movq	%mm5, 8(%ecx)
	addl	$16, %ecx

	subl	$1, %eax
.Lsse_quad_test:
	jne	.Lsse_quad_loop

	movl	%ebp, %esp	/* release the aligned scratch slot */

	/* At this point there are between 0 and 3 pixels remaining to be
	 * converted.
	 */

	testl	$2, %edx
	je	.Lsse_tail_one

	movq	(%ebx), %mm0
	addl	$8, %ebx

	movq	%mm0, %mm3
	movq	%mm0, %mm4

	pand	%mm2, %mm3
	psllq	$16, %mm4
	psrlq	$16, %mm3
	pand	%mm2, %mm4

	pand	%mm1, %mm0
	por	%mm4, %mm3
	por	%mm3, %mm0

	movq	%mm0, (%ecx)
	addl	$8, %ecx
.Lsse_tail_one:

	/* The 2-pixel tail above is the last MMX user, so the closing EMMS
	 * has to come after it.  Only integer registers are used below.
	 */
#ifdef USE_INNER_EMMS
	emms
#endif

	testl	$1, %edx
	je	.Lsse_done

	DO_ONE_LAST_PIXEL()
.Lsse_done:
	popl	%ebp
	popl	%ebx
	popl	%esi
	ret
	.size	_generic_read_RGBA_span_BGRA8888_REV_SSE, .-_generic_read_RGBA_span_BGRA8888_REV_SSE
33406487945186f546094b78cc7021a2bc1e695c17bIan Romanick
33506487945186f546094b78cc7021a2bc1e695c17bIan Romanick
33606487945186f546094b78cc7021a2bc1e695c17bIan Romanick/**
33706487945186f546094b78cc7021a2bc1e695c17bIan Romanick * SSE2 optimized version of the BGRA8888_REV to RGBA copy routine.
33806487945186f546094b78cc7021a2bc1e695c17bIan Romanick */
33906487945186f546094b78cc7021a2bc1e695c17bIan Romanick
34006487945186f546094b78cc7021a2bc1e695c17bIan Romanick	.text
34106487945186f546094b78cc7021a2bc1e695c17bIan Romanick.globl _generic_read_RGBA_span_BGRA8888_REV_SSE2
342d1e28b22673777fe1290cda899abf73aad02e4aaChristopher James Halse Rogers#ifndef USE_DRICORE
343932dee87e3002be87dc3bcc49efd8ac9ac3e1fa4Brian Paul.hidden _generic_read_RGBA_span_BGRA8888_REV_SSE2
344d1e28b22673777fe1290cda899abf73aad02e4aaChristopher James Halse Rogers#endif
34506487945186f546094b78cc7021a2bc1e695c17bIan Romanick	.type	_generic_read_RGBA_span_BGRA8888_REV_SSE2, @function
34606487945186f546094b78cc7021a2bc1e695c17bIan Romanick_generic_read_RGBA_span_BGRA8888_REV_SSE2:
34706487945186f546094b78cc7021a2bc1e695c17bIan Romanick	pushl	%esi
34806487945186f546094b78cc7021a2bc1e695c17bIan Romanick	pushl	%ebx
34906487945186f546094b78cc7021a2bc1e695c17bIan Romanick
35014f0b7ea98d5cb2f805fc22796596ef878ae24cbRoland Scheidegger	LOAD_MASK(movdqu,%xmm1,%xmm2)
35106487945186f546094b78cc7021a2bc1e695c17bIan Romanick
35206487945186f546094b78cc7021a2bc1e695c17bIan Romanick	movl	12(%esp), %ebx	/* source pointer */
35306487945186f546094b78cc7021a2bc1e695c17bIan Romanick	movl	20(%esp), %edx	/* number of pixels to copy */
35406487945186f546094b78cc7021a2bc1e695c17bIan Romanick	movl	16(%esp), %ecx	/* destination pointer */
35506487945186f546094b78cc7021a2bc1e695c17bIan Romanick
35606487945186f546094b78cc7021a2bc1e695c17bIan Romanick	movl	%ebx, %eax
35706487945186f546094b78cc7021a2bc1e695c17bIan Romanick	movl	%edx, %esi
35806487945186f546094b78cc7021a2bc1e695c17bIan Romanick
3594d652b7855d1f852fd81fcffe7dabf9c685c9a0bRoland Scheidegger	testl	%edx, %edx
3604d652b7855d1f852fd81fcffe7dabf9c685c9a0bRoland Scheidegger	jle	.L46		/* Bail if there's nothing to do. */
3614d652b7855d1f852fd81fcffe7dabf9c685c9a0bRoland Scheidegger
36206487945186f546094b78cc7021a2bc1e695c17bIan Romanick	/* If the source pointer isn't a multiple of 16 we have to process
36306487945186f546094b78cc7021a2bc1e695c17bIan Romanick	 * a few pixels the "slow" way to get the address aligned for
	 * the SSE fetch instructions.
36506487945186f546094b78cc7021a2bc1e695c17bIan Romanick	 */
36606487945186f546094b78cc7021a2bc1e695c17bIan Romanick
36706487945186f546094b78cc7021a2bc1e695c17bIan Romanick	negl	%eax
36806487945186f546094b78cc7021a2bc1e695c17bIan Romanick	andl	$15, %eax
36906487945186f546094b78cc7021a2bc1e695c17bIan Romanick	sarl	$2, %eax
37006487945186f546094b78cc7021a2bc1e695c17bIan Romanick
37106487945186f546094b78cc7021a2bc1e695c17bIan Romanick	cmpl	%edx, %eax
37206487945186f546094b78cc7021a2bc1e695c17bIan Romanick	cmovbe	%eax, %esi
37306487945186f546094b78cc7021a2bc1e695c17bIan Romanick	subl	%esi, %edx
37406487945186f546094b78cc7021a2bc1e695c17bIan Romanick
37506487945186f546094b78cc7021a2bc1e695c17bIan Romanick	testl	$1, %esi
37606487945186f546094b78cc7021a2bc1e695c17bIan Romanick	je	.L41
37706487945186f546094b78cc7021a2bc1e695c17bIan Romanick
37806487945186f546094b78cc7021a2bc1e695c17bIan Romanick	DO_ONE_PIXEL()
37906487945186f546094b78cc7021a2bc1e695c17bIan Romanick.L41:
38006487945186f546094b78cc7021a2bc1e695c17bIan Romanick	testl	$2, %esi
38106487945186f546094b78cc7021a2bc1e695c17bIan Romanick	je	.L40
38206487945186f546094b78cc7021a2bc1e695c17bIan Romanick
38306487945186f546094b78cc7021a2bc1e695c17bIan Romanick	movq	(%ebx), %xmm0
38406487945186f546094b78cc7021a2bc1e695c17bIan Romanick	addl	$8, %ebx
38506487945186f546094b78cc7021a2bc1e695c17bIan Romanick
38606487945186f546094b78cc7021a2bc1e695c17bIan Romanick	movdqa	%xmm0, %xmm3
38706487945186f546094b78cc7021a2bc1e695c17bIan Romanick	movdqa	%xmm0, %xmm4
38806487945186f546094b78cc7021a2bc1e695c17bIan Romanick	andps	%xmm1, %xmm0
38906487945186f546094b78cc7021a2bc1e695c17bIan Romanick
39006487945186f546094b78cc7021a2bc1e695c17bIan Romanick	andps	%xmm2, %xmm3
39106487945186f546094b78cc7021a2bc1e695c17bIan Romanick	pslldq	$2, %xmm4
39206487945186f546094b78cc7021a2bc1e695c17bIan Romanick	psrldq	$2, %xmm3
39306487945186f546094b78cc7021a2bc1e695c17bIan Romanick	andps	%xmm2, %xmm4
39406487945186f546094b78cc7021a2bc1e695c17bIan Romanick
39506487945186f546094b78cc7021a2bc1e695c17bIan Romanick	orps	%xmm4, %xmm3
39606487945186f546094b78cc7021a2bc1e695c17bIan Romanick	orps	%xmm3, %xmm0
39706487945186f546094b78cc7021a2bc1e695c17bIan Romanick
39806487945186f546094b78cc7021a2bc1e695c17bIan Romanick	movq	%xmm0, (%ecx)
39906487945186f546094b78cc7021a2bc1e695c17bIan Romanick	addl	$8, %ecx
40006487945186f546094b78cc7021a2bc1e695c17bIan Romanick.L40:
40106487945186f546094b78cc7021a2bc1e695c17bIan Romanick
40206487945186f546094b78cc7021a2bc1e695c17bIan Romanick	/* Would it be worth having a specialized version of this loop for
40306487945186f546094b78cc7021a2bc1e695c17bIan Romanick	 * the case where the destination is 16-byte aligned?  That version
	 * would be identical except that it could use movdqa instead of
40506487945186f546094b78cc7021a2bc1e695c17bIan Romanick	 * movdqu.
40606487945186f546094b78cc7021a2bc1e695c17bIan Romanick	 */
40706487945186f546094b78cc7021a2bc1e695c17bIan Romanick
40806487945186f546094b78cc7021a2bc1e695c17bIan Romanick	movl	%edx, %eax
40906487945186f546094b78cc7021a2bc1e695c17bIan Romanick	shrl	$2, %eax
41006487945186f546094b78cc7021a2bc1e695c17bIan Romanick	jmp	.L42
41106487945186f546094b78cc7021a2bc1e695c17bIan Romanick.L43:
41206487945186f546094b78cc7021a2bc1e695c17bIan Romanick	movdqa	(%ebx), %xmm0
41306487945186f546094b78cc7021a2bc1e695c17bIan Romanick	addl	$16, %ebx
41406487945186f546094b78cc7021a2bc1e695c17bIan Romanick
41506487945186f546094b78cc7021a2bc1e695c17bIan Romanick	movdqa	%xmm0, %xmm3
41606487945186f546094b78cc7021a2bc1e695c17bIan Romanick	movdqa	%xmm0, %xmm4
41706487945186f546094b78cc7021a2bc1e695c17bIan Romanick	andps	%xmm1, %xmm0
41806487945186f546094b78cc7021a2bc1e695c17bIan Romanick
41906487945186f546094b78cc7021a2bc1e695c17bIan Romanick	andps	%xmm2, %xmm3
42006487945186f546094b78cc7021a2bc1e695c17bIan Romanick	pslldq	$2, %xmm4
42106487945186f546094b78cc7021a2bc1e695c17bIan Romanick	psrldq	$2, %xmm3
42206487945186f546094b78cc7021a2bc1e695c17bIan Romanick	andps	%xmm2, %xmm4
42306487945186f546094b78cc7021a2bc1e695c17bIan Romanick
42406487945186f546094b78cc7021a2bc1e695c17bIan Romanick	orps	%xmm4, %xmm3
42506487945186f546094b78cc7021a2bc1e695c17bIan Romanick	orps	%xmm3, %xmm0
42606487945186f546094b78cc7021a2bc1e695c17bIan Romanick
42706487945186f546094b78cc7021a2bc1e695c17bIan Romanick	movdqu	%xmm0, (%ecx)
42806487945186f546094b78cc7021a2bc1e695c17bIan Romanick	addl	$16, %ecx
42906487945186f546094b78cc7021a2bc1e695c17bIan Romanick	subl	$1, %eax
43006487945186f546094b78cc7021a2bc1e695c17bIan Romanick.L42:
43106487945186f546094b78cc7021a2bc1e695c17bIan Romanick	jne	.L43
43206487945186f546094b78cc7021a2bc1e695c17bIan Romanick
43306487945186f546094b78cc7021a2bc1e695c17bIan Romanick
43406487945186f546094b78cc7021a2bc1e695c17bIan Romanick	/* There may be up to 3 pixels remaining to be copied.  Take care
43506487945186f546094b78cc7021a2bc1e695c17bIan Romanick	 * of them now.  We do the 2 pixel case first because the data
43606487945186f546094b78cc7021a2bc1e695c17bIan Romanick	 * will be aligned.
43706487945186f546094b78cc7021a2bc1e695c17bIan Romanick	 */
43806487945186f546094b78cc7021a2bc1e695c17bIan Romanick
43906487945186f546094b78cc7021a2bc1e695c17bIan Romanick	testl	$2, %edx
44006487945186f546094b78cc7021a2bc1e695c17bIan Romanick	je	.L47
44106487945186f546094b78cc7021a2bc1e695c17bIan Romanick
44206487945186f546094b78cc7021a2bc1e695c17bIan Romanick	movq	(%ebx), %xmm0
4434b7d301c94d33394550322768a9d2232087b2d64Xiang, Haihao	addl	$8, %ebx
4444b7d301c94d33394550322768a9d2232087b2d64Xiang, Haihao
44506487945186f546094b78cc7021a2bc1e695c17bIan Romanick	movdqa	%xmm0, %xmm3
44606487945186f546094b78cc7021a2bc1e695c17bIan Romanick	movdqa	%xmm0, %xmm4
44706487945186f546094b78cc7021a2bc1e695c17bIan Romanick	andps	%xmm1, %xmm0
44806487945186f546094b78cc7021a2bc1e695c17bIan Romanick
44906487945186f546094b78cc7021a2bc1e695c17bIan Romanick	andps	%xmm2, %xmm3
45006487945186f546094b78cc7021a2bc1e695c17bIan Romanick	pslldq	$2, %xmm4
45106487945186f546094b78cc7021a2bc1e695c17bIan Romanick	psrldq	$2, %xmm3
45206487945186f546094b78cc7021a2bc1e695c17bIan Romanick	andps	%xmm2, %xmm4
45306487945186f546094b78cc7021a2bc1e695c17bIan Romanick
45406487945186f546094b78cc7021a2bc1e695c17bIan Romanick	orps	%xmm4, %xmm3
45506487945186f546094b78cc7021a2bc1e695c17bIan Romanick	orps	%xmm3, %xmm0
45606487945186f546094b78cc7021a2bc1e695c17bIan Romanick
45706487945186f546094b78cc7021a2bc1e695c17bIan Romanick	movq	%xmm0, (%ecx)
4584b7d301c94d33394550322768a9d2232087b2d64Xiang, Haihao	addl	$8, %ecx
45906487945186f546094b78cc7021a2bc1e695c17bIan Romanick.L47:
46006487945186f546094b78cc7021a2bc1e695c17bIan Romanick
46106487945186f546094b78cc7021a2bc1e695c17bIan Romanick	testl	$1, %edx
46206487945186f546094b78cc7021a2bc1e695c17bIan Romanick	je	.L46
46306487945186f546094b78cc7021a2bc1e695c17bIan Romanick
46406487945186f546094b78cc7021a2bc1e695c17bIan Romanick	DO_ONE_LAST_PIXEL()
46506487945186f546094b78cc7021a2bc1e695c17bIan Romanick.L46:
46606487945186f546094b78cc7021a2bc1e695c17bIan Romanick
46706487945186f546094b78cc7021a2bc1e695c17bIan Romanick	popl	%ebx
46806487945186f546094b78cc7021a2bc1e695c17bIan Romanick	popl	%esi
46906487945186f546094b78cc7021a2bc1e695c17bIan Romanick	ret
47006487945186f546094b78cc7021a2bc1e695c17bIan Romanick	.size	_generic_read_RGBA_span_BGRA8888_REV_SSE2, .-_generic_read_RGBA_span_BGRA8888_REV_SSE2
471bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick
472bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick
473bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick
/* Constants used by _generic_read_RGBA_span_RGB565_MMX.
 *
 * Every FOO_L / FOO_H pair is the low / high 32 bits of one 64-bit MMX
 * operand made of four 16-bit words.  Word 0 (low half of FOO_L) lines up
 * with pixel bits 15..11, word 1 (high half of FOO_L) with bits 10..5,
 * word 2 (low half of FOO_H) with bits 4..0, and word 3 (high half of
 * FOO_H) with the alpha slot.
 */

/* Per-word masks that isolate one color field of a 16-bit pixel. */
#define MASK_565_L	0x07e0f800
#define MASK_565_H	0x0000001f

/* SCALE_ADJUST selects the rounding behavior of the conversion:
 *   5 - results match the classic C implementation in Mesa exactly.
 *   0 - slightly faster, at a small cost in accuracy.
 * It is also the right-shift applied between the two multiplies.
 */
#define SCALE_ADJUST	5

/* PRESCALE_* is the pmullw multiplier applied to the masked fields and
 * SCALE_* is the pmulhuw multiplier that produces the final 8-bit value.
 */
#if SCALE_ADJUST == 0
#define PRESCALE_L 0x00200001
#define PRESCALE_H 0x00000800
#define SCALE_L 0x01040108
#define SCALE_H 0x00000108
#elif SCALE_ADJUST == 5
#define PRESCALE_L 0x00100001
#define PRESCALE_H 0x00000200
#define SCALE_L 0x40C620E8
#define SCALE_H 0x0000839d
#else
#error SCALE_ADJUST must either be 5 or 0.
#endif

/* OR-ed into every converted pixel: sets word 3 (alpha) to 0x00ff. */
#define ALPHA_L 0x00000000
#define ALPHA_H 0x00ff0000
496bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick
/**
 * MMX optimized version of the RGB565 to RGBA copy routine.
 *
 * Stack arguments (32-bit, read relative to %esp on entry):
 *    4(%esp)  source pointer, 16-bit pixels
 *    8(%esp)  destination pointer, 4 bytes are written per pixel
 *   12(%esp)  number of pixels to copy
 *
 * For every source pixel the routine stores four bytes, in increasing
 * address order: bits 15..11 (red), bits 10..5 (green) and bits 4..0
 * (blue), each expanded to 8 bits, followed by an alpha byte of 0xff.
 * A negative pixel count returns immediately without accessing memory.
 *
 * Register usage:
 *   %eax  source cursor              %mm5  per-component masks
 *   %edx  destination cursor         %mm6  pre-scale multipliers
 *   %ecx  loop counter / scratch     %mm7  scale multipliers
 *   %mm4  pixels fetched from source %mm3  alpha constant
 *   %mm0, %mm2  pixels being converted
 *
 * Only %eax, %ecx, %edx and the MMX registers listed above are modified.
 * emms is executed on entry and exit only when USE_INNER_EMMS is defined;
 * otherwise the MMX state is left as-is for the caller to clean up.
 *
 * NOTE(review): pshufw and pmulhuw are not part of the original MMX
 * instruction set (they arrived with SSE / AMD extended MMX) - confirm
 * that the caller's CPU feature check covers them.
 */

	.text
	.globl	_generic_read_RGBA_span_RGB565_MMX
#ifndef USE_DRICORE
	.hidden	_generic_read_RGBA_span_RGB565_MMX
#endif
	.type	_generic_read_RGBA_span_RGB565_MMX, @function

_generic_read_RGBA_span_RGB565_MMX:

#ifdef USE_INNER_EMMS
	emms
#endif

	movl	4(%esp), %eax	/* source pointer */
	movl	8(%esp), %edx	/* destination pointer */
	movl	12(%esp), %ecx	/* number of pixels to copy */

	/* Build the four 64-bit constants on the stack and load them into
	 * MMX registers.  The high half is pushed first so that the low half
	 * ends up at the lower address.  All 32 bytes are released at once
	 * afterwards; the values are only needed in the registers.
	 */

	pushl	$MASK_565_H
	pushl	$MASK_565_L
	movq	(%esp), %mm5
	pushl	$PRESCALE_H
	pushl	$PRESCALE_L
	movq	(%esp), %mm6
	pushl	$SCALE_H
	pushl	$SCALE_L
	movq	(%esp), %mm7
	pushl	$ALPHA_H
	pushl	$ALPHA_L
	movq	(%esp), %mm3
	addl	$32,%esp

	/* %ecx = number of 4-pixel groups.  The arithmetic shift keeps the
	 * sign of the count.  A shift by more than one bit leaves OF
	 * undefined, so the sign is tested with js (SF only) instead of jl
	 * (SF != OF).  Neither js nor jmp changes the flags, so the jne at
	 * .L02 still sees the ZF produced by sarl.
	 */

	sarl	$2, %ecx
	js	.L01		/* Bail early if the count is negative. */
	jmp	.L02

.L03:
	/* Fetch 4 RGB565 pixels into %mm4.  Distribute the first and
	 * second pixels into the four words of %mm0 and %mm2.
	 */

	movq	(%eax), %mm4
	addl	$8, %eax

	pshufw	$0x00, %mm4, %mm0
	pshufw	$0x55, %mm4, %mm2


	/* Mask the pixels so that each word of each register contains only
	 * one color component.
	 */

	pand	%mm5, %mm0
	pand	%mm5, %mm2


	/* Adjust the component values so that they are as small as possible,
	 * but large enough so that we can multiply them by an unsigned 16-bit
	 * number and get a value as large as 0x00ff0000.
	 */

	pmullw	%mm6, %mm0
	pmullw	%mm6, %mm2
#if SCALE_ADJUST > 0
	psrlw	$SCALE_ADJUST, %mm0
	psrlw	$SCALE_ADJUST, %mm2
#endif

	/* Scale the input component values to be on the range
	 * [0, 0x00ff0000].  This is the real magic of the whole routine.
	 * pmulhuw keeps only the upper 16 bits of each product, so every
	 * word ends up in the range [0, 0xff].
	 */

	pmulhuw	%mm7, %mm0
	pmulhuw	%mm7, %mm2


	/* Always set the alpha value to 0xff.
	 */

	por	%mm3, %mm0
	por	%mm3, %mm2


	/* Pack the 16-bit values to 8-bit values and store the converted
	 * pixel data.  %mm0 supplies the first output pixel and %mm2 the
	 * second.
	 */

	packuswb	%mm2, %mm0
	movq	%mm0, (%edx)
	addl	$8, %edx

	/* Same conversion for the third and fourth pixels of the group. */

	pshufw	$0xaa, %mm4, %mm0
	pshufw	$0xff, %mm4, %mm2

	pand	%mm5, %mm0
	pand	%mm5, %mm2
	pmullw	%mm6, %mm0
	pmullw	%mm6, %mm2
#if SCALE_ADJUST > 0
	psrlw	$SCALE_ADJUST, %mm0
	psrlw	$SCALE_ADJUST, %mm2
#endif
	pmulhuw	%mm7, %mm0
	pmulhuw	%mm7, %mm2

	por	%mm3, %mm0
	por	%mm3, %mm2

	packuswb	%mm2, %mm0

	movq	%mm0, (%edx)
	addl	$8, %edx

	subl	$1, %ecx
.L02:
	/* ZF comes from sarl on entry and from subl on every later pass. */
	jne	.L03


	/* At this point there can be at most 3 pixels left to process.  If
	 * there is either 2 or 3 left, process 2.  The loop counter has been
	 * consumed, so the original pixel count is reloaded from the stack.
	 */

	movl	12(%esp), %ecx
	testl	$0x02, %ecx
	je	.L04

	movd	(%eax), %mm4
	addl	$4, %eax

	pshufw	$0x00, %mm4, %mm0
	pshufw	$0x55, %mm4, %mm2

	pand	%mm5, %mm0
	pand	%mm5, %mm2
	pmullw	%mm6, %mm0
	pmullw	%mm6, %mm2
#if SCALE_ADJUST > 0
	psrlw	$SCALE_ADJUST, %mm0
	psrlw	$SCALE_ADJUST, %mm2
#endif
	pmulhuw	%mm7, %mm0
	pmulhuw	%mm7, %mm2

	por	%mm3, %mm0
	por	%mm3, %mm2

	packuswb	%mm2, %mm0

	movq	%mm0, (%edx)
	addl	$8, %edx

.L04:
	/* At this point there can be at most 1 pixel left to process.
	 * Process it if needed.  %ecx still holds the original pixel count.
	 */

	testl	$0x01, %ecx
	je	.L01

	/* Load exactly 16 bits so that nothing past the last source pixel
	 * is read.
	 */

	movzwl	(%eax), %ecx
	movd	%ecx, %mm4

	pshufw	$0x00, %mm4, %mm0

	pand	%mm5, %mm0
	pmullw	%mm6, %mm0
#if SCALE_ADJUST > 0
	psrlw	$SCALE_ADJUST, %mm0
#endif
	pmulhuw	%mm7, %mm0

	por	%mm3, %mm0

	packuswb	%mm0, %mm0

	/* Store only the low 4 bytes: a single output pixel. */

	movd	%mm0, (%edx)

.L01:
#ifdef USE_INNER_EMMS
	emms
#endif
	ret
	.size	_generic_read_RGBA_span_RGB565_MMX, .-_generic_read_RGBA_span_RGB565_MMX
682e02dc139520fab9f7189e0ae390f72ed674bb7d7Vinson Lee#endif /* !defined(__DJGPP__) && !defined(__MINGW32__) && !defined(__APPLE__) */
683fcdc6a7d2488defd66bc7e8398c6d8c9a6190a1aKristian Høgsberg
/* On Linux ELF builds, emit an empty .note.GNU-stack section.  By GNU
 * toolchain convention this marks the object as not needing an
 * executable stack.
 */
#if defined(__ELF__) && defined(__linux__)
	.section	.note.GNU-stack,"",%progbits
#endif
687