/*
 * (C) Copyright IBM Corporation 2004
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
 * IBM AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

/**
 * \file read_rgba_span_x86.S
 * Optimized routines to transfer pixel data from the framebuffer to a
 * buffer in main memory.
 *
 * \author Ian Romanick <idr@us.ibm.com>
 */

	.file	"read_rgba_span_x86.S"
#if !defined(__DJGPP__) && !defined(__MINGW32__) && !defined(__APPLE__) /* this one cries for assyntax.h */
/* Kevin F. Quinn 2nd July 2006
 * Replaced data segment constants with text-segment instructions.
 */

/* LOAD_MASK(mvins, m1, m2)
 *
 * Builds the two byte-select masks on the stack and loads them with the
 * move instruction named by mvins (movq for MMX registers, movdqu for SSE2
 * registers in this file):
 *
 *   m1 <- 0xff00ff00 repeated: bytes 1 and 3 of every 32-bit pixel
 *   m2 <- 0x00ff0000 repeated: byte 2 of every 32-bit pixel
 *
 * Four dwords (16 bytes) are pushed for each mask so that a 16-byte load
 * sees a full pattern; an 8-byte load simply reads the first half.  The
 * 32 bytes of temporary stack are released again before the macro ends, so
 * %esp is unchanged afterwards.  The closing addl modifies the flags.
 */
#define	LOAD_MASK(mvins,m1,m2) \
	pushl	$0xff00ff00 ;\
	pushl	$0xff00ff00 ;\
	pushl	$0xff00ff00 ;\
	pushl	$0xff00ff00 ;\
	mvins	(%esp), m1 ;\
	pushl	$0x00ff0000 ;\
	pushl	$0x00ff0000 ;\
	pushl	$0x00ff0000 ;\
	pushl	$0x00ff0000 ;\
	mvins	(%esp), m2 ;\
	addl	$32, %esp

/* I implemented these as macros because they appear in several places,
 * and I've tweaked them a number of times.  I got tired of changing every
 * place they appear. :)
 */

/* DO_ONE_PIXEL()
 *
 * Converts the single 32-bit pixel at (%ebx), stores the result at (%ecx)
 * and advances both pointers by 4 bytes.  Uses %eax as scratch; the addl
 * instructions modify the flags.
 */
#define DO_ONE_PIXEL() \
	movl	(%ebx), %eax ; \
	addl	$4, %ebx ; \
	bswap	%eax          /* ARGB -> BGRA */ ; \
	rorl	$8, %eax      /* BGRA -> ABGR */ ; \
	movl	%eax, (%ecx)  /* ABGR -> R, G, B, A */ ; \
	addl	$4, %ecx

/* DO_ONE_LAST_PIXEL()
 *
 * Same conversion as DO_ONE_PIXEL(), but neither %ebx nor %ecx is advanced
 * because nothing follows the final pixel.  Uses %eax as scratch.
 */
#define DO_ONE_LAST_PIXEL() \
	movl	(%ebx), %eax ; \
	bswap	%eax          /* ARGB -> BGRA */ ; \
	rorl	$8, %eax      /* BGRA -> ABGR */ ; \
	movl	%eax, (%ecx)  /* ABGR -> R, G, B, A */


/**
 * MMX optimized version of the BGRA8888_REV to RGBA copy routine.
 *
 * \warning
 * This function assumes that the caller will issue the EMMS instruction
 * at the correct places.
*/

/* Stack arguments, as offsets from %esp after the single pushl below:
 *    8(%esp)  source pointer
 *   12(%esp)  destination pointer
 *   16(%esp)  number of pixels to copy
 *
 * Register use:
 *   %ebx        source cursor (saved on entry, restored on exit)
 *   %ecx        destination cursor
 *   %edx        pixels still to be converted
 *   %eax        scratch and loop counter
 *   %mm1, %mm2  byte masks loaded by LOAD_MASK
 *   %mm0, %mm3, %mm4  per-iteration scratch
 *
 * When USE_INNER_EMMS is defined, EMMS is issued on entry and again at the
 * single exit label .L20.  LOAD_MASK already writes MMX registers before
 * the pixel count is checked, so the closing EMMS has to be reached by the
 * early bail-out path as well as by the normal path.
 */

.globl _generic_read_RGBA_span_BGRA8888_REV_MMX
#ifndef USE_DRICORE
.hidden _generic_read_RGBA_span_BGRA8888_REV_MMX
#endif
	.type	_generic_read_RGBA_span_BGRA8888_REV_MMX, @function
_generic_read_RGBA_span_BGRA8888_REV_MMX:
	pushl	%ebx

#ifdef USE_INNER_EMMS
	emms
#endif
	LOAD_MASK(movq,%mm1,%mm2)

	movl	8(%esp), %ebx	/* source pointer */
	movl	16(%esp), %edx	/* number of pixels to copy */
	movl	12(%esp), %ecx	/* destination pointer */

	testl	%edx, %edx
	jle	.L20		/* Bail if there's nothing to do. */

	/* %eax = bit 2 of the negated source address.  For a source that is
	 * 4-byte aligned this is 1 exactly when the source is 4 bytes short
	 * of an 8-byte boundary; in that case one pixel is converted on its
	 * own so the movq loads in the loop start on an 8-byte boundary.
	 */
	movl	%ebx, %eax

	negl	%eax
	sarl	$2, %eax
	andl	$1, %eax
	je	.L17

	subl	%eax, %edx
	DO_ONE_PIXEL()
.L17:

	/* Would it be faster to unroll this loop once and process 4 pixels
	 * per pass, instead of just two?
	 */

	movl	%edx, %eax
	shrl	%eax		/* %eax = number of pixel pairs; sets ZF if zero */
	jmp	.L18
.L19:
	movq	(%ebx), %mm0
	addl	$8, %ebx

	/* These 9 instructions do what PSHUFB (if there were such an
	 * instruction) could do in 1. :(
	 *
	 * For each of the two pixels: byte 2 moves to byte 0 (%mm3),
	 * byte 0 moves to byte 2 (%mm4), bytes 1 and 3 stay put (%mm0).
	 */

	movq	%mm0, %mm3
	movq	%mm0, %mm4

	pand	%mm2, %mm3
	psllq	$16, %mm4
	psrlq	$16, %mm3
	pand	%mm2, %mm4

	pand	%mm1, %mm0
	por	%mm4, %mm3
	por	%mm3, %mm0

	movq	%mm0, (%ecx)
	addl	$8, %ecx
	subl	$1, %eax
.L18:
	jne	.L19		/* flags come from shrl on entry, subl afterwards */

	/* At this point there are either 1 or 0 pixels remaining to be
	 * converted.  Convert the last pixel, if needed.
	 */

	testl	$1, %edx
	je	.L20

	DO_ONE_LAST_PIXEL()

.L20:
	/* Common exit: reached from the bail-out above and from the normal
	 * path, so the MMX state is cleared on every return when
	 * USE_INNER_EMMS is defined.
	 */
#ifdef USE_INNER_EMMS
	emms
#endif
	popl	%ebx
	ret
	.size	_generic_read_RGBA_span_BGRA8888_REV_MMX, .-_generic_read_RGBA_span_BGRA8888_REV_MMX


/**
 * SSE optimized version of the BGRA8888_REV to RGBA copy routine.  SSE
 * instructions are only actually used to read data from the framebuffer.
 * In practice, the speed-up is pretty small.
 *
 * \todo
 * Do some more testing and determine if there's any reason to have this
 * function in addition to the MMX version.
 *
 * \warning
 * This function assumes that the caller will issue the EMMS instruction
 * at the correct places.
*/

/* Stack arguments, as offsets from %esp after the three pushl below:
 *   16(%esp)  source pointer
 *   20(%esp)  destination pointer
 *   24(%esp)  number of pixels to copy
 *
 * Register use:
 *   %ebx  source cursor          %ecx  destination cursor
 *   %edx  pixels left after the alignment prefix
 *   %esi  pixels handled by the alignment prefix
 *   %ebp  copy of %esp taken before the 16-byte aligned scratch slot is
 *         carved out; used to restore %esp after the main loop
 *   %eax  scratch and loop counter
 *   %mm1, %mm2  byte masks loaded by LOAD_MASK; %mm0, %mm3-%mm7 scratch
 *
 * When USE_INNER_EMMS is defined, EMMS is issued on entry and again at the
 * single exit label .L35.  The 2-pixel tail after the main loop still uses
 * MMX registers, and LOAD_MASK writes MMX registers before the pixel count
 * is checked, so the closing EMMS must come after the tail and must also
 * be reached by the early bail-out path.
 */

.globl _generic_read_RGBA_span_BGRA8888_REV_SSE
#ifndef USE_DRICORE
.hidden _generic_read_RGBA_span_BGRA8888_REV_SSE
#endif
	.type	_generic_read_RGBA_span_BGRA8888_REV_SSE, @function
_generic_read_RGBA_span_BGRA8888_REV_SSE:
	pushl	%esi
	pushl	%ebx
	pushl	%ebp

#ifdef USE_INNER_EMMS
	emms
#endif

	LOAD_MASK(movq,%mm1,%mm2)

	movl	16(%esp), %ebx	/* source pointer */
	movl	24(%esp), %edx	/* number of pixels to copy */
	movl	20(%esp), %ecx	/* destination pointer */

	testl	%edx, %edx
	jle	.L35		/* Bail if there's nothing to do. */

	/* Reserve a 16-byte aligned scratch slot at (%esp) for the movaps
	 * store in the main loop; %ebp remembers the original %esp.
	 */
	movl	%esp, %ebp
	subl	$16, %esp
	andl	$0xfffffff0, %esp

	movl	%ebx, %eax
	movl	%edx, %esi

	/* %eax = ((-source) & 15) >> 2, i.e. whole pixels up to the next
	 * 16-byte boundary.  %esi = that value if it does not exceed the
	 * pixel count, otherwise the pixel count itself.
	 */
	negl	%eax
	andl	$15, %eax
	sarl	$2, %eax
	cmpl	%edx, %eax
	cmovle	%eax, %esi

	subl	%esi, %edx

	testl	$1, %esi
	je	.L32

	DO_ONE_PIXEL()
.L32:

	testl	$2, %esi
	je	.L31

	/* Two prefix pixels, converted with MMX. */
	movq	(%ebx), %mm0
	addl	$8, %ebx

	movq	%mm0, %mm3
	movq	%mm0, %mm4

	pand	%mm2, %mm3
	psllq	$16, %mm4
	psrlq	$16, %mm3
	pand	%mm2, %mm4

	pand	%mm1, %mm0
	por	%mm4, %mm3
	por	%mm3, %mm0

	movq	%mm0, (%ecx)
	addl	$8, %ecx
.L31:

	movl	%edx, %eax
	shrl	$2, %eax	/* %eax = groups of 4 pixels; sets ZF if zero */
	jmp	.L33
.L34:
	movaps	(%ebx), %xmm0
	addl	$16, %ebx

	/* This would be so much better if we could just move directly from
	 * an SSE register to an MMX register.  Unfortunately, that
	 * functionality wasn't introduced until SSE2 with the MOVDQ2Q
	 * instruction.
	 */

	movaps	%xmm0, (%esp)
	movq	(%esp), %mm0
	movq	8(%esp), %mm5

	/* Same byte shuffle as the 2-pixel case, done on two MMX
	 * registers (%mm0 and %mm5) in interleaved order.
	 */
	movq	%mm0, %mm3
	movq	%mm0, %mm4
	movq	%mm5, %mm6
	movq	%mm5, %mm7

	pand	%mm2, %mm3
	pand	%mm2, %mm6

	psllq	$16, %mm4
	psllq	$16, %mm7

	psrlq	$16, %mm3
	psrlq	$16, %mm6

	pand	%mm2, %mm4
	pand	%mm2, %mm7

	pand	%mm1, %mm0
	pand	%mm1, %mm5

	por	%mm4, %mm3
	por	%mm7, %mm6

	por	%mm3, %mm0
	por	%mm6, %mm5

	movq	%mm0, (%ecx)
	movq	%mm5, 8(%ecx)
	addl	$16, %ecx

	subl	$1, %eax
.L33:
	jne	.L34		/* flags come from shrl on entry, subl afterwards */

	movl	%ebp, %esp	/* drop the aligned scratch slot */

	/* At this point there are either [0, 3] pixels remaining to be
	 * converted.
	 */

	testl	$2, %edx
	je	.L36

	movq	(%ebx), %mm0
	addl	$8, %ebx

	movq	%mm0, %mm3
	movq	%mm0, %mm4

	pand	%mm2, %mm3
	psllq	$16, %mm4
	psrlq	$16, %mm3
	pand	%mm2, %mm4

	pand	%mm1, %mm0
	por	%mm4, %mm3
	por	%mm3, %mm0

	movq	%mm0, (%ecx)
	addl	$8, %ecx
.L36:

	testl	$1, %edx
	je	.L35

	DO_ONE_LAST_PIXEL()
.L35:
	/* Common exit: reached from the bail-out above and from the normal
	 * path after the last MMX instruction, so the MMX state is cleared
	 * on every return when USE_INNER_EMMS is defined.
	 */
#ifdef USE_INNER_EMMS
	emms
#endif
	popl	%ebp
	popl	%ebx
	popl	%esi
	ret
	.size	_generic_read_RGBA_span_BGRA8888_REV_SSE, .-_generic_read_RGBA_span_BGRA8888_REV_SSE


/**
 * SSE2 optimized version of the BGRA8888_REV to RGBA copy routine.
 */

/* Stack arguments, as offsets from %esp after the two pushl below:
 *   12(%esp)  source pointer
 *   16(%esp)  destination pointer
 *   20(%esp)  number of pixels to copy
 *
 * Register use:
 *   %ebx  source cursor          %ecx  destination cursor
 *   %edx  pixels left after the alignment prefix
 *   %esi  pixels handled by the alignment prefix
 *   %eax  scratch and loop counter
 *   %xmm1, %xmm2  byte masks loaded by LOAD_MASK; %xmm0, %xmm3, %xmm4 scratch
 *
 * No MMX registers are used here, so no EMMS is involved.
 */

	.text
.globl _generic_read_RGBA_span_BGRA8888_REV_SSE2
#ifndef USE_DRICORE
.hidden _generic_read_RGBA_span_BGRA8888_REV_SSE2
#endif
	.type	_generic_read_RGBA_span_BGRA8888_REV_SSE2, @function
_generic_read_RGBA_span_BGRA8888_REV_SSE2:
	pushl	%esi
	pushl	%ebx

	LOAD_MASK(movdqu,%xmm1,%xmm2)

	movl	12(%esp), %ebx	/* source pointer */
	movl	20(%esp), %edx	/* number of pixels to copy */
	movl	16(%esp), %ecx	/* destination pointer */

	movl	%ebx, %eax
	movl	%edx, %esi

	testl	%edx, %edx
	jle	.L46		/* Bail if there's nothing to do. */

	/* If the source pointer isn't a multiple of 16 we have to process
	 * a few pixels the "slow" way to get the address aligned for
	 * the SSE fetch instructions.
	 */

	negl	%eax
	andl	$15, %eax
	sarl	$2, %eax	/* %eax = whole pixels up to the next 16-byte boundary */

	cmpl	%edx, %eax
	cmovbe	%eax, %esi	/* %esi = min(%eax, pixel count) */
	subl	%esi, %edx

	testl	$1, %esi
	je	.L41

	DO_ONE_PIXEL()
.L41:
	testl	$2, %esi
	je	.L40

	/* Two prefix pixels: 8-byte load and store through %xmm0. */
	movq	(%ebx), %xmm0
	addl	$8, %ebx

	movdqa	%xmm0, %xmm3
	movdqa	%xmm0, %xmm4
	andps	%xmm1, %xmm0

	andps	%xmm2, %xmm3
	pslldq	$2, %xmm4
	psrldq	$2, %xmm3
	andps	%xmm2, %xmm4

	orps	%xmm4, %xmm3
	orps	%xmm3, %xmm0

	movq	%xmm0, (%ecx)
	addl	$8, %ecx
.L40:

	/* Would it be worth having a specialized version of this loop for
	 * the case where the destination is 16-byte aligned?  That version
	 * would be identical except that it could use movdqa instead of
	 * movdqu.
	 */

	movl	%edx, %eax
	shrl	$2, %eax	/* %eax = groups of 4 pixels; sets ZF if zero */
	jmp	.L42
.L43:
	movdqa	(%ebx), %xmm0
	addl	$16, %ebx

	movdqa	%xmm0, %xmm3
	movdqa	%xmm0, %xmm4
	andps	%xmm1, %xmm0

	andps	%xmm2, %xmm3
	pslldq	$2, %xmm4
	psrldq	$2, %xmm3
	andps	%xmm2, %xmm4

	orps	%xmm4, %xmm3
	orps	%xmm3, %xmm0

	movdqu	%xmm0, (%ecx)
	addl	$16, %ecx
	subl	$1, %eax
.L42:
	jne	.L43		/* flags come from shrl on entry, subl afterwards */


	/* There may be up to 3 pixels remaining to be copied.  Take care
	 * of them now.  We do the 2 pixel case first because the data
	 * will be aligned.
	 */

	testl	$2, %edx
	je	.L47

	movq	(%ebx), %xmm0
	addl	$8, %ebx

	movdqa	%xmm0, %xmm3
	movdqa	%xmm0, %xmm4
	andps	%xmm1, %xmm0

	andps	%xmm2, %xmm3
	pslldq	$2, %xmm4
	psrldq	$2, %xmm3
	andps	%xmm2, %xmm4

	orps	%xmm4, %xmm3
	orps	%xmm3, %xmm0

	movq	%xmm0, (%ecx)
	addl	$8, %ecx
.L47:

	testl	$1, %edx
	je	.L46

	DO_ONE_LAST_PIXEL()
.L46:

	popl	%ebx
	popl	%esi
	ret
	.size	_generic_read_RGBA_span_BGRA8888_REV_SSE2, .-_generic_read_RGBA_span_BGRA8888_REV_SSE2



#define MASK_565_L	0x07e0f800
#define MASK_565_H	0x0000001f
/* Setting SCALE_ADJUST to 5 gives a perfect match with the
 * classic C implementation in Mesa.  Setting SCALE_ADJUST
 * to 0 is slightly faster but at a small cost to accuracy.
479d540e8e9dfc18063f98a31cb3d078d183cf8fce6Benno Schulenberg */ 4807d39c1ae76cc7dc6793980fd83db100399ee9179Brian#define SCALE_ADJUST 5 4817d39c1ae76cc7dc6793980fd83db100399ee9179Brian#if SCALE_ADJUST == 5 4827d39c1ae76cc7dc6793980fd83db100399ee9179Brian#define PRESCALE_L 0x00100001 4837d39c1ae76cc7dc6793980fd83db100399ee9179Brian#define PRESCALE_H 0x00000200 4847d39c1ae76cc7dc6793980fd83db100399ee9179Brian#define SCALE_L 0x40C620E8 4857d39c1ae76cc7dc6793980fd83db100399ee9179Brian#define SCALE_H 0x0000839d 4867d39c1ae76cc7dc6793980fd83db100399ee9179Brian#elif SCALE_ADJUST == 0 4877d39c1ae76cc7dc6793980fd83db100399ee9179Brian#define PRESCALE_L 0x00200001 4887d39c1ae76cc7dc6793980fd83db100399ee9179Brian#define PRESCALE_H 0x00000800 4897d39c1ae76cc7dc6793980fd83db100399ee9179Brian#define SCALE_L 0x01040108 4907d39c1ae76cc7dc6793980fd83db100399ee9179Brian#define SCALE_H 0x00000108 4917d39c1ae76cc7dc6793980fd83db100399ee9179Brian#else 4927d39c1ae76cc7dc6793980fd83db100399ee9179Brian#error SCALE_ADJUST must either be 5 or 0. 4937d39c1ae76cc7dc6793980fd83db100399ee9179Brian#endif 4947d39c1ae76cc7dc6793980fd83db100399ee9179Brian#define ALPHA_L 0x00000000 4957d39c1ae76cc7dc6793980fd83db100399ee9179Brian#define ALPHA_H 0x00ff0000 496bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick 497bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick/** 498bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick * MMX optimized version of the RGB565 to RGBA copy routine. 
499bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick */ 500bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick 501bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick .text 502bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick .globl _generic_read_RGBA_span_RGB565_MMX 503d1e28b22673777fe1290cda899abf73aad02e4aaChristopher James Halse Rogers#ifndef USE_DRICORE 504932dee87e3002be87dc3bcc49efd8ac9ac3e1fa4Brian Paul .hidden _generic_read_RGBA_span_RGB565_MMX 505d1e28b22673777fe1290cda899abf73aad02e4aaChristopher James Halse Rogers#endif 506bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick .type _generic_read_RGBA_span_RGB565_MMX, @function 507bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick 508bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick_generic_read_RGBA_span_RGB565_MMX: 509bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick 510bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick#ifdef USE_INNER_EMMS 511bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick emms 512bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick#endif 513bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick 514bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick movl 4(%esp), %eax /* source pointer */ 515bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick movl 8(%esp), %edx /* destination pointer */ 516bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick movl 12(%esp), %ecx /* number of pixels to copy */ 517bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick 518a49e0726e3c3cfbf0d784ba4e9a485af02a7b4ddBrian pushl $MASK_565_H 519a49e0726e3c3cfbf0d784ba4e9a485af02a7b4ddBrian pushl $MASK_565_L 5207d39c1ae76cc7dc6793980fd83db100399ee9179Brian movq (%esp), %mm5 521a49e0726e3c3cfbf0d784ba4e9a485af02a7b4ddBrian pushl $PRESCALE_H 522a49e0726e3c3cfbf0d784ba4e9a485af02a7b4ddBrian pushl $PRESCALE_L 5237d39c1ae76cc7dc6793980fd83db100399ee9179Brian movq (%esp), %mm6 524a49e0726e3c3cfbf0d784ba4e9a485af02a7b4ddBrian pushl $SCALE_H 525a49e0726e3c3cfbf0d784ba4e9a485af02a7b4ddBrian pushl 
$SCALE_L 5267d39c1ae76cc7dc6793980fd83db100399ee9179Brian movq (%esp), %mm7 527a49e0726e3c3cfbf0d784ba4e9a485af02a7b4ddBrian pushl $ALPHA_H 528a49e0726e3c3cfbf0d784ba4e9a485af02a7b4ddBrian pushl $ALPHA_L 5297d39c1ae76cc7dc6793980fd83db100399ee9179Brian movq (%esp), %mm3 5307d39c1ae76cc7dc6793980fd83db100399ee9179Brian addl $32,%esp 531bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick 5324d652b7855d1f852fd81fcffe7dabf9c685c9a0bRoland Scheidegger sarl $2, %ecx 533d63c29ef20b26aa90fb310216011d03253e4f09bEric Anholt jl .L01 /* Bail early if the count is negative. */ 534bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick jmp .L02 535bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick 536bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick.L03: 537bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick /* Fetch 4 RGB565 pixels into %mm4. Distribute the first and 538bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick * second pixels into the four words of %mm0 and %mm2. 539bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick */ 540bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick 541bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick movq (%eax), %mm4 542bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick addl $8, %eax 543bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick 544bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick pshufw $0x00, %mm4, %mm0 545bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick pshufw $0x55, %mm4, %mm2 546bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick 547bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick 548bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick /* Mask the pixels so that each word of each register contains only 549bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick * one color component. 
550bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick */ 551bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick 552bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick pand %mm5, %mm0 553bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick pand %mm5, %mm2 554bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick 555bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick 556bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick /* Adjust the component values so that they are as small as possible, 557bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick * but large enough so that we can multiply them by an unsigned 16-bit 558bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick * number and get a value as large as 0x00ff0000. 559bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick */ 560bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick 561bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick pmullw %mm6, %mm0 562bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick pmullw %mm6, %mm2 563bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick#if SCALE_ADJUST > 0 564bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick psrlw $SCALE_ADJUST, %mm0 565bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick psrlw $SCALE_ADJUST, %mm2 566bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick#endif 567bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick 568bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick /* Scale the input component values to be on the range 569bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick * [0, 0x00ff0000]. This it the real magic of the whole routine. 
570bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick */ 571bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick 572bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick pmulhuw %mm7, %mm0 573bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick pmulhuw %mm7, %mm2 574bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick 575bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick 576bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick /* Always set the alpha value to 0xff. 577bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick */ 578bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick 5797d39c1ae76cc7dc6793980fd83db100399ee9179Brian por %mm3, %mm0 5807d39c1ae76cc7dc6793980fd83db100399ee9179Brian por %mm3, %mm2 581bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick 582bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick 583bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick /* Pack the 16-bit values to 8-bit values and store the converted 584bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick * pixel data. 
585bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick */ 586bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick 587bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick packuswb %mm2, %mm0 588bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick movq %mm0, (%edx) 589bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick addl $8, %edx 590bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick 591bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick pshufw $0xaa, %mm4, %mm0 592bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick pshufw $0xff, %mm4, %mm2 593bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick 594bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick pand %mm5, %mm0 595bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick pand %mm5, %mm2 596bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick pmullw %mm6, %mm0 597bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick pmullw %mm6, %mm2 598bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick#if SCALE_ADJUST > 0 599bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick psrlw $SCALE_ADJUST, %mm0 600bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick psrlw $SCALE_ADJUST, %mm2 601bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick#endif 602bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick pmulhuw %mm7, %mm0 603bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick pmulhuw %mm7, %mm2 604bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick 6057d39c1ae76cc7dc6793980fd83db100399ee9179Brian por %mm3, %mm0 6067d39c1ae76cc7dc6793980fd83db100399ee9179Brian por %mm3, %mm2 607bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick 608bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick packuswb %mm2, %mm0 609bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick 610bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick movq %mm0, (%edx) 611bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick addl $8, %edx 612bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick 613bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick subl $1, %ecx 
614bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick.L02: 615bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick jne .L03 616bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick 617bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick 618bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick /* At this point there can be at most 3 pixels left to process. If 619bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick * there is either 2 or 3 left, process 2. 620bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick */ 621bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick 622bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick movl 12(%esp), %ecx 623bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick testl $0x02, %ecx 624bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick je .L04 625bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick 626bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick movd (%eax), %mm4 627bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick addl $4, %eax 628bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick 629bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick pshufw $0x00, %mm4, %mm0 630bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick pshufw $0x55, %mm4, %mm2 631bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick 632bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick pand %mm5, %mm0 633bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick pand %mm5, %mm2 634bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick pmullw %mm6, %mm0 635bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick pmullw %mm6, %mm2 636bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick#if SCALE_ADJUST > 0 637bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick psrlw $SCALE_ADJUST, %mm0 638bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick psrlw $SCALE_ADJUST, %mm2 639bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick#endif 640bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick pmulhuw %mm7, %mm0 641bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick pmulhuw 
%mm7, %mm2 642bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick 6437d39c1ae76cc7dc6793980fd83db100399ee9179Brian por %mm3, %mm0 6447d39c1ae76cc7dc6793980fd83db100399ee9179Brian por %mm3, %mm2 645bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick 646bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick packuswb %mm2, %mm0 647bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick 648bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick movq %mm0, (%edx) 649bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick addl $8, %edx 650bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick 651bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick.L04: 652bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick /* At this point there can be at most 1 pixel left to process. 653bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick * Process it if needed. 654bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick */ 655bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick 656bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick testl $0x01, %ecx 657bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick je .L01 658bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick 659811ee32a9ef177bec46c82692eeac8bc7297753cDimitry Andric movzwl (%eax), %ecx 660bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick movd %ecx, %mm4 661bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick 662bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick pshufw $0x00, %mm4, %mm0 663bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick 664bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick pand %mm5, %mm0 665bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick pmullw %mm6, %mm0 666bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick#if SCALE_ADJUST > 0 667bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick psrlw $SCALE_ADJUST, %mm0 668bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick#endif 669bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick pmulhuw %mm7, %mm0 670bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick 
6717d39c1ae76cc7dc6793980fd83db100399ee9179Brian por %mm3, %mm0 672bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick 673bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick packuswb %mm0, %mm0 674bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick 675bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick movd %mm0, (%edx) 676bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick 677bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick.L01: 678bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick#ifdef USE_INNER_EMMS 679bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick emms 680bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick#endif 681bdd53efe8302e85fd1be4ceda0aa576e0119b14eIan Romanick ret 682e02dc139520fab9f7189e0ae390f72ed674bb7d7Vinson Lee#endif /* !defined(__DJGPP__) && !defined(__MINGW32__) && !defined(__APPLE__) */ 683fcdc6a7d2488defd66bc7e8398c6d8c9a6190a1aKristian Høgsberg 684fcdc6a7d2488defd66bc7e8398c6d8c9a6190a1aKristian Høgsberg#if defined (__ELF__) && defined (__linux__) 685fcdc6a7d2488defd66bc7e8398c6d8c9a6190a1aKristian Høgsberg .section .note.GNU-stack,"",%progbits 686fcdc6a7d2488defd66bc7e8398c6d8c9a6190a1aKristian Høgsberg#endif 687