19682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall/*
29682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    SDL - Simple DirectMedia Layer
39682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    Copyright (C) 1997-2012 Sam Lantinga
49682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
59682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    This library is free software; you can redistribute it and/or
69682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    modify it under the terms of the GNU Lesser General Public
79682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    License as published by the Free Software Foundation; either
89682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    version 2.1 of the License, or (at your option) any later version.
99682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
109682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    This library is distributed in the hope that it will be useful,
119682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    but WITHOUT ANY WARRANTY; without even the implied warranty of
129682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
139682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    Lesser General Public License for more details.
149682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
159682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    You should have received a copy of the GNU Lesser General Public
169682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    License along with this library; if not, write to the Free Software
179682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
189682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
199682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    Sam Lantinga
209682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    slouken@libsdl.org
219682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall*/
229682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall#include "SDL_config.h"
239682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
249682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall#include "SDL_video.h"
259682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall#include "SDL_blit.h"
269682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
/*
 * Assembly back-end selection.
 *
 * When SDL_ASSEMBLY_ROUTINES is enabled this section decides which (if any)
 * hand-written MMX blitters are compiled:
 *
 *   - GCC on x86 / x86-64: MMX_ASMBLIT and GCC_ASMBLIT are both forced to 0.
 *   - MSVC on 32-bit x86 with mmintrin.h available: MMX_ASMBLIT and
 *     MSVC_ASMBLIT are both defined to 1.
 *   - Anything else: none of the macros are defined, so they evaluate to 0
 *     when tested with #if.
 *
 * In Visual C, VC6 only has mmintrin.h in the "Processor Pack" add-on.
 * Checking whether _mm_free is #defined in malloc.h is the only way to
 * determine if the Processor Pack is installed, as far as I can tell.
 */

#if SDL_ASSEMBLY_ROUTINES
#  if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
     /* forced MMX to 0...it breaks on most compilers now.  --ryan. */
#    define MMX_ASMBLIT 0
#    define GCC_ASMBLIT 0
#  elif defined(_MSC_VER) && defined(_M_IX86)
#    if (_MSC_VER <= 1200)
#      include <malloc.h>
#      if defined(_mm_free)
#          define HAVE_MMINTRIN_H 1
#      endif
#    else  /* Visual Studio > VC6 always has mmintrin.h */
#      define HAVE_MMINTRIN_H 1
#    endif
#    if HAVE_MMINTRIN_H
#      define MMX_ASMBLIT 1
#      define MSVC_ASMBLIT 1
#    endif
#  endif
#endif /* SDL_ASSEMBLY_ROUTINES */
539682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
549682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall/* Function to check the CPU flags */
559682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall#include "SDL_cpuinfo.h"
569682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall#if GCC_ASMBLIT
579682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall#include "mmx.h"
589682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall#elif MSVC_ASMBLIT
599682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall#include <mmintrin.h>
609682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall#include <mm3dnow.h>
619682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall#endif
629682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
639682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall/* Functions to perform alpha blended blitting */
649682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
659682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall/* N->1 blending with per-surface alpha */
669682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hallstatic void BlitNto1SurfaceAlpha(SDL_BlitInfo *info)
679682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall{
689682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	int width = info->d_width;
699682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	int height = info->d_height;
709682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	Uint8 *src = info->s_pixels;
719682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	int srcskip = info->s_skip;
729682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	Uint8 *dst = info->d_pixels;
739682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	int dstskip = info->d_skip;
749682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	Uint8 *palmap = info->table;
759682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	SDL_PixelFormat *srcfmt = info->src;
769682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	SDL_PixelFormat *dstfmt = info->dst;
779682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	int srcbpp = srcfmt->BytesPerPixel;
789682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
799682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	const unsigned A = srcfmt->alpha;
809682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
819682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	while ( height-- ) {
829682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	    DUFFS_LOOP4(
839682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	    {
849682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		Uint32 Pixel;
859682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		unsigned sR;
869682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		unsigned sG;
879682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		unsigned sB;
889682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		unsigned dR;
899682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		unsigned dG;
909682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		unsigned dB;
919682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
929682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		dR = dstfmt->palette->colors[*dst].r;
939682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		dG = dstfmt->palette->colors[*dst].g;
949682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		dB = dstfmt->palette->colors[*dst].b;
959682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		ALPHA_BLEND(sR, sG, sB, A, dR, dG, dB);
969682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		dR &= 0xff;
979682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		dG &= 0xff;
989682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		dB &= 0xff;
999682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		/* Pack RGB into 8bit pixel */
1009682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		if ( palmap == NULL ) {
1019682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		    *dst =((dR>>5)<<(3+2))|
1029682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			  ((dG>>5)<<(2))|
1039682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			  ((dB>>6)<<(0));
1049682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		} else {
1059682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		    *dst = palmap[((dR>>5)<<(3+2))|
1069682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				  ((dG>>5)<<(2))  |
1079682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				  ((dB>>6)<<(0))];
1089682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		}
1099682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		dst++;
1109682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		src += srcbpp;
1119682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	    },
1129682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	    width);
1139682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	    src += srcskip;
1149682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	    dst += dstskip;
1159682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	}
1169682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall}
1179682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
1189682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall/* N->1 blending with pixel alpha */
1199682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hallstatic void BlitNto1PixelAlpha(SDL_BlitInfo *info)
1209682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall{
1219682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	int width = info->d_width;
1229682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	int height = info->d_height;
1239682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	Uint8 *src = info->s_pixels;
1249682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	int srcskip = info->s_skip;
1259682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	Uint8 *dst = info->d_pixels;
1269682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	int dstskip = info->d_skip;
1279682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	Uint8 *palmap = info->table;
1289682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	SDL_PixelFormat *srcfmt = info->src;
1299682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	SDL_PixelFormat *dstfmt = info->dst;
1309682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	int srcbpp = srcfmt->BytesPerPixel;
1319682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
1329682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	/* FIXME: fix alpha bit field expansion here too? */
1339682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	while ( height-- ) {
1349682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	    DUFFS_LOOP4(
1359682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	    {
1369682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		Uint32 Pixel;
1379682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		unsigned sR;
1389682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		unsigned sG;
1399682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		unsigned sB;
1409682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		unsigned sA;
1419682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		unsigned dR;
1429682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		unsigned dG;
1439682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		unsigned dB;
1449682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		DISEMBLE_RGBA(src,srcbpp,srcfmt,Pixel,sR,sG,sB,sA);
1459682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		dR = dstfmt->palette->colors[*dst].r;
1469682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		dG = dstfmt->palette->colors[*dst].g;
1479682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		dB = dstfmt->palette->colors[*dst].b;
1489682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
1499682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		dR &= 0xff;
1509682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		dG &= 0xff;
1519682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		dB &= 0xff;
1529682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		/* Pack RGB into 8bit pixel */
1539682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		if ( palmap == NULL ) {
1549682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		    *dst =((dR>>5)<<(3+2))|
1559682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			  ((dG>>5)<<(2))|
1569682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			  ((dB>>6)<<(0));
1579682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		} else {
1589682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		    *dst = palmap[((dR>>5)<<(3+2))|
1599682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				  ((dG>>5)<<(2))  |
1609682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				  ((dB>>6)<<(0))  ];
1619682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		}
1629682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		dst++;
1639682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		src += srcbpp;
1649682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	    },
1659682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	    width);
1669682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	    src += srcskip;
1679682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	    dst += dstskip;
1689682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	}
1699682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall}
1709682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
1719682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall/* colorkeyed N->1 blending with per-surface alpha */
1729682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hallstatic void BlitNto1SurfaceAlphaKey(SDL_BlitInfo *info)
1739682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall{
1749682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	int width = info->d_width;
1759682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	int height = info->d_height;
1769682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	Uint8 *src = info->s_pixels;
1779682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	int srcskip = info->s_skip;
1789682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	Uint8 *dst = info->d_pixels;
1799682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	int dstskip = info->d_skip;
1809682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	Uint8 *palmap = info->table;
1819682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	SDL_PixelFormat *srcfmt = info->src;
1829682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	SDL_PixelFormat *dstfmt = info->dst;
1839682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	int srcbpp = srcfmt->BytesPerPixel;
1849682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	Uint32 ckey = srcfmt->colorkey;
1859682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
1869682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	const int A = srcfmt->alpha;
1879682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
1889682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	while ( height-- ) {
1899682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	    DUFFS_LOOP(
1909682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	    {
1919682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		Uint32 Pixel;
1929682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		unsigned sR;
1939682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		unsigned sG;
1949682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		unsigned sB;
1959682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		unsigned dR;
1969682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		unsigned dG;
1979682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		unsigned dB;
1989682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
1999682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		if ( Pixel != ckey ) {
2009682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		    dR = dstfmt->palette->colors[*dst].r;
2019682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		    dG = dstfmt->palette->colors[*dst].g;
2029682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		    dB = dstfmt->palette->colors[*dst].b;
2039682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		    ALPHA_BLEND(sR, sG, sB, A, dR, dG, dB);
2049682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		    dR &= 0xff;
2059682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		    dG &= 0xff;
2069682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		    dB &= 0xff;
2079682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		    /* Pack RGB into 8bit pixel */
2089682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		    if ( palmap == NULL ) {
2099682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			*dst =((dR>>5)<<(3+2))|
2109682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			      ((dG>>5)<<(2)) |
2119682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			      ((dB>>6)<<(0));
2129682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		    } else {
2139682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			*dst = palmap[((dR>>5)<<(3+2))|
2149682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				      ((dG>>5)<<(2))  |
2159682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				      ((dB>>6)<<(0))  ];
2169682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		    }
2179682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		}
2189682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		dst++;
2199682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		src += srcbpp;
2209682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	    },
2219682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	    width);
2229682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	    src += srcskip;
2239682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	    dst += dstskip;
2249682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	}
2259682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall}
2269682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
2279682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall#if GCC_ASMBLIT
2289682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall/* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
2299682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hallstatic void BlitRGBtoRGBSurfaceAlpha128MMX(SDL_BlitInfo *info)
2309682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall{
2319682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	int width = info->d_width;
2329682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	int height = info->d_height;
2339682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	Uint32 *srcp = (Uint32 *)info->s_pixels;
2349682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	int srcskip = info->s_skip >> 2;
2359682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	Uint32 *dstp = (Uint32 *)info->d_pixels;
2369682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	int dstskip = info->d_skip >> 2;
2379682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	Uint32 dalpha = info->dst->Amask;
2389682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	Uint64 load;
2399682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
2409682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	load = 0x00fefefe00fefefeULL;/* alpha128 mask */
2419682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	movq_m2r(load, mm4); /* alpha128 mask -> mm4 */
2429682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	load = 0x0001010100010101ULL;/* !alpha128 mask */
2439682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	movq_m2r(load, mm3); /* !alpha128 mask -> mm3 */
2449682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	movd_m2r(dalpha, mm7); /* dst alpha mask */
2459682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	punpckldq_r2r(mm7, mm7); /* dst alpha mask | dst alpha mask -> mm7 */
2469682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	while(height--) {
2479682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		DUFFS_LOOP_DOUBLE2(
2489682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		{
2499682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			Uint32 s = *srcp++;
2509682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			Uint32 d = *dstp;
2519682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			*dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
2529682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				   + (s & d & 0x00010101)) | dalpha;
2539682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		},{
2549682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			movq_m2r((*dstp), mm2);/* 2 x dst -> mm2(ARGBARGB) */
2559682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			movq_r2r(mm2, mm6); /* 2 x dst -> mm6(ARGBARGB) */
2569682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
2579682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			movq_m2r((*srcp), mm1);/* 2 x src -> mm1(ARGBARGB) */
2589682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			movq_r2r(mm1, mm5); /* 2 x src -> mm5(ARGBARGB) */
2599682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
2609682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			pand_r2r(mm4, mm6); /* dst & mask -> mm6 */
2619682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			pand_r2r(mm4, mm5); /* src & mask -> mm5 */
2629682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			paddd_r2r(mm6, mm5); /* mm6 + mm5 -> mm5 */
2639682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			pand_r2r(mm1, mm2); /* src & dst -> mm2 */
2649682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			psrld_i2r(1, mm5); /* mm5 >> 1 -> mm5 */
2659682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			pand_r2r(mm3, mm2); /* mm2 & !mask -> mm2 */
2669682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			paddd_r2r(mm5, mm2); /* mm5 + mm2 -> mm2 */
2679682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
2689682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			por_r2r(mm7, mm2); /* mm7(full alpha) | mm2 -> mm2 */
2699682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			movq_r2m(mm2, (*dstp));/* mm2 -> 2 x dst pixels */
2709682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			dstp += 2;
2719682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			srcp += 2;
2729682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		}, width);
2739682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		srcp += srcskip;
2749682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		dstp += dstskip;
2759682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	}
2769682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	emms();
2779682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall}
2789682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
2799682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall/* fast RGB888->(A)RGB888 blending with surface alpha */
2809682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hallstatic void BlitRGBtoRGBSurfaceAlphaMMX(SDL_BlitInfo *info)
2819682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall{
2829682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	SDL_PixelFormat* df = info->dst;
2839682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	unsigned alpha = info->src->alpha;
2849682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
2859682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	if (alpha == 128 && (df->Rmask | df->Gmask | df->Bmask) == 0x00FFFFFF) {
2869682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			/* only call a128 version when R,G,B occupy lower bits */
2879682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		BlitRGBtoRGBSurfaceAlpha128MMX(info);
2889682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	} else {
2899682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		int width = info->d_width;
2909682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		int height = info->d_height;
2919682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		Uint32 *srcp = (Uint32 *)info->s_pixels;
2929682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		int srcskip = info->s_skip >> 2;
2939682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		Uint32 *dstp = (Uint32 *)info->d_pixels;
2949682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		int dstskip = info->d_skip >> 2;
2959682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
2969682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		pxor_r2r(mm5, mm5); /* 0 -> mm5 */
2979682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		/* form the alpha mult */
2989682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		movd_m2r(alpha, mm4); /* 0000000A -> mm4 */
2999682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		punpcklwd_r2r(mm4, mm4); /* 00000A0A -> mm4 */
3009682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		punpckldq_r2r(mm4, mm4); /* 0A0A0A0A -> mm4 */
3019682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		alpha = (0xff << df->Rshift) | (0xff << df->Gshift) | (0xff << df->Bshift);
3029682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		movd_m2r(alpha, mm0); /* 00000FFF -> mm0 */
3039682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		punpcklbw_r2r(mm0, mm0); /* 00FFFFFF -> mm0 */
3049682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		pand_r2r(mm0, mm4); /* 0A0A0A0A -> mm4, minus 1 chan */
3059682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			/* at this point mm4 can be 000A0A0A or 0A0A0A00 or another combo */
3069682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		movd_m2r(df->Amask, mm7); /* dst alpha mask */
3079682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		punpckldq_r2r(mm7, mm7); /* dst alpha mask | dst alpha mask -> mm7 */
3089682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
3099682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		while(height--) {
3109682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			DUFFS_LOOP_DOUBLE2({
3119682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				/* One Pixel Blend */
3129682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/
3139682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/
3149682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				punpcklbw_r2r(mm5, mm1); /* 0A0R0G0B -> mm1(src) */
3159682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				punpcklbw_r2r(mm5, mm2); /* 0A0R0G0B -> mm2(dst) */
3169682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
3179682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				psubw_r2r(mm2, mm1);/* src - dst -> mm1 */
3189682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */
3199682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1 */
3209682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				paddb_r2r(mm1, mm2); /* mm1 + mm2(dst) -> mm2 */
3219682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
3229682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				packuswb_r2r(mm5, mm2);  /* ARGBARGB -> mm2 */
3239682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				por_r2r(mm7, mm2); /* mm7(full alpha) | mm2 -> mm2 */
3249682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				movd_r2m(mm2, *dstp);/* mm2 -> pixel */
3259682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				++srcp;
3269682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				++dstp;
3279682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			},{
3289682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				/* Two Pixels Blend */
3299682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				movq_m2r((*srcp), mm0);/* 2 x src -> mm0(ARGBARGB)*/
3309682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				movq_m2r((*dstp), mm2);/* 2 x dst -> mm2(ARGBARGB) */
3319682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				movq_r2r(mm0, mm1); /* 2 x src -> mm1(ARGBARGB) */
3329682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				movq_r2r(mm2, mm6); /* 2 x dst -> mm6(ARGBARGB) */
3339682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
3349682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				punpcklbw_r2r(mm5, mm0); /* low - 0A0R0G0B -> mm0(src1) */
3359682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				punpckhbw_r2r(mm5, mm1); /* high - 0A0R0G0B -> mm1(src2) */
3369682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				punpcklbw_r2r(mm5, mm2); /* low - 0A0R0G0B -> mm2(dst1) */
3379682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				punpckhbw_r2r(mm5, mm6); /* high - 0A0R0G0B -> mm6(dst2) */
3389682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
3399682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				psubw_r2r(mm2, mm0);/* src1 - dst1 -> mm0 */
3409682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				pmullw_r2r(mm4, mm0); /* mm0 * alpha -> mm0 */
3419682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				psrlw_i2r(8, mm0); /* mm0 >> 8 -> mm1 */
3429682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				paddb_r2r(mm0, mm2); /* mm0 + mm2(dst1) -> mm2 */
3439682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
3449682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				psubw_r2r(mm6, mm1);/* src2 - dst2 -> mm1 */
3459682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */
3469682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1 */
3479682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				paddb_r2r(mm1, mm6); /* mm1 + mm6(dst2) -> mm6 */
3489682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
3499682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				packuswb_r2r(mm6, mm2);  /* ARGBARGB -> mm2 */
3509682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				por_r2r(mm7, mm2); /* mm7(dst alpha) | mm2 -> mm2 */
3519682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
3529682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				movq_r2m(mm2, *dstp);/* mm2 -> 2 x pixel */
3539682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
3549682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall  				srcp += 2;
3559682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall  				dstp += 2;
3569682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall  			}, width);
3579682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			srcp += srcskip;
3589682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			dstp += dstskip;
3599682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		}
3609682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		emms();
3619682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	}
3629682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall}
3639682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
3649682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall/* fast ARGB888->(A)RGB888 blending with pixel alpha */
3659682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hallstatic void BlitRGBtoRGBPixelAlphaMMX(SDL_BlitInfo *info)
3669682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall{
3679682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	int width = info->d_width;
3689682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	int height = info->d_height;
3699682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	Uint32 *srcp = (Uint32 *)info->s_pixels;
3709682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	int srcskip = info->s_skip >> 2;
3719682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	Uint32 *dstp = (Uint32 *)info->d_pixels;
3729682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	int dstskip = info->d_skip >> 2;
3739682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	SDL_PixelFormat* sf = info->src;
3749682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	Uint32 amask = sf->Amask;
3759682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
3769682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	pxor_r2r(mm6, mm6); /* 0 -> mm6 */
3779682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	/* form multiplication mask */
3789682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	movd_m2r(sf->Amask, mm7); /* 0000F000 -> mm7 */
3799682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	punpcklbw_r2r(mm7, mm7); /* FF000000 -> mm7 */
3809682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	pcmpeqb_r2r(mm0, mm0); /* FFFFFFFF -> mm0 */
3819682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	movq_r2r(mm0, mm3); /* FFFFFFFF -> mm3 (for later) */
3829682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	pxor_r2r(mm0, mm7); /* 00FFFFFF -> mm7 (mult mask) */
3839682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	/* form channel masks */
3849682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	movq_r2r(mm7, mm0); /* 00FFFFFF -> mm0 */
3859682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	packsswb_r2r(mm6, mm0); /* 00000FFF -> mm0 (channel mask) */
3869682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	packsswb_r2r(mm6, mm3); /* 0000FFFF -> mm3 */
3879682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	pxor_r2r(mm0, mm3); /* 0000F000 -> mm3 (~channel mask) */
3889682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	/* get alpha channel shift */
3899682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	__asm__ __volatile__ (
3909682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		"movd %0, %%mm5"
3919682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		: : "rm" ((Uint32) sf->Ashift) ); /* Ashift -> mm5 */
3929682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
3939682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	while(height--) {
3949682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	    DUFFS_LOOP4({
3959682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		Uint32 alpha = *srcp & amask;
3969682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		/* FIXME: Here we special-case opaque alpha since the
3979682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			compositioning used (>>8 instead of /255) doesn't handle
3989682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			it correctly. Also special-case alpha=0 for speed?
3999682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			Benchmark this! */
4009682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		if(alpha == 0) {
4019682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			/* do nothing */
4029682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		} else if(alpha == amask) {
4039682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			/* opaque alpha -- copy RGB, keep dst alpha */
4049682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			/* using MMX here to free up regular registers for other things */
4059682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/
4069682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/
4079682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			pand_r2r(mm0, mm1); /* src & chanmask -> mm1 */
4089682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			pand_r2r(mm3, mm2); /* dst & ~chanmask -> mm2 */
4099682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			por_r2r(mm1, mm2); /* src | dst -> mm2 */
4109682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			movd_r2m(mm2, (*dstp)); /* mm2 -> dst */
4119682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		} else {
4129682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/
4139682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			punpcklbw_r2r(mm6, mm1); /* 0A0R0G0B -> mm1 */
4149682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
4159682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/
4169682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			punpcklbw_r2r(mm6, mm2); /* 0A0R0G0B -> mm2 */
4179682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
4189682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			__asm__ __volatile__ (
4199682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				"movd %0, %%mm4"
4209682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				: : "r" (alpha) ); /* 0000A000 -> mm4 */
4219682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			psrld_r2r(mm5, mm4); /* mm4 >> mm5 -> mm4 (0000000A) */
4229682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			punpcklwd_r2r(mm4, mm4); /* 00000A0A -> mm4 */
4239682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			punpcklwd_r2r(mm4, mm4); /* 0A0A0A0A -> mm4 */
4249682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			pand_r2r(mm7, mm4); /* 000A0A0A -> mm4, preserve dst alpha on add */
4259682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
4269682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			/* blend */
4279682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			psubw_r2r(mm2, mm1);/* src - dst -> mm1 */
4289682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */
4299682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1(000R0G0B) */
4309682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			paddb_r2r(mm1, mm2); /* mm1 + mm2(dst) -> mm2 */
4319682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
4329682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			packuswb_r2r(mm6, mm2);  /* 0000ARGB -> mm2 */
4339682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			movd_r2m(mm2, *dstp);/* mm2 -> dst */
4349682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		}
4359682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		++srcp;
4369682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		++dstp;
4379682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	    }, width);
4389682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	    srcp += srcskip;
4399682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	    dstp += dstskip;
4409682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	}
4419682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	emms();
4429682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall}
4439682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall/* End GCC_ASMBLIT */
4449682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
4459682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall#elif MSVC_ASMBLIT
4469682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall/* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
4479682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hallstatic void BlitRGBtoRGBSurfaceAlpha128MMX(SDL_BlitInfo *info)
4489682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall{
4499682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	int width = info->d_width;
4509682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	int height = info->d_height;
4519682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	Uint32 *srcp = (Uint32 *)info->s_pixels;
4529682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	int srcskip = info->s_skip >> 2;
4539682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	Uint32 *dstp = (Uint32 *)info->d_pixels;
4549682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	int dstskip = info->d_skip >> 2;
4559682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	Uint32 dalpha = info->dst->Amask;
4569682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
4579682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	__m64 src1, src2, dst1, dst2, lmask, hmask, dsta;
4589682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
4599682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	hmask = _mm_set_pi32(0x00fefefe, 0x00fefefe); /* alpha128 mask -> hmask */
4609682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	lmask = _mm_set_pi32(0x00010101, 0x00010101); /* !alpha128 mask -> lmask */
4619682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	dsta = _mm_set_pi32(dalpha, dalpha); /* dst alpha mask -> dsta */
4629682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
4639682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	while (height--) {
4649682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		int n = width;
4659682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		if ( n & 1 ) {
4669682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			Uint32 s = *srcp++;
4679682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			Uint32 d = *dstp;
4689682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			*dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
4699682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				   + (s & d & 0x00010101)) | dalpha;
4709682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			n--;
4719682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		}
4729682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
4739682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		for (n >>= 1; n > 0; --n) {
4749682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			dst1 = *(__m64*)dstp; /* 2 x dst -> dst1(ARGBARGB) */
4759682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			dst2 = dst1;   /* 2 x dst -> dst2(ARGBARGB) */
4769682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
4779682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			src1 = *(__m64*)srcp; /* 2 x src -> src1(ARGBARGB) */
4789682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			src2 = src1; /* 2 x src -> src2(ARGBARGB) */
4799682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
4809682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			dst2 = _mm_and_si64(dst2, hmask); /* dst & mask -> dst2 */
4819682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			src2 = _mm_and_si64(src2, hmask); /* src & mask -> src2 */
4829682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			src2 = _mm_add_pi32(src2, dst2); /* dst2 + src2 -> src2 */
4839682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			src2 = _mm_srli_pi32(src2, 1); /* src2 >> 1 -> src2 */
4849682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
4859682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			dst1 = _mm_and_si64(dst1, src1); /* src & dst -> dst1 */
4869682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			dst1 = _mm_and_si64(dst1, lmask); /* dst1 & !mask -> dst1 */
4879682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			dst1 = _mm_add_pi32(dst1, src2); /* src2 + dst1 -> dst1 */
4889682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			dst1 = _mm_or_si64(dst1, dsta); /* dsta(full alpha) | dst1 -> dst1 */
4899682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
4909682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			*(__m64*)dstp = dst1; /* dst1 -> 2 x dst pixels */
4919682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			dstp += 2;
4929682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			srcp += 2;
4939682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		}
4949682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
4959682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		srcp += srcskip;
4969682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		dstp += dstskip;
4979682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	}
4989682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	_mm_empty();
4999682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall}
5009682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
5019682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall/* fast RGB888->(A)RGB888 blending with surface alpha */
5029682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hallstatic void BlitRGBtoRGBSurfaceAlphaMMX(SDL_BlitInfo *info)
5039682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall{
5049682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	SDL_PixelFormat* df = info->dst;
5059682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	Uint32 chanmask = df->Rmask | df->Gmask | df->Bmask;
5069682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	unsigned alpha = info->src->alpha;
5079682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
5089682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	if (alpha == 128 && (df->Rmask | df->Gmask | df->Bmask) == 0x00FFFFFF) {
5099682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			/* only call a128 version when R,G,B occupy lower bits */
5109682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		BlitRGBtoRGBSurfaceAlpha128MMX(info);
5119682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	} else {
5129682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		int width = info->d_width;
5139682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		int height = info->d_height;
5149682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		Uint32 *srcp = (Uint32 *)info->s_pixels;
5159682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		int srcskip = info->s_skip >> 2;
5169682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		Uint32 *dstp = (Uint32 *)info->d_pixels;
5179682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		int dstskip = info->d_skip >> 2;
5189682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		Uint32 dalpha = df->Amask;
5199682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		Uint32 amult;
5209682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
5219682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		__m64 src1, src2, dst1, dst2, mm_alpha, mm_zero, dsta;
5229682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
5239682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		mm_zero = _mm_setzero_si64(); /* 0 -> mm_zero */
5249682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		/* form the alpha mult */
5259682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		amult = alpha | (alpha << 8);
5269682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		amult = amult | (amult << 16);
5279682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		chanmask = (0xff << df->Rshift) | (0xff << df->Gshift) | (0xff << df->Bshift);
5289682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		mm_alpha = _mm_set_pi32(0, amult & chanmask); /* 0000AAAA -> mm_alpha, minus 1 chan */
5299682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		mm_alpha = _mm_unpacklo_pi8(mm_alpha, mm_zero); /* 0A0A0A0A -> mm_alpha, minus 1 chan */
5309682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			/* at this point mm_alpha can be 000A0A0A or 0A0A0A00 or another combo */
5319682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		dsta = _mm_set_pi32(dalpha, dalpha); /* dst alpha mask -> dsta */
5329682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
5339682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		while (height--) {
5349682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			int n = width;
5359682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			if (n & 1) {
5369682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				/* One Pixel Blend */
5379682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				src2 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src2 (0000ARGB)*/
5389682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				src2 = _mm_unpacklo_pi8(src2, mm_zero); /* 0A0R0G0B -> src2 */
5399682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
5409682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB)*/
5419682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
5429682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
5439682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				src2 = _mm_sub_pi16(src2, dst1); /* src2 - dst2 -> src2 */
5449682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
5459682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				src2 = _mm_srli_pi16(src2, 8); /* src2 >> 8 -> src2 */
5469682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				dst1 = _mm_add_pi8(src2, dst1); /* src2 + dst1 -> dst1 */
5479682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
5489682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				dst1 = _mm_packs_pu16(dst1, mm_zero);  /* 0000ARGB -> dst1 */
5499682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				dst1 = _mm_or_si64(dst1, dsta); /* dsta | dst1 -> dst1 */
5509682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				*dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
5519682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
5529682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				++srcp;
5539682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				++dstp;
5549682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
5559682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				n--;
5569682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			}
5579682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
5589682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			for (n >>= 1; n > 0; --n) {
5599682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				/* Two Pixels Blend */
5609682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				src1 = *(__m64*)srcp; /* 2 x src -> src1(ARGBARGB)*/
5619682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				src2 = src1; /* 2 x src -> src2(ARGBARGB) */
5629682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				src1 = _mm_unpacklo_pi8(src1, mm_zero); /* low - 0A0R0G0B -> src1 */
5639682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				src2 = _mm_unpackhi_pi8(src2, mm_zero); /* high - 0A0R0G0B -> src2 */
5649682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
5659682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				dst1 = *(__m64*)dstp;/* 2 x dst -> dst1(ARGBARGB) */
5669682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				dst2 = dst1; /* 2 x dst -> dst2(ARGBARGB) */
5679682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* low - 0A0R0G0B -> dst1 */
5689682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				dst2 = _mm_unpackhi_pi8(dst2, mm_zero); /* high - 0A0R0G0B -> dst2 */
5699682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
5709682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				src1 = _mm_sub_pi16(src1, dst1);/* src1 - dst1 -> src1 */
5719682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				src1 = _mm_mullo_pi16(src1, mm_alpha); /* src1 * alpha -> src1 */
5729682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				src1 = _mm_srli_pi16(src1, 8); /* src1 >> 8 -> src1 */
5739682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1(dst1) -> dst1 */
5749682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
5759682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				src2 = _mm_sub_pi16(src2, dst2);/* src2 - dst2 -> src2 */
5769682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
5779682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				src2 = _mm_srli_pi16(src2, 8); /* src2 >> 8 -> src2 */
5789682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				dst2 = _mm_add_pi8(src2, dst2); /* src2 + dst2(dst2) -> dst2 */
5799682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
5809682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				dst1 = _mm_packs_pu16(dst1, dst2); /* 0A0R0G0B(res1), 0A0R0G0B(res2) -> dst1(ARGBARGB) */
5819682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				dst1 = _mm_or_si64(dst1, dsta); /* dsta | dst1 -> dst1 */
5829682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
5839682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				*(__m64*)dstp = dst1; /* dst1 -> 2 x pixel */
5849682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
5859682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				srcp += 2;
5869682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				dstp += 2;
5879682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			}
5889682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			srcp += srcskip;
5899682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			dstp += dstskip;
5909682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		}
5919682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		_mm_empty();
5929682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	}
5939682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall}
5949682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
5959682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall/* fast ARGB888->(A)RGB888 blending with pixel alpha */
5969682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hallstatic void BlitRGBtoRGBPixelAlphaMMX(SDL_BlitInfo *info)
5979682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall{
5989682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	int width = info->d_width;
5999682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	int height = info->d_height;
6009682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	Uint32 *srcp = (Uint32 *)info->s_pixels;
6019682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	int srcskip = info->s_skip >> 2;
6029682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	Uint32 *dstp = (Uint32 *)info->d_pixels;
6039682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	int dstskip = info->d_skip >> 2;
6049682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	SDL_PixelFormat* sf = info->src;
6059682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	Uint32 chanmask = sf->Rmask | sf->Gmask | sf->Bmask;
6069682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	Uint32 amask = sf->Amask;
6079682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	Uint32 ashift = sf->Ashift;
6089682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	Uint64 multmask;
6099682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
6109682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	__m64 src1, dst1, mm_alpha, mm_zero, dmask;
6119682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
6129682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	mm_zero = _mm_setzero_si64(); /* 0 -> mm_zero */
6139682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	multmask = ~(0xFFFFi64 << (ashift * 2));
6149682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	dmask = *(__m64*) &multmask; /* dst alpha mask -> dmask */
6159682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
6169682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	while(height--) {
6179682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		DUFFS_LOOP4({
6189682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		Uint32 alpha = *srcp & amask;
6199682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		if (alpha == 0) {
6209682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			/* do nothing */
6219682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		} else if (alpha == amask) {
6229682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			/* opaque alpha -- copy RGB, keep dst alpha */
6239682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			*dstp = (*srcp & chanmask) | (*dstp & ~chanmask);
6249682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		} else {
6259682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			src1 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src1 (0000ARGB)*/
6269682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			src1 = _mm_unpacklo_pi8(src1, mm_zero); /* 0A0R0G0B -> src1 */
6279682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
6289682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB)*/
6299682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
6309682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
6319682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			mm_alpha = _mm_cvtsi32_si64(alpha); /* alpha -> mm_alpha (0000000A) */
6329682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			mm_alpha = _mm_srli_si64(mm_alpha, ashift); /* mm_alpha >> ashift -> mm_alpha(0000000A) */
6339682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
6349682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
6359682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			mm_alpha = _mm_and_si64(mm_alpha, dmask); /* 000A0A0A -> mm_alpha, preserve dst alpha on add */
6369682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
6379682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			/* blend */
6389682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			src1 = _mm_sub_pi16(src1, dst1);/* src1 - dst1 -> src1 */
6399682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			src1 = _mm_mullo_pi16(src1, mm_alpha); /* (src1 - dst1) * alpha -> src1 */
6409682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			src1 = _mm_srli_pi16(src1, 8); /* src1 >> 8 -> src1(000R0G0B) */
6419682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1 -> dst1(0A0R0G0B) */
6429682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			dst1 = _mm_packs_pu16(dst1, mm_zero);  /* 0000ARGB -> dst1 */
6439682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
6449682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			*dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
6459682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		}
6469682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		++srcp;
6479682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		++dstp;
6489682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	    }, width);
6499682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	    srcp += srcskip;
6509682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	    dstp += dstskip;
6519682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	}
6529682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	_mm_empty();
6539682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall}
6549682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall/* End MSVC_ASMBLIT */
6559682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
6569682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall#endif /* GCC_ASMBLIT, MSVC_ASMBLIT */
6579682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
6589682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall#if SDL_ALTIVEC_BLITTERS
6599682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall#if __MWERKS__
6609682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall#pragma altivec_model on
6619682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall#endif
6629682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall#if HAVE_ALTIVEC_H
6639682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall#include <altivec.h>
6649682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall#endif
6659682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall#include <assert.h>
6669682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
/*
 * Vector literal helpers: build a `vector unsigned char` from 16 bytes or a
 * `vector unsigned short` from 8 halfwords.  Compilers matching
 * (__MACOSX__ && __GNUC__ < 4) get the parenthesized literal form; every
 * other compiler gets the brace form.
 */
#if !(defined(__MACOSX__) && (__GNUC__ < 4))
    #define VECUINT8_LITERAL(b0,b1,b2,b3,b4,b5,b6,b7,b8,b9,b10,b11,b12,b13,b14,b15) \
        (vector unsigned char) { b0,b1,b2,b3,b4,b5,b6,b7,b8,b9,b10,b11,b12,b13,b14,b15 }
    #define VECUINT16_LITERAL(h0,h1,h2,h3,h4,h5,h6,h7) \
        (vector unsigned short) { h0,h1,h2,h3,h4,h5,h6,h7 }
#else
    #define VECUINT8_LITERAL(b0,b1,b2,b3,b4,b5,b6,b7,b8,b9,b10,b11,b12,b13,b14,b15) \
        (vector unsigned char) ( b0,b1,b2,b3,b4,b5,b6,b7,b8,b9,b10,b11,b12,b13,b14,b15 )
    #define VECUINT16_LITERAL(h0,h1,h2,h3,h4,h5,h6,h7) \
        (vector unsigned short) ( h0,h1,h2,h3,h4,h5,h6,h7 )
#endif
6789682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
/* Low four bits of x converted to size_t: non-zero when x is not a multiple
   of 16.  The argument is parenthesized so that an expression such as
   `p + 1` is cast as a whole. */
#define UNALIGNED_PTR(x) (((size_t) (x)) & 0x0000000F)
/* Debug helper: print msg followed by the four 32-bit words of vector v. */
#define VECPRINT(msg, v) do { \
    vector unsigned int tmpvec = (vector unsigned int)(v); \
    unsigned int *vp = (unsigned int *)&tmpvec; \
    printf("%s = %08X %08X %08X %08X\n", (msg), vp[0], vp[1], vp[2], vp[3]); \
} while (0)

/* the permutation vector that takes the high bytes out of all the appropriate shorts
    (vector unsigned char)(
        0x00, 0x10, 0x02, 0x12,
        0x04, 0x14, 0x06, 0x16,
        0x08, 0x18, 0x0A, 0x1A,
        0x0C, 0x1C, 0x0E, 0x1E );
   It is built by adding a splatted 16-bit 0x000F to vec_lvsl(0, NULL).
*/
#define VEC_MERGE_PERMUTE() (vec_add(vec_lvsl(0, (int*)NULL), (vector unsigned char)vec_splat_u16(0x0F)))
/* 24 in every 32-bit element, formed as 12 + 12 */
#define VEC_U32_24() (vec_add(vec_splat_u32(12), vec_splat_u32(12)))
/* all-ones shifted left by 24 in every 32-bit element, viewed as bytes */
#define VEC_ALPHA_MASK() ((vector unsigned char)vec_sl((vector unsigned int)vec_splat_s8(-1), VEC_U32_24()))
/* Permute vector for src: vec_lvsl(0, src) when UNALIGNED_PTR(src) is
   non-zero, otherwise vec_lvsl(8, src) with 8 added to every byte.
   NOTE(review): presumably consumed by a vec_perm over two adjacent loads --
   confirm against the callers. */
#define VEC_ALIGNER(src) ((UNALIGNED_PTR(src)) \
    ? vec_lvsl(0, src) \
    : vec_add(vec_lvsl(8, src), vec_splat_u8(8)))
7009682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
/*
 * Blend vs over vd with the per-byte weights in valpha and store the result
 * back into vd:  vs * valpha + vd * ~valpha, computed as 16-bit products for
 * the even and the odd bytes separately, adjusted with
 * (x + 1) + ((x + 1) >> 8), and merged back to bytes through mergePermute.
 * v1_16 and v8_16 are used as the "+ 1" addend and the ">> 8" shift count --
 * presumably vectors of 16-bit 1s and 8s; confirm against the callers.
 */
#define VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1_16, v8_16) do { \
    /* source * alpha, even bytes (AAGGAAGGAAGGAAGG) */ \
    vector unsigned short vma_even = vec_mule(vs, valpha); \
    /* source * alpha, odd bytes (RRBBRRBBRRBBRRBB) */ \
    vector unsigned short vma_odd = vec_mulo(vs, valpha); \
    /* 255-alpha: every bit of valpha inverted */ \
    vector unsigned char vma_inv = vec_nor(valpha, valpha); \
    /* add dest * (255-alpha) for the even, then the odd bytes */ \
    vma_even = vec_add(vma_even, vec_mule(vd, vma_inv)); \
    vma_odd = vec_add(vma_odd, vec_mulo(vd, vma_inv)); \
    /* even = (even + 1) + ((even + 1) >> 8) */ \
    vma_even = vec_add(vma_even, v1_16); \
    vma_even = vec_add(vma_even, vec_sr(vma_even, v8_16)); \
    /* odd = (odd + 1) + ((odd + 1) >> 8) */ \
    vma_odd = vec_add(vma_odd, v1_16); \
    vma_odd = vec_add(vma_odd, vec_sr(vma_odd, v8_16)); \
    /* (>>8) and get ARGBARGBARGBARGB */ \
    vd = (vector unsigned char)vec_perm(vma_even, vma_odd, mergePermute); \
} while (0)
7269682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
7279682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall/* Calculate the permute vector used for 32->32 swizzling */
7289682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hallstatic vector unsigned char calc_swizzle32(const SDL_PixelFormat *srcfmt,
7299682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                                  const SDL_PixelFormat *dstfmt)
7309682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall{
7319682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    /*
7329682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall     * We have to assume that the bits that aren't used by other
7339682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall     *  colors is alpha, and it's one complete byte, since some formats
7349682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall     *  leave alpha with a zero mask, but we should still swizzle the bits.
7359682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall     */
7369682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    /* ARGB */
7379682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    const static struct SDL_PixelFormat default_pixel_format = {
7389682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall        NULL, 0, 0,
7399682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall        0, 0, 0, 0,
7409682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall        16, 8, 0, 24,
7419682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall        0x00FF0000, 0x0000FF00, 0x000000FF, 0xFF000000,
7429682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall        0, 0};
7439682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    if (!srcfmt) {
7449682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall        srcfmt = &default_pixel_format;
7459682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    }
7469682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    if (!dstfmt) {
7479682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall        dstfmt = &default_pixel_format;
7489682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    }
7499682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    const vector unsigned char plus = VECUINT8_LITERAL
7509682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                                            ( 0x00, 0x00, 0x00, 0x00,
7519682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                                              0x04, 0x04, 0x04, 0x04,
7529682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                                              0x08, 0x08, 0x08, 0x08,
7539682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                                              0x0C, 0x0C, 0x0C, 0x0C );
7549682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    vector unsigned char vswiz;
7559682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    vector unsigned int srcvec;
7569682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall#define RESHIFT(X) (3 - ((X) >> 3))
7579682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    Uint32 rmask = RESHIFT(srcfmt->Rshift) << (dstfmt->Rshift);
7589682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    Uint32 gmask = RESHIFT(srcfmt->Gshift) << (dstfmt->Gshift);
7599682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    Uint32 bmask = RESHIFT(srcfmt->Bshift) << (dstfmt->Bshift);
7609682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    Uint32 amask;
7619682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    /* Use zero for alpha if either surface doesn't have alpha */
7629682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    if (dstfmt->Amask) {
7639682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall        amask = ((srcfmt->Amask) ? RESHIFT(srcfmt->Ashift) : 0x10) << (dstfmt->Ashift);
7649682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    } else {
7659682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall        amask = 0x10101010 & ((dstfmt->Rmask | dstfmt->Gmask | dstfmt->Bmask) ^ 0xFFFFFFFF);
7669682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    }
7679682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall#undef RESHIFT
7689682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    ((unsigned int *)(char*)&srcvec)[0] = (rmask | gmask | bmask | amask);
7699682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    vswiz = vec_add(plus, (vector unsigned char)vec_splat(srcvec, 0));
7709682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    return(vswiz);
7719682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall}
7729682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
7739682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hallstatic void Blit32to565PixelAlphaAltivec(SDL_BlitInfo *info)
7749682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall{
7759682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    int height = info->d_height;
7769682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    Uint8 *src = (Uint8 *)info->s_pixels;
7779682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    int srcskip = info->s_skip;
7789682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    Uint8 *dst = (Uint8 *)info->d_pixels;
7799682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    int dstskip = info->d_skip;
7809682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    SDL_PixelFormat *srcfmt = info->src;
7819682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
7829682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    vector unsigned char v0 = vec_splat_u8(0);
7839682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    vector unsigned short v8_16 = vec_splat_u16(8);
7849682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    vector unsigned short v1_16 = vec_splat_u16(1);
7859682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    vector unsigned short v2_16 = vec_splat_u16(2);
7869682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    vector unsigned short v3_16 = vec_splat_u16(3);
7879682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    vector unsigned int v8_32 = vec_splat_u32(8);
7889682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    vector unsigned int v16_32 = vec_add(v8_32, v8_32);
7899682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    vector unsigned short v3f = VECUINT16_LITERAL(
7909682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall        0x003f, 0x003f, 0x003f, 0x003f,
7919682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall        0x003f, 0x003f, 0x003f, 0x003f);
7929682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    vector unsigned short vfc = VECUINT16_LITERAL(
7939682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall        0x00fc, 0x00fc, 0x00fc, 0x00fc,
7949682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall        0x00fc, 0x00fc, 0x00fc, 0x00fc);
7959682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
7969682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    /*
7979682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall        0x10 - 0x1f is the alpha
7989682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall        0x00 - 0x0e evens are the red
7999682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall        0x01 - 0x0f odds are zero
8009682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    */
8019682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    vector unsigned char vredalpha1 = VECUINT8_LITERAL(
8029682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall        0x10, 0x00, 0x01, 0x01,
8039682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall        0x10, 0x02, 0x01, 0x01,
8049682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall        0x10, 0x04, 0x01, 0x01,
8059682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall        0x10, 0x06, 0x01, 0x01
8069682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    );
8079682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    vector unsigned char vredalpha2 = (vector unsigned char)(
8089682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall        vec_add((vector unsigned int)vredalpha1, vec_sl(v8_32, v16_32))
8099682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    );
8109682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    /*
8119682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall        0x00 - 0x0f is ARxx ARxx ARxx ARxx
8129682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall        0x11 - 0x0f odds are blue
8139682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    */
8149682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    vector unsigned char vblue1 = VECUINT8_LITERAL(
8159682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall        0x00, 0x01, 0x02, 0x11,
8169682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall        0x04, 0x05, 0x06, 0x13,
8179682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall        0x08, 0x09, 0x0a, 0x15,
8189682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall        0x0c, 0x0d, 0x0e, 0x17
8199682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    );
8209682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    vector unsigned char vblue2 = (vector unsigned char)(
8219682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall        vec_add((vector unsigned int)vblue1, v8_32)
8229682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    );
8239682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    /*
8249682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall        0x00 - 0x0f is ARxB ARxB ARxB ARxB
8259682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall        0x10 - 0x0e evens are green
8269682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    */
8279682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    vector unsigned char vgreen1 = VECUINT8_LITERAL(
8289682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall        0x00, 0x01, 0x10, 0x03,
8299682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall        0x04, 0x05, 0x12, 0x07,
8309682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall        0x08, 0x09, 0x14, 0x0b,
8319682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall        0x0c, 0x0d, 0x16, 0x0f
8329682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    );
8339682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    vector unsigned char vgreen2 = (vector unsigned char)(
8349682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall        vec_add((vector unsigned int)vgreen1, vec_sl(v8_32, v8_32))
8359682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    );
8369682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    vector unsigned char vgmerge = VECUINT8_LITERAL(
8379682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall        0x00, 0x02, 0x00, 0x06,
8389682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall        0x00, 0x0a, 0x00, 0x0e,
8399682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall        0x00, 0x12, 0x00, 0x16,
8409682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall        0x00, 0x1a, 0x00, 0x1e);
8419682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    vector unsigned char mergePermute = VEC_MERGE_PERMUTE();
8429682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    vector unsigned char vpermute = calc_swizzle32(srcfmt, NULL);
8439682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    vector unsigned char valphaPermute = vec_and(vec_lvsl(0, (int *)NULL), vec_splat_u8(0xC));
8449682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
8459682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    vector unsigned short vf800 = (vector unsigned short)vec_splat_u8(-7);
8469682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    vf800 = vec_sl(vf800, vec_splat_u16(8));
8479682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
8489682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    while(height--) {
8499682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall        int extrawidth;
8509682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall        vector unsigned char valigner;
8519682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall        vector unsigned char vsrc;
8529682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall        vector unsigned char voverflow;
8539682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall        int width = info->d_width;
8549682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
8559682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall#define ONE_PIXEL_BLEND(condition, widthvar) \
8569682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall        while (condition) { \
8579682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall            Uint32 Pixel; \
8589682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall            unsigned sR, sG, sB, dR, dG, dB, sA; \
8599682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall            DISEMBLE_RGBA(src, 4, srcfmt, Pixel, sR, sG, sB, sA); \
8609682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall            if(sA) { \
8619682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                unsigned short dstpixel = *((unsigned short *)dst); \
8629682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                dR = (dstpixel >> 8) & 0xf8; \
8639682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                dG = (dstpixel >> 3) & 0xfc; \
8649682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                dB = (dstpixel << 3) & 0xf8; \
8659682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
8669682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                *((unsigned short *)dst) = ( \
8679682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                    ((dR & 0xf8) << 8) | ((dG & 0xfc) << 3) | (dB >> 3) \
8689682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                ); \
8699682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall            } \
8709682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall            src += 4; \
8719682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall            dst += 2; \
8729682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall            widthvar--; \
8739682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall        }
8749682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall        ONE_PIXEL_BLEND((UNALIGNED_PTR(dst)) && (width), width);
8759682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall        extrawidth = (width % 8);
8769682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall        valigner = VEC_ALIGNER(src);
8779682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall        vsrc = (vector unsigned char)vec_ld(0, src);
8789682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall        width -= extrawidth;
8799682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall        while (width) {
8809682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall            vector unsigned char valpha;
8819682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall            vector unsigned char vsrc1, vsrc2;
8829682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall            vector unsigned char vdst1, vdst2;
8839682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall            vector unsigned short vR, vG, vB;
8849682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall            vector unsigned short vpixel, vrpixel, vgpixel, vbpixel;
8859682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
8869682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall            /* Load 8 pixels from src as ARGB */
8879682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall            voverflow = (vector unsigned char)vec_ld(15, src);
8889682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall            vsrc = vec_perm(vsrc, voverflow, valigner);
8899682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall            vsrc1 = vec_perm(vsrc, vsrc, vpermute);
8909682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall            src += 16;
8919682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall            vsrc = (vector unsigned char)vec_ld(15, src);
8929682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall            voverflow = vec_perm(voverflow, vsrc, valigner);
8939682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall            vsrc2 = vec_perm(voverflow, voverflow, vpermute);
8949682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall            src += 16;
8959682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
8969682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall            /* Load 8 pixels from dst as XRGB */
8979682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall            voverflow = vec_ld(0, dst);
8989682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall            vR = vec_and((vector unsigned short)voverflow, vf800);
8999682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall            vB = vec_sl((vector unsigned short)voverflow, v3_16);
9009682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall            vG = vec_sl(vB, v2_16);
9019682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall            vdst1 = (vector unsigned char)vec_perm((vector unsigned char)vR, (vector unsigned char)vR, vredalpha1);
9029682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall            vdst1 = vec_perm(vdst1, (vector unsigned char)vB, vblue1);
9039682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall            vdst1 = vec_perm(vdst1, (vector unsigned char)vG, vgreen1);
9049682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall            vdst2 = (vector unsigned char)vec_perm((vector unsigned char)vR, (vector unsigned char)vR, vredalpha2);
9059682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall            vdst2 = vec_perm(vdst2, (vector unsigned char)vB, vblue2);
9069682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall            vdst2 = vec_perm(vdst2, (vector unsigned char)vG, vgreen2);
9079682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
9089682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall            /* Alpha blend 8 pixels as ARGB */
9099682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall            valpha = vec_perm(vsrc1, v0, valphaPermute);
9109682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall            VEC_MULTIPLY_ALPHA(vsrc1, vdst1, valpha, mergePermute, v1_16, v8_16);
9119682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall            valpha = vec_perm(vsrc2, v0, valphaPermute);
9129682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall            VEC_MULTIPLY_ALPHA(vsrc2, vdst2, valpha, mergePermute, v1_16, v8_16);
9139682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
9149682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall            /* Convert 8 pixels to 565 */
9159682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall            vpixel = (vector unsigned short)vec_packpx((vector unsigned int)vdst1, (vector unsigned int)vdst2);
9169682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall            vgpixel = (vector unsigned short)vec_perm(vdst1, vdst2, vgmerge);
9179682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall            vgpixel = vec_and(vgpixel, vfc);
9189682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall            vgpixel = vec_sl(vgpixel, v3_16);
9199682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall            vrpixel = vec_sl(vpixel, v1_16);
9209682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall            vrpixel = vec_and(vrpixel, vf800);
9219682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall            vbpixel = vec_and(vpixel, v3f);
9229682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall            vdst1 = vec_or((vector unsigned char)vrpixel, (vector unsigned char)vgpixel);
9239682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall            vdst1 = vec_or(vdst1, (vector unsigned char)vbpixel);
9249682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
9259682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall            /* Store 8 pixels */
9269682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall            vec_st(vdst1, 0, dst);
9279682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
9289682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall            width -= 8;
9299682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall            dst += 16;
9309682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall        }
9319682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall        ONE_PIXEL_BLEND((extrawidth), extrawidth);
9329682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall#undef ONE_PIXEL_BLEND
9339682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall        src += srcskip;
9349682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall        dst += dstskip;
9359682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    }
9369682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall}
9379682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
9389682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hallstatic void Blit32to32SurfaceAlphaKeyAltivec(SDL_BlitInfo *info)
9399682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall{
9409682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    unsigned alpha = info->src->alpha;
9419682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    int height = info->d_height;
9429682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    Uint32 *srcp = (Uint32 *)info->s_pixels;
9439682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    int srcskip = info->s_skip >> 2;
9449682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    Uint32 *dstp = (Uint32 *)info->d_pixels;
9459682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    int dstskip = info->d_skip >> 2;
9469682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    SDL_PixelFormat *srcfmt = info->src;
9479682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    SDL_PixelFormat *dstfmt = info->dst;
9489682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    unsigned sA = srcfmt->alpha;
9499682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
9509682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    Uint32 rgbmask = srcfmt->Rmask | srcfmt->Gmask | srcfmt->Bmask;
9519682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    Uint32 ckey = info->src->colorkey;
9529682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    vector unsigned char mergePermute;
9539682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    vector unsigned char vsrcPermute;
9549682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    vector unsigned char vdstPermute;
9559682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    vector unsigned char vsdstPermute;
9569682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    vector unsigned char valpha;
9579682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    vector unsigned char valphamask;
9589682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    vector unsigned char vbits;
9599682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    vector unsigned char v0;
9609682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    vector unsigned short v1;
9619682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    vector unsigned short v8;
9629682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    vector unsigned int vckey;
9639682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    vector unsigned int vrgbmask;
9649682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
9659682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    mergePermute = VEC_MERGE_PERMUTE();
9669682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    v0 = vec_splat_u8(0);
9679682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    v1 = vec_splat_u16(1);
9689682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    v8 = vec_splat_u16(8);
9699682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
9709682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    /* set the alpha to 255 on the destination surf */
9719682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    valphamask = VEC_ALPHA_MASK();
9729682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
9739682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    vsrcPermute = calc_swizzle32(srcfmt, NULL);
9749682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    vdstPermute = calc_swizzle32(NULL, dstfmt);
9759682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    vsdstPermute = calc_swizzle32(dstfmt, NULL);
9769682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
9779682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    /* set a vector full of alpha and 255-alpha */
9789682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    ((unsigned char *)&valpha)[0] = alpha;
9799682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    valpha = vec_splat(valpha, 0);
9809682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    vbits = (vector unsigned char)vec_splat_s8(-1);
9819682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
9829682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    ckey &= rgbmask;
9839682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    ((unsigned int *)(char*)&vckey)[0] = ckey;
9849682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    vckey = vec_splat(vckey, 0);
9859682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    ((unsigned int *)(char*)&vrgbmask)[0] = rgbmask;
9869682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    vrgbmask = vec_splat(vrgbmask, 0);
9879682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
9889682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    while(height--) {
9899682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall        int width = info->d_width;
9909682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall#define ONE_PIXEL_BLEND(condition, widthvar) \
9919682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall        while (condition) { \
9929682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall            Uint32 Pixel; \
9939682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall            unsigned sR, sG, sB, dR, dG, dB; \
9949682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall            RETRIEVE_RGB_PIXEL(((Uint8 *)srcp), 4, Pixel); \
9959682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall            if(sA && Pixel != ckey) { \
9969682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                RGB_FROM_PIXEL(Pixel, srcfmt, sR, sG, sB); \
9979682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                DISEMBLE_RGB(((Uint8 *)dstp), 4, dstfmt, Pixel, dR, dG, dB); \
9989682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
9999682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                ASSEMBLE_RGBA(((Uint8 *)dstp), 4, dstfmt, dR, dG, dB, dA); \
10009682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall            } \
10019682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall            dstp++; \
10029682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall            srcp++; \
10039682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall            widthvar--; \
10049682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall        }
10059682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall        ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
10069682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall        if (width > 0) {
10079682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall            int extrawidth = (width % 4);
10089682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall            vector unsigned char valigner = VEC_ALIGNER(srcp);
10099682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall            vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
10109682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall            width -= extrawidth;
10119682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall            while (width) {
10129682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                vector unsigned char vsel;
10139682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                vector unsigned char voverflow;
10149682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                vector unsigned char vd;
10159682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                vector unsigned char vd_orig;
10169682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
10179682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                /* s = *srcp */
10189682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                voverflow = (vector unsigned char)vec_ld(15, srcp);
10199682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                vs = vec_perm(vs, voverflow, valigner);
10209682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
10219682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                /* vsel is set for items that match the key */
10229682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                vsel = (vector unsigned char)vec_and((vector unsigned int)vs, vrgbmask);
10239682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                vsel = (vector unsigned char)vec_cmpeq((vector unsigned int)vsel, vckey);
10249682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
10259682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                /* permute to source format */
10269682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                vs = vec_perm(vs, valpha, vsrcPermute);
10279682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
10289682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                /* d = *dstp */
10299682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                vd = (vector unsigned char)vec_ld(0, dstp);
10309682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                vd_orig = vd = vec_perm(vd, v0, vsdstPermute);
10319682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
10329682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
10339682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
10349682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                /* set the alpha channel to full on */
10359682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                vd = vec_or(vd, valphamask);
10369682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
10379682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                /* mask out color key */
10389682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                vd = vec_sel(vd, vd_orig, vsel);
10399682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
10409682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                /* permute to dest format */
10419682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                vd = vec_perm(vd, vbits, vdstPermute);
10429682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
10439682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                /* *dstp = res */
10449682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                vec_st((vector unsigned int)vd, 0, dstp);
10459682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
10469682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                srcp += 4;
10479682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                dstp += 4;
10489682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                width -= 4;
10499682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                vs = voverflow;
10509682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall            }
10519682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall            ONE_PIXEL_BLEND((extrawidth), extrawidth);
10529682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall        }
10539682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall#undef ONE_PIXEL_BLEND
10549682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
10559682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall        srcp += srcskip;
10569682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall        dstp += dstskip;
10579682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    }
10589682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall}
10599682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
10609682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
10619682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hallstatic void Blit32to32PixelAlphaAltivec(SDL_BlitInfo *info)
10629682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall{
10639682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    int width = info->d_width;
10649682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    int height = info->d_height;
10659682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    Uint32 *srcp = (Uint32 *)info->s_pixels;
10669682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    int srcskip = info->s_skip >> 2;
10679682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    Uint32 *dstp = (Uint32 *)info->d_pixels;
10689682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    int dstskip = info->d_skip >> 2;
10699682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    SDL_PixelFormat *srcfmt = info->src;
10709682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    SDL_PixelFormat *dstfmt = info->dst;
10719682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    vector unsigned char mergePermute;
10729682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    vector unsigned char valphaPermute;
10739682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    vector unsigned char vsrcPermute;
10749682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    vector unsigned char vdstPermute;
10759682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    vector unsigned char vsdstPermute;
10769682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    vector unsigned char valphamask;
10779682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    vector unsigned char vpixelmask;
10789682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    vector unsigned char v0;
10799682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    vector unsigned short v1;
10809682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    vector unsigned short v8;
10819682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
10829682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    v0 = vec_splat_u8(0);
10839682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    v1 = vec_splat_u16(1);
10849682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    v8 = vec_splat_u16(8);
10859682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    mergePermute = VEC_MERGE_PERMUTE();
10869682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    valphamask = VEC_ALPHA_MASK();
10879682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    valphaPermute = vec_and(vec_lvsl(0, (int *)NULL), vec_splat_u8(0xC));
10889682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    vpixelmask = vec_nor(valphamask, v0);
10899682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    vsrcPermute = calc_swizzle32(srcfmt, NULL);
10909682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    vdstPermute = calc_swizzle32(NULL, dstfmt);
10919682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    vsdstPermute = calc_swizzle32(dstfmt, NULL);
10929682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
10939682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	while ( height-- ) {
10949682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall        width = info->d_width;
10959682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall#define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \
10969682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall            Uint32 Pixel; \
10979682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall            unsigned sR, sG, sB, dR, dG, dB, sA, dA; \
10989682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall            DISEMBLE_RGBA((Uint8 *)srcp, 4, srcfmt, Pixel, sR, sG, sB, sA); \
10999682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall            if(sA) { \
11009682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall              DISEMBLE_RGBA((Uint8 *)dstp, 4, dstfmt, Pixel, dR, dG, dB, dA); \
11019682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall              ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
11029682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall              ASSEMBLE_RGBA((Uint8 *)dstp, 4, dstfmt, dR, dG, dB, dA); \
11039682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall            } \
11049682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall            ++srcp; \
11059682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall            ++dstp; \
11069682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall            widthvar--; \
11079682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall        }
11089682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall        ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
11099682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall        if (width > 0) {
11109682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall            /* vsrcPermute */
11119682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall            /* vdstPermute */
11129682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall            int extrawidth = (width % 4);
11139682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall            vector unsigned char valigner = VEC_ALIGNER(srcp);
11149682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall            vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
11159682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall            width -= extrawidth;
11169682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall            while (width) {
11179682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                vector unsigned char voverflow;
11189682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                vector unsigned char vd;
11199682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                vector unsigned char valpha;
11209682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                vector unsigned char vdstalpha;
11219682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                /* s = *srcp */
11229682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                voverflow = (vector unsigned char)vec_ld(15, srcp);
11239682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                vs = vec_perm(vs, voverflow, valigner);
11249682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                vs = vec_perm(vs, v0, vsrcPermute);
11259682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
11269682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                valpha = vec_perm(vs, v0, valphaPermute);
11279682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
11289682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                /* d = *dstp */
11299682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                vd = (vector unsigned char)vec_ld(0, dstp);
11309682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                vd = vec_perm(vd, v0, vsdstPermute);
11319682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                vdstalpha = vec_and(vd, valphamask);
11329682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
11339682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
11349682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
11359682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                /* set the alpha to the dest alpha */
11369682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                vd = vec_and(vd, vpixelmask);
11379682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                vd = vec_or(vd, vdstalpha);
11389682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                vd = vec_perm(vd, v0, vdstPermute);
11399682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
11409682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                /* *dstp = res */
11419682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                vec_st((vector unsigned int)vd, 0, dstp);
11429682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
11439682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                srcp += 4;
11449682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                dstp += 4;
11459682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                width -= 4;
11469682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                vs = voverflow;
11479682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
11489682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall            }
11499682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall            ONE_PIXEL_BLEND((extrawidth), extrawidth);
11509682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall        }
11519682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	    srcp += srcskip;
11529682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	    dstp += dstskip;
11539682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall#undef ONE_PIXEL_BLEND
11549682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	}
11559682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall}
11569682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
11579682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall/* fast ARGB888->(A)RGB888 blending with pixel alpha */
11589682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hallstatic void BlitRGBtoRGBPixelAlphaAltivec(SDL_BlitInfo *info)
11599682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall{
11609682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	int width = info->d_width;
11619682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	int height = info->d_height;
11629682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	Uint32 *srcp = (Uint32 *)info->s_pixels;
11639682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	int srcskip = info->s_skip >> 2;
11649682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	Uint32 *dstp = (Uint32 *)info->d_pixels;
11659682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	int dstskip = info->d_skip >> 2;
11669682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    vector unsigned char mergePermute;
11679682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    vector unsigned char valphaPermute;
11689682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    vector unsigned char valphamask;
11699682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    vector unsigned char vpixelmask;
11709682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    vector unsigned char v0;
11719682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    vector unsigned short v1;
11729682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    vector unsigned short v8;
11739682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    v0 = vec_splat_u8(0);
11749682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    v1 = vec_splat_u16(1);
11759682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    v8 = vec_splat_u16(8);
11769682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    mergePermute = VEC_MERGE_PERMUTE();
11779682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    valphamask = VEC_ALPHA_MASK();
11789682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    valphaPermute = vec_and(vec_lvsl(0, (int *)NULL), vec_splat_u8(0xC));
11799682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
11809682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
11819682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    vpixelmask = vec_nor(valphamask, v0);
11829682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	while(height--) {
11839682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall        width = info->d_width;
11849682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall#define ONE_PIXEL_BLEND(condition, widthvar) \
11859682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall        while ((condition)) { \
11869682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall            Uint32 dalpha; \
11879682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall            Uint32 d; \
11889682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall            Uint32 s1; \
11899682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall            Uint32 d1; \
11909682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall            Uint32 s = *srcp; \
11919682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall            Uint32 alpha = s >> 24; \
11929682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall            if(alpha) { \
11939682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall              if(alpha == SDL_ALPHA_OPAQUE) { \
11949682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                *dstp = (s & 0x00ffffff) | (*dstp & 0xff000000); \
11959682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall              } else { \
11969682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                d = *dstp; \
11979682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                dalpha = d & 0xff000000; \
11989682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                s1 = s & 0xff00ff; \
11999682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                d1 = d & 0xff00ff; \
12009682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                d1 = (d1 + ((s1 - d1) * alpha >> 8)) & 0xff00ff; \
12019682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                s &= 0xff00; \
12029682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                d &= 0xff00; \
12039682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                d = (d + ((s - d) * alpha >> 8)) & 0xff00; \
12049682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                *dstp = d1 | d | dalpha; \
12059682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall              } \
12069682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall            } \
12079682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall            ++srcp; \
12089682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall            ++dstp; \
12099682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall            widthvar--; \
12109682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	    }
12119682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall        ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
12129682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall        if (width > 0) {
12139682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall            int extrawidth = (width % 4);
12149682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall            vector unsigned char valigner = VEC_ALIGNER(srcp);
12159682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall            vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
12169682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall            width -= extrawidth;
12179682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall            while (width) {
12189682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                vector unsigned char voverflow;
12199682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                vector unsigned char vd;
12209682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                vector unsigned char valpha;
12219682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                vector unsigned char vdstalpha;
12229682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                /* s = *srcp */
12239682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                voverflow = (vector unsigned char)vec_ld(15, srcp);
12249682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                vs = vec_perm(vs, voverflow, valigner);
12259682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
12269682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                valpha = vec_perm(vs, v0, valphaPermute);
12279682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
12289682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                /* d = *dstp */
12299682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                vd = (vector unsigned char)vec_ld(0, dstp);
12309682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                vdstalpha = vec_and(vd, valphamask);
12319682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
12329682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
12339682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
12349682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                /* set the alpha to the dest alpha */
12359682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                vd = vec_and(vd, vpixelmask);
12369682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                vd = vec_or(vd, vdstalpha);
12379682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
12389682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                /* *dstp = res */
12399682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                vec_st((vector unsigned int)vd, 0, dstp);
12409682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
12419682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                srcp += 4;
12429682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                dstp += 4;
12439682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                width -= 4;
12449682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                vs = voverflow;
12459682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall            }
12469682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall            ONE_PIXEL_BLEND((extrawidth), extrawidth);
12479682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall        }
12489682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	    srcp += srcskip;
12499682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	    dstp += dstskip;
12509682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	}
12519682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall#undef ONE_PIXEL_BLEND
12529682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall}
12539682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
12549682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hallstatic void Blit32to32SurfaceAlphaAltivec(SDL_BlitInfo *info)
12559682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall{
12569682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    /* XXX : 6 */
12579682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	unsigned alpha = info->src->alpha;
12589682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    int height = info->d_height;
12599682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    Uint32 *srcp = (Uint32 *)info->s_pixels;
12609682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    int srcskip = info->s_skip >> 2;
12619682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    Uint32 *dstp = (Uint32 *)info->d_pixels;
12629682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    int dstskip = info->d_skip >> 2;
12639682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    SDL_PixelFormat *srcfmt = info->src;
12649682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    SDL_PixelFormat *dstfmt = info->dst;
12659682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	unsigned sA = srcfmt->alpha;
12669682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
12679682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    vector unsigned char mergePermute;
12689682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    vector unsigned char vsrcPermute;
12699682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    vector unsigned char vdstPermute;
12709682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    vector unsigned char vsdstPermute;
12719682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    vector unsigned char valpha;
12729682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    vector unsigned char valphamask;
12739682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    vector unsigned char vbits;
12749682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    vector unsigned short v1;
12759682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    vector unsigned short v8;
12769682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
12779682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    mergePermute = VEC_MERGE_PERMUTE();
12789682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    v1 = vec_splat_u16(1);
12799682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    v8 = vec_splat_u16(8);
12809682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
12819682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    /* set the alpha to 255 on the destination surf */
12829682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    valphamask = VEC_ALPHA_MASK();
12839682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
12849682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    vsrcPermute = calc_swizzle32(srcfmt, NULL);
12859682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    vdstPermute = calc_swizzle32(NULL, dstfmt);
12869682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    vsdstPermute = calc_swizzle32(dstfmt, NULL);
12879682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
12889682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    /* set a vector full of alpha and 255-alpha */
12899682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    ((unsigned char *)&valpha)[0] = alpha;
12909682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    valpha = vec_splat(valpha, 0);
12919682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    vbits = (vector unsigned char)vec_splat_s8(-1);
12929682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
12939682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    while(height--) {
12949682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall        int width = info->d_width;
12959682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall#define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \
12969682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall            Uint32 Pixel; \
12979682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall            unsigned sR, sG, sB, dR, dG, dB; \
12989682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall            DISEMBLE_RGB(((Uint8 *)srcp), 4, srcfmt, Pixel, sR, sG, sB); \
12999682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall            DISEMBLE_RGB(((Uint8 *)dstp), 4, dstfmt, Pixel, dR, dG, dB); \
13009682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall            ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
13019682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall            ASSEMBLE_RGBA(((Uint8 *)dstp), 4, dstfmt, dR, dG, dB, dA); \
13029682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall            ++srcp; \
13039682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall            ++dstp; \
13049682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall            widthvar--; \
13059682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall        }
13069682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall        ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
13079682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall        if (width > 0) {
13089682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall            int extrawidth = (width % 4);
13099682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall            vector unsigned char valigner = VEC_ALIGNER(srcp);
13109682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall            vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
13119682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall            width -= extrawidth;
13129682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall            while (width) {
13139682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                vector unsigned char voverflow;
13149682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                vector unsigned char vd;
13159682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
13169682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                /* s = *srcp */
13179682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                voverflow = (vector unsigned char)vec_ld(15, srcp);
13189682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                vs = vec_perm(vs, voverflow, valigner);
13199682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                vs = vec_perm(vs, valpha, vsrcPermute);
13209682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
13219682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                /* d = *dstp */
13229682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                vd = (vector unsigned char)vec_ld(0, dstp);
13239682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                vd = vec_perm(vd, vd, vsdstPermute);
13249682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
13259682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
13269682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
13279682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                /* set the alpha channel to full on */
13289682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                vd = vec_or(vd, valphamask);
13299682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                vd = vec_perm(vd, vbits, vdstPermute);
13309682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
13319682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                /* *dstp = res */
13329682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                vec_st((vector unsigned int)vd, 0, dstp);
13339682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
13349682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                srcp += 4;
13359682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                dstp += 4;
13369682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                width -= 4;
13379682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                vs = voverflow;
13389682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall            }
13399682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall            ONE_PIXEL_BLEND((extrawidth), extrawidth);
13409682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall        }
13419682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall#undef ONE_PIXEL_BLEND
13429682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
13439682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall        srcp += srcskip;
13449682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall        dstp += dstskip;
13459682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    }
13469682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
13479682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall}
13489682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
13499682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
13509682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall/* fast RGB888->(A)RGB888 blending */
13519682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hallstatic void BlitRGBtoRGBSurfaceAlphaAltivec(SDL_BlitInfo *info)
13529682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall{
13539682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	unsigned alpha = info->src->alpha;
13549682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    int height = info->d_height;
13559682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    Uint32 *srcp = (Uint32 *)info->s_pixels;
13569682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    int srcskip = info->s_skip >> 2;
13579682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    Uint32 *dstp = (Uint32 *)info->d_pixels;
13589682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    int dstskip = info->d_skip >> 2;
13599682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    vector unsigned char mergePermute;
13609682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    vector unsigned char valpha;
13619682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    vector unsigned char valphamask;
13629682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    vector unsigned short v1;
13639682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    vector unsigned short v8;
13649682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
13659682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    mergePermute = VEC_MERGE_PERMUTE();
13669682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    v1 = vec_splat_u16(1);
13679682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    v8 = vec_splat_u16(8);
13689682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
13699682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    /* set the alpha to 255 on the destination surf */
13709682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    valphamask = VEC_ALPHA_MASK();
13719682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
13729682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    /* set a vector full of alpha and 255-alpha */
13739682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    ((unsigned char *)&valpha)[0] = alpha;
13749682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    valpha = vec_splat(valpha, 0);
13759682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
13769682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    while(height--) {
13779682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall        int width = info->d_width;
13789682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall#define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \
13799682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall            Uint32 s = *srcp; \
13809682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall            Uint32 d = *dstp; \
13819682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall            Uint32 s1 = s & 0xff00ff; \
13829682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall            Uint32 d1 = d & 0xff00ff; \
13839682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall            d1 = (d1 + ((s1 - d1) * alpha >> 8)) \
13849682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                 & 0xff00ff; \
13859682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall            s &= 0xff00; \
13869682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall            d &= 0xff00; \
13879682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall            d = (d + ((s - d) * alpha >> 8)) & 0xff00; \
13889682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall            *dstp = d1 | d | 0xff000000; \
13899682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall            ++srcp; \
13909682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall            ++dstp; \
13919682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall            widthvar--; \
13929682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall        }
13939682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall        ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
13949682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall        if (width > 0) {
13959682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall            int extrawidth = (width % 4);
13969682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall            vector unsigned char valigner = VEC_ALIGNER(srcp);
13979682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall            vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
13989682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall            width -= extrawidth;
13999682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall            while (width) {
14009682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                vector unsigned char voverflow;
14019682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                vector unsigned char vd;
14029682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
14039682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                /* s = *srcp */
14049682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                voverflow = (vector unsigned char)vec_ld(15, srcp);
14059682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                vs = vec_perm(vs, voverflow, valigner);
14069682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
14079682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                /* d = *dstp */
14089682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                vd = (vector unsigned char)vec_ld(0, dstp);
14099682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
14109682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
14119682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
14129682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                /* set the alpha channel to full on */
14139682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                vd = vec_or(vd, valphamask);
14149682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
14159682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                /* *dstp = res */
14169682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                vec_st((vector unsigned int)vd, 0, dstp);
14179682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
14189682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                srcp += 4;
14199682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                dstp += 4;
14209682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                width -= 4;
14219682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                vs = voverflow;
14229682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall            }
14239682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall            ONE_PIXEL_BLEND((extrawidth), extrawidth);
14249682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall        }
14259682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall#undef ONE_PIXEL_BLEND
14269682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
14279682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall        srcp += srcskip;
14289682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall        dstp += dstskip;
14299682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    }
14309682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall}
14319682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall#if __MWERKS__
14329682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall#pragma altivec_model off
14339682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall#endif
14349682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall#endif /* SDL_ALTIVEC_BLITTERS */
14359682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
14369682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall/* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
14379682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hallstatic void BlitRGBtoRGBSurfaceAlpha128(SDL_BlitInfo *info)
14389682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall{
14399682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	int width = info->d_width;
14409682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	int height = info->d_height;
14419682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	Uint32 *srcp = (Uint32 *)info->s_pixels;
14429682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	int srcskip = info->s_skip >> 2;
14439682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	Uint32 *dstp = (Uint32 *)info->d_pixels;
14449682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	int dstskip = info->d_skip >> 2;
14459682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
14469682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	while(height--) {
14479682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	    DUFFS_LOOP4({
14489682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		    Uint32 s = *srcp++;
14499682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		    Uint32 d = *dstp;
14509682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		    *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
14519682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			       + (s & d & 0x00010101)) | 0xff000000;
14529682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	    }, width);
14539682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	    srcp += srcskip;
14549682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	    dstp += dstskip;
14559682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	}
14569682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall}
14579682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
14589682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall/* fast RGB888->(A)RGB888 blending with surface alpha */
14599682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hallstatic void BlitRGBtoRGBSurfaceAlpha(SDL_BlitInfo *info)
14609682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall{
14619682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	unsigned alpha = info->src->alpha;
14629682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	if(alpha == 128) {
14639682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		BlitRGBtoRGBSurfaceAlpha128(info);
14649682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	} else {
14659682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		int width = info->d_width;
14669682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		int height = info->d_height;
14679682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		Uint32 *srcp = (Uint32 *)info->s_pixels;
14689682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		int srcskip = info->s_skip >> 2;
14699682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		Uint32 *dstp = (Uint32 *)info->d_pixels;
14709682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		int dstskip = info->d_skip >> 2;
14719682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		Uint32 s;
14729682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		Uint32 d;
14739682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		Uint32 s1;
14749682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		Uint32 d1;
14759682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
14769682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		while(height--) {
14779682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			DUFFS_LOOP_DOUBLE2({
14789682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				/* One Pixel Blend */
14799682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				s = *srcp;
14809682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				d = *dstp;
14819682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				s1 = s & 0xff00ff;
14829682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				d1 = d & 0xff00ff;
14839682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				d1 = (d1 + ((s1 - d1) * alpha >> 8))
14849682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				     & 0xff00ff;
14859682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				s &= 0xff00;
14869682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				d &= 0xff00;
14879682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				d = (d + ((s - d) * alpha >> 8)) & 0xff00;
14889682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				*dstp = d1 | d | 0xff000000;
14899682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				++srcp;
14909682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				++dstp;
14919682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			},{
14929682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			        /* Two Pixels Blend */
14939682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				s = *srcp;
14949682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				d = *dstp;
14959682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				s1 = s & 0xff00ff;
14969682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				d1 = d & 0xff00ff;
14979682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				d1 += (s1 - d1) * alpha >> 8;
14989682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				d1 &= 0xff00ff;
14999682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
15009682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				s = ((s & 0xff00) >> 8) |
15019682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall					((srcp[1] & 0xff00) << 8);
15029682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				d = ((d & 0xff00) >> 8) |
15039682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall					((dstp[1] & 0xff00) << 8);
15049682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				d += (s - d) * alpha >> 8;
15059682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				d &= 0x00ff00ff;
15069682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
15079682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				*dstp++ = d1 | ((d << 8) & 0xff00) | 0xff000000;
15089682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				++srcp;
15099682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
15109682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			        s1 = *srcp;
15119682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				d1 = *dstp;
15129682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				s1 &= 0xff00ff;
15139682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				d1 &= 0xff00ff;
15149682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				d1 += (s1 - d1) * alpha >> 8;
15159682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				d1 &= 0xff00ff;
15169682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
15179682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				*dstp = d1 | ((d >> 8) & 0xff00) | 0xff000000;
15189682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				++srcp;
15199682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				++dstp;
15209682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			}, width);
15219682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			srcp += srcskip;
15229682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			dstp += dstskip;
15239682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		}
15249682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	}
15259682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall}
15269682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
15279682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall/* fast ARGB888->(A)RGB888 blending with pixel alpha */
15289682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hallstatic void BlitRGBtoRGBPixelAlpha(SDL_BlitInfo *info)
15299682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall{
15309682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	int width = info->d_width;
15319682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	int height = info->d_height;
15329682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	Uint32 *srcp = (Uint32 *)info->s_pixels;
15339682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	int srcskip = info->s_skip >> 2;
15349682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	Uint32 *dstp = (Uint32 *)info->d_pixels;
15359682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	int dstskip = info->d_skip >> 2;
15369682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
15379682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	while(height--) {
15389682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	    DUFFS_LOOP4({
15399682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		Uint32 dalpha;
15409682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		Uint32 d;
15419682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		Uint32 s1;
15429682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		Uint32 d1;
15439682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		Uint32 s = *srcp;
15449682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		Uint32 alpha = s >> 24;
15459682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		/* FIXME: Here we special-case opaque alpha since the
15469682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		   compositioning used (>>8 instead of /255) doesn't handle
15479682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		   it correctly. Also special-case alpha=0 for speed?
15489682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		   Benchmark this! */
15499682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		if(alpha) {
15509682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		  if(alpha == SDL_ALPHA_OPAQUE) {
15519682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		    *dstp = (s & 0x00ffffff) | (*dstp & 0xff000000);
15529682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		  } else {
15539682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		    /*
15549682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		     * take out the middle component (green), and process
15559682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		     * the other two in parallel. One multiply less.
15569682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		     */
15579682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		    d = *dstp;
15589682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		    dalpha = d & 0xff000000;
15599682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		    s1 = s & 0xff00ff;
15609682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		    d1 = d & 0xff00ff;
15619682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		    d1 = (d1 + ((s1 - d1) * alpha >> 8)) & 0xff00ff;
15629682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		    s &= 0xff00;
15639682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		    d &= 0xff00;
15649682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		    d = (d + ((s - d) * alpha >> 8)) & 0xff00;
15659682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		    *dstp = d1 | d | dalpha;
15669682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		  }
15679682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		}
15689682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		++srcp;
15699682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		++dstp;
15709682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	    }, width);
15719682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	    srcp += srcskip;
15729682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	    dstp += dstskip;
15739682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	}
15749682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall}
15759682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
15769682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall#if GCC_ASMBLIT
15779682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall/* fast (as in MMX with prefetch) ARGB888->(A)RGB888 blending with pixel alpha */
15789682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hallstatic void BlitRGBtoRGBPixelAlphaMMX3DNOW(SDL_BlitInfo *info)
15799682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall{
15809682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	int width = info->d_width;
15819682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	int height = info->d_height;
15829682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	Uint32 *srcp = (Uint32 *)info->s_pixels;
15839682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	int srcskip = info->s_skip >> 2;
15849682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	Uint32 *dstp = (Uint32 *)info->d_pixels;
15859682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	int dstskip = info->d_skip >> 2;
15869682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	SDL_PixelFormat* sf = info->src;
15879682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	Uint32 amask = sf->Amask;
15889682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
15899682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	__asm__ (
15909682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	/* make mm6 all zeros. */
15919682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	"pxor       %%mm6, %%mm6\n"
15929682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
15939682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	/* Make a mask to preserve the alpha. */
15949682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	"movd      %0, %%mm7\n\t"           /* 0000F000 -> mm7 */
15959682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	"punpcklbw %%mm7, %%mm7\n\t"        /* FF000000 -> mm7 */
15969682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	"pcmpeqb   %%mm4, %%mm4\n\t"        /* FFFFFFFF -> mm4 */
15979682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	"movq      %%mm4, %%mm3\n\t"        /* FFFFFFFF -> mm3 (for later) */
15989682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	"pxor      %%mm4, %%mm7\n\t"        /* 00FFFFFF -> mm7 (mult mask) */
15999682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
16009682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	/* form channel masks */
16019682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	"movq      %%mm7, %%mm4\n\t"        /* 00FFFFFF -> mm4 */
16029682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	"packsswb  %%mm6, %%mm4\n\t"        /* 00000FFF -> mm4 (channel mask) */
16039682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	"packsswb  %%mm6, %%mm3\n\t"        /* 0000FFFF -> mm3 */
16049682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	"pxor      %%mm4, %%mm3\n\t"        /* 0000F000 -> mm3 (~channel mask) */
16059682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
16069682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	/* get alpha channel shift */
16079682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	"movd      %1, %%mm5\n\t" /* Ashift -> mm5 */
16089682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
16099682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	  : /* nothing */ : "rm" (amask), "rm" ((Uint32) sf->Ashift) );
16109682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
16119682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	while(height--) {
16129682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
16139682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	    DUFFS_LOOP4({
16149682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		Uint32 alpha;
16159682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
16169682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		__asm__ (
16179682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		"prefetch 64(%0)\n"
16189682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		"prefetch 64(%1)\n"
16199682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			: : "r" (srcp), "r" (dstp) );
16209682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
16219682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		alpha = *srcp & amask;
16229682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		/* FIXME: Here we special-case opaque alpha since the
16239682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		   compositioning used (>>8 instead of /255) doesn't handle
16249682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		   it correctly. Also special-case alpha=0 for speed?
16259682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		   Benchmark this! */
16269682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		if(alpha == 0) {
16279682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		    /* do nothing */
16289682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		}
16299682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		else if(alpha == amask) {
16309682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			/* opaque alpha -- copy RGB, keep dst alpha */
16319682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		    /* using MMX here to free up regular registers for other things */
16329682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			    __asm__ (
16339682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		    "movd      (%0),  %%mm0\n\t" /* src(ARGB) -> mm0 (0000ARGB)*/
16349682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		    "movd      (%1),  %%mm1\n\t" /* dst(ARGB) -> mm1 (0000ARGB)*/
16359682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		    "pand      %%mm4, %%mm0\n\t" /* src & chanmask -> mm0 */
16369682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		    "pand      %%mm3, %%mm1\n\t" /* dst & ~chanmask -> mm2 */
16379682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		    "por       %%mm0, %%mm1\n\t" /* src | dst -> mm1 */
16389682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		    "movd      %%mm1, (%1) \n\t" /* mm1 -> dst */
16399682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
16409682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		     : : "r" (srcp), "r" (dstp) );
16419682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		}
16429682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
16439682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		else {
16449682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			    __asm__ (
16459682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		    /* load in the source, and dst. */
16469682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		    "movd      (%0), %%mm0\n"		    /* mm0(s) = 0 0 0 0 | As Rs Gs Bs */
16479682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		    "movd      (%1), %%mm1\n"		    /* mm1(d) = 0 0 0 0 | Ad Rd Gd Bd */
16489682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
16499682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		    /* Move the src alpha into mm2 */
16509682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
16519682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		    /* if supporting pshufw */
16529682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		    /*"pshufw     $0x55, %%mm0, %%mm2\n" */ /* mm2 = 0 As 0 As |  0 As  0  As */
16539682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		    /*"psrlw     $8, %%mm2\n" */
16549682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
16559682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		    /* else: */
16569682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		    "movd       %2,    %%mm2\n"
16579682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		    "psrld      %%mm5, %%mm2\n"                /* mm2 = 0 0 0 0 | 0  0  0  As */
16589682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		    "punpcklwd	%%mm2, %%mm2\n"	            /* mm2 = 0 0 0 0 |  0 As  0  As */
16599682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		    "punpckldq	%%mm2, %%mm2\n"             /* mm2 = 0 As 0 As |  0 As  0  As */
16609682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		    "pand       %%mm7, %%mm2\n"              /* to preserve dest alpha */
16619682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
16629682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		    /* move the colors into words. */
16639682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		    "punpcklbw %%mm6, %%mm0\n"		    /* mm0 = 0 As 0 Rs | 0 Gs 0 Bs */
16649682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		    "punpcklbw %%mm6, %%mm1\n"              /* mm0 = 0 Ad 0 Rd | 0 Gd 0 Bd */
16659682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
16669682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		    /* src - dst */
16679682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		    "psubw    %%mm1, %%mm0\n"		    /* mm0 = As-Ad Rs-Rd | Gs-Gd  Bs-Bd */
16689682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
16699682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		    /* A * (src-dst) */
16709682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		    "pmullw    %%mm2, %%mm0\n"		    /* mm0 = 0*As-d As*Rs-d | As*Gs-d  As*Bs-d */
16719682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		    "psrlw     $8,    %%mm0\n"		    /* mm0 = 0>>8 Rc>>8 | Gc>>8  Bc>>8 */
16729682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		    "paddb     %%mm1, %%mm0\n"		    /* mm0 = 0+Ad Rc+Rd | Gc+Gd  Bc+Bd */
16739682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
16749682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		    "packuswb  %%mm0, %%mm0\n"              /* mm0 =             | Ac Rc Gc Bc */
16759682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
16769682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		    "movd      %%mm0, (%1)\n"               /* result in mm0 */
16779682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
16789682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		     : : "r" (srcp), "r" (dstp), "r" (alpha) );
16799682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
16809682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		}
16819682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		++srcp;
16829682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		++dstp;
16839682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	    }, width);
16849682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	    srcp += srcskip;
16859682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	    dstp += dstskip;
16869682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	}
16879682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
16889682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	__asm__ (
16899682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	"emms\n"
16909682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		:   );
16919682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall}
16929682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall/* End GCC_ASMBLIT*/
16939682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
16949682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall#elif MSVC_ASMBLIT
16959682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall/* fast (as in MMX with prefetch) ARGB888->(A)RGB888 blending with pixel alpha */
16969682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hallstatic void BlitRGBtoRGBPixelAlphaMMX3DNOW(SDL_BlitInfo *info)
16979682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall{
16989682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	int width = info->d_width;
16999682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	int height = info->d_height;
17009682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	Uint32 *srcp = (Uint32 *)info->s_pixels;
17019682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	int srcskip = info->s_skip >> 2;
17029682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	Uint32 *dstp = (Uint32 *)info->d_pixels;
17039682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	int dstskip = info->d_skip >> 2;
17049682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	SDL_PixelFormat* sf = info->src;
17059682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	Uint32 chanmask = sf->Rmask | sf->Gmask | sf->Bmask;
17069682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	Uint32 amask = sf->Amask;
17079682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	Uint32 ashift = sf->Ashift;
17089682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	Uint64 multmask;
17099682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
17109682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	__m64 src1, dst1, mm_alpha, mm_zero, dmask;
17119682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
17129682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	mm_zero = _mm_setzero_si64(); /* 0 -> mm_zero */
17139682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	multmask = ~(0xFFFFi64 << (ashift * 2));
17149682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	dmask = *(__m64*) &multmask; /* dst alpha mask -> dmask */
17159682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
17169682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	while(height--) {
17179682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	    DUFFS_LOOP4({
17189682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		Uint32 alpha;
17199682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
17209682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		_m_prefetch(srcp + 16);
17219682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		_m_prefetch(dstp + 16);
17229682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
17239682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		alpha = *srcp & amask;
17249682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		if (alpha == 0) {
17259682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			/* do nothing */
17269682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		} else if (alpha == amask) {
17279682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			/* copy RGB, keep dst alpha */
17289682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			*dstp = (*srcp & chanmask) | (*dstp & ~chanmask);
17299682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		} else {
17309682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			src1 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src1 (0000ARGB)*/
17319682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			src1 = _mm_unpacklo_pi8(src1, mm_zero); /* 0A0R0G0B -> src1 */
17329682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
17339682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB)*/
17349682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
17359682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
17369682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			mm_alpha = _mm_cvtsi32_si64(alpha); /* alpha -> mm_alpha (0000000A) */
17379682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			mm_alpha = _mm_srli_si64(mm_alpha, ashift); /* mm_alpha >> ashift -> mm_alpha(0000000A) */
17389682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
17399682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
17409682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			mm_alpha = _mm_and_si64(mm_alpha, dmask); /* 000A0A0A -> mm_alpha, preserve dst alpha on add */
17419682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
17429682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			/* blend */
17439682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			src1 = _mm_sub_pi16(src1, dst1);/* src - dst -> src1 */
17449682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			src1 = _mm_mullo_pi16(src1, mm_alpha); /* (src - dst) * alpha -> src1 */
17459682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			src1 = _mm_srli_pi16(src1, 8); /* src1 >> 8 -> src1(000R0G0B) */
17469682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1(dst) -> dst1(0A0R0G0B) */
17479682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			dst1 = _mm_packs_pu16(dst1, mm_zero);  /* 0000ARGB -> dst1 */
17489682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
17499682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			*dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
17509682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		}
17519682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		++srcp;
17529682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		++dstp;
17539682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	    }, width);
17549682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	    srcp += srcskip;
17559682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	    dstp += dstskip;
17569682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	}
17579682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	_mm_empty();
17589682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall}
17599682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall/* End MSVC_ASMBLIT */
17609682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
17619682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall#endif /* GCC_ASMBLIT, MSVC_ASMBLIT */
17629682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
17639682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall/* 16bpp special case for per-surface alpha=50%: blend 2 pixels in parallel */
17649682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
/*
 * blend a single 16 bit pixel at 50%
 *
 * `mask` selects every bit except the lowest bit of each colour field.
 * The masked values are added and halved in one go; the dropped low bits
 * contribute 1 only when set in both pixels.  Every argument is
 * parenthesized so that expression arguments (e.g. `a | b`) expand
 * correctly; arguments are still evaluated more than once.
 */
#define BLEND16_50(d, s, mask)						\
	(((((s) & (mask)) + ((d) & (mask))) >> 1) +			\
	 ((s) & (d) & (~(mask) & 0xffff)))
17689682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
/*
 * blend two 16 bit pixels at 50%
 *
 * `d` and `s` each hold two packed 16-bit pixels; `mask` is the 16-bit
 * per-pixel mask, replicated here into both halves.  The mask is widened
 * to Uint32 before shifting: a 16-bit mask is promoted to signed int, and
 * shifting a value with bit 15 set (e.g. 0xf7de) left by 16 would overflow
 * it, which is undefined behaviour.  Every argument is parenthesized;
 * arguments are still evaluated more than once.
 */
#define BLEND2x16_50(d, s, mask)					     \
	((((s) & ((Uint32)(mask) | ((Uint32)(mask) << 16))) >> 1)	     \
	 + (((d) & ((Uint32)(mask) | ((Uint32)(mask) << 16))) >> 1)	     \
	 + ((s) & (d) & ~((Uint32)(mask) | ((Uint32)(mask) << 16))))
17739682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
17749682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hallstatic void Blit16to16SurfaceAlpha128(SDL_BlitInfo *info, Uint16 mask)
17759682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall{
17769682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	int width = info->d_width;
17779682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	int height = info->d_height;
17789682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	Uint16 *srcp = (Uint16 *)info->s_pixels;
17799682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	int srcskip = info->s_skip >> 1;
17809682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	Uint16 *dstp = (Uint16 *)info->d_pixels;
17819682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	int dstskip = info->d_skip >> 1;
17829682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
17839682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	while(height--) {
17849682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		if(((uintptr_t)srcp ^ (uintptr_t)dstp) & 2) {
17859682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			/*
17869682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			 * Source and destination not aligned, pipeline it.
17879682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			 * This is mostly a win for big blits but no loss for
17889682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			 * small ones
17899682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			 */
17909682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			Uint32 prev_sw;
17919682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			int w = width;
17929682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
17939682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			/* handle odd destination */
17949682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			if((uintptr_t)dstp & 2) {
17959682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				Uint16 d = *dstp, s = *srcp;
17969682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				*dstp = BLEND16_50(d, s, mask);
17979682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				dstp++;
17989682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				srcp++;
17999682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				w--;
18009682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			}
18019682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			srcp++;	/* srcp is now 32-bit aligned */
18029682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
18039682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			/* bootstrap pipeline with first halfword */
18049682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			prev_sw = ((Uint32 *)srcp)[-1];
18059682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
18069682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			while(w > 1) {
18079682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				Uint32 sw, dw, s;
18089682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				sw = *(Uint32 *)srcp;
18099682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				dw = *(Uint32 *)dstp;
18109682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall#if SDL_BYTEORDER == SDL_BIG_ENDIAN
18119682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				s = (prev_sw << 16) + (sw >> 16);
18129682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall#else
18139682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				s = (prev_sw >> 16) + (sw << 16);
18149682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall#endif
18159682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				prev_sw = sw;
18169682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				*(Uint32 *)dstp = BLEND2x16_50(dw, s, mask);
18179682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				dstp += 2;
18189682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				srcp += 2;
18199682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				w -= 2;
18209682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			}
18219682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
18229682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			/* final pixel if any */
18239682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			if(w) {
18249682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				Uint16 d = *dstp, s;
18259682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall#if SDL_BYTEORDER == SDL_BIG_ENDIAN
18269682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				s = (Uint16)prev_sw;
18279682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall#else
18289682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				s = (Uint16)(prev_sw >> 16);
18299682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall#endif
18309682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				*dstp = BLEND16_50(d, s, mask);
18319682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				srcp++;
18329682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				dstp++;
18339682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			}
18349682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			srcp += srcskip - 1;
18359682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			dstp += dstskip;
18369682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		} else {
18379682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			/* source and destination are aligned */
18389682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			int w = width;
18399682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
18409682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			/* first odd pixel? */
18419682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			if((uintptr_t)srcp & 2) {
18429682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				Uint16 d = *dstp, s = *srcp;
18439682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				*dstp = BLEND16_50(d, s, mask);
18449682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				srcp++;
18459682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				dstp++;
18469682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				w--;
18479682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			}
18489682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			/* srcp and dstp are now 32-bit aligned */
18499682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
18509682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			while(w > 1) {
18519682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				Uint32 sw = *(Uint32 *)srcp;
18529682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				Uint32 dw = *(Uint32 *)dstp;
18539682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				*(Uint32 *)dstp = BLEND2x16_50(dw, sw, mask);
18549682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				srcp += 2;
18559682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				dstp += 2;
18569682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				w -= 2;
18579682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			}
18589682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
18599682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			/* last odd pixel? */
18609682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			if(w) {
18619682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				Uint16 d = *dstp, s = *srcp;
18629682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				*dstp = BLEND16_50(d, s, mask);
18639682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				srcp++;
18649682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				dstp++;
18659682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			}
18669682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			srcp += srcskip;
18679682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			dstp += dstskip;
18689682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		}
18699682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	}
18709682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall}
18719682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
18729682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall#if GCC_ASMBLIT
18739682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall/* fast RGB565->RGB565 blending with surface alpha
 *
 * Blends the source pixels described by `info` onto the destination using
 * the single per-surface alpha value read from info->src->alpha.
 *
 * alpha == 128 is handed to Blit16to16SurfaceAlpha128() with the mask 0xf7de.
 * For every other value only the top five bits of alpha are used, and each
 * row is blended through DUFFS_LOOP_QUATRO2, which is given three bodies
 * that handle one, two and four pixels respectively; the four-pixel body is
 * the MMX path.
 * NOTE(review): presumably the macro runs the one- and two-pixel bodies for
 * the pixels left over when width is not a multiple of four -- confirm
 * against the macro's definition.
 *
 * MMX registers loaded once before the row loop and reused by every
 * four-pixel step: mm0 = alpha << 3 in each 16-bit word, mm4 = green mask,
 * mm7 = blue mask.
 */
18749682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hallstatic void Blit565to565SurfaceAlphaMMX(SDL_BlitInfo *info)
18759682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall{
18769682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	unsigned alpha = info->src->alpha; /* per-surface alpha; reduced to 5 bits further down */
18779682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	if(alpha == 128) {
18789682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		Blit16to16SurfaceAlpha128(info, 0xf7de);
18799682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	} else {
18809682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		int width = info->d_width;
18819682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		int height = info->d_height;
18829682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		Uint16 *srcp = (Uint16 *)info->s_pixels;
18839682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		int srcskip = info->s_skip >> 1; /* halved because it is added to a Uint16 pointer (s_skip presumably counts bytes) */
18849682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		Uint16 *dstp = (Uint16 *)info->d_pixels;
18859682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		int dstskip = info->d_skip >> 1; /* halved for the same reason as srcskip */
18869682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		Uint32 s, d; /* scratch pixels for the scalar bodies */
18879682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		Uint64 load; /* staging variable for the 64-bit values moved into MMX registers */
18889682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
18899682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		alpha &= ~(1+2+4);		/* clear the low 3 bits so the MMX path uses the same 5-bit alpha as the scalar bodies */
18909682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		load = alpha;
18919682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		alpha >>= 3;		/* downscale alpha to 5 bits */
18929682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
18939682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		movq_m2r(load, mm0); /* alpha(0000000A) -> mm0 */
18949682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		punpcklwd_r2r(mm0, mm0); /* 00000A0A -> mm0 */
18959682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		punpcklwd_r2r(mm0, mm0); /* 0A0A0A0A -> mm0 */
18969682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		/* position alpha to allow for mullo and mulhi on diff channels
18979682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		   to reduce the number of operations */
18989682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		psllq_i2r(3, mm0); /* mm0 = alpha << 3 in every word */
18999682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
19009682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		/* Setup the 565 color channel masks */
19019682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		load = 0x07E007E007E007E0ULL;
19029682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		movq_m2r(load, mm4); /* MASKGREEN -> mm4 */
19039682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		load = 0x001F001F001F001FULL;
19049682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		movq_m2r(load, mm7); /* MASKBLUE -> mm7 */
19059682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		while(height--) {
19069682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			DUFFS_LOOP_QUATRO2(
19079682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			{ /* body 1: blend one pixel with scalar code */
19089682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				s = *srcp++;
19099682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				d = *dstp;
19109682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				/*
19119682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				 * shift out the middle component (green) to
19129682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				 * the high 16 bits, and process all three RGB
19139682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				 * components at the same time.
19149682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				 */
19159682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				s = (s | s << 16) & 0x07e0f81f;
19169682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				d = (d | d << 16) & 0x07e0f81f;
19179682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				d += (s - d) * alpha >> 5; /* d += ((s - d) * alpha) >> 5 on all three fields at once */
19189682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				d &= 0x07e0f81f; /* keep only the three channel fields */
19199682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				*dstp++ = d | d >> 16; /* fold green back into the low 16 bits and store */
19209682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			},{ /* body 2: the same scalar blend applied to two consecutive pixels */
19219682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				s = *srcp++;
19229682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				d = *dstp;
19239682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				/*
19249682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				 * shift out the middle component (green) to
19259682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				 * the high 16 bits, and process all three RGB
19269682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				 * components at the same time.
19279682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				 */
19289682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				s = (s | s << 16) & 0x07e0f81f;
19299682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				d = (d | d << 16) & 0x07e0f81f;
19309682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				d += (s - d) * alpha >> 5;
19319682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				d &= 0x07e0f81f;
19329682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				*dstp++ = d | d >> 16;
19339682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				s = *srcp++;
19349682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				d = *dstp;
19359682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				/*
19369682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				 * shift out the middle component (green) to
19379682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				 * the high 16 bits, and process all three RGB
19389682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				 * components at the same time.
19399682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				 */
19409682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				s = (s | s << 16) & 0x07e0f81f;
19419682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				d = (d | d << 16) & 0x07e0f81f;
19429682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				d += (s - d) * alpha >> 5;
19439682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				d &= 0x07e0f81f;
19449682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				*dstp++ = d | d >> 16;
19459682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			},{ /* body 3: blend four pixels at once with MMX */
19469682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				movq_m2r((*srcp), mm2);/* 4 src pixels -> mm2 */
19479682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				movq_m2r((*dstp), mm3);/* 4 dst pixels -> mm3 */
19489682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
19499682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				/* red -- does not need a mask since the right shift clears
19509682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				   the uninteresting bits */
19519682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				movq_r2r(mm2, mm5); /* src -> mm5 */
19529682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				movq_r2r(mm3, mm6); /* dst -> mm6 */
19539682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 [000r 000r 000r 000r] */
19549682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				psrlw_i2r(11, mm6); /* mm6 >> 11 -> mm6 [000r 000r 000r 000r] */
19559682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
19569682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				/* blend */
19579682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
19589682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
19599682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				/* alpha used is actually 11 bits
19609682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				   11 + 5 = 16 bits, so the sign bits are lost */
19619682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 */
19629682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
19639682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				psllw_i2r(11, mm6); /* mm6 << 11 -> mm6: red back in place, bits above the 5-bit field shifted out */
19649682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
19659682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				movq_r2r(mm6, mm1); /* save new reds in dsts */
19669682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
19679682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				/* green -- process the bits in place */
19689682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				movq_r2r(mm2, mm5); /* src -> mm5 */
19699682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				movq_r2r(mm3, mm6); /* dst -> mm6 */
19709682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				pand_r2r(mm4, mm5); /* src & MASKGREEN -> mm5 */
19719682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				pand_r2r(mm4, mm6); /* dst & MASKGREEN -> mm6 */
19729682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
19739682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				/* blend */
19749682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
19759682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				pmulhw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
19769682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				/* 11 + 11 - 16 = 6 bits, so all the lower uninteresting
19779682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				   bits are gone and the sign bits present */
19789682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				psllw_i2r(5, mm5); /* mm5 << 5 -> mm5 */
19799682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
19809682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
19819682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				por_r2r(mm6, mm1); /* save new greens in dsts */
19829682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
19839682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				/* blue */
19849682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				movq_r2r(mm2, mm5); /* src -> mm5 */
19859682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				movq_r2r(mm3, mm6); /* dst -> mm6 */
19869682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				pand_r2r(mm7, mm5); /* src & MASKBLUE -> mm5[000b 000b 000b 000b] */
19879682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				pand_r2r(mm7, mm6); /* dst & MASKBLUE -> mm6[000b 000b 000b 000b] */
19889682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
19899682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				/* blend */
19909682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
19919682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
19929682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				/* 11 + 5 = 16 bits, so the sign bits are lost and
19939682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				   the interesting bits will need to be MASKed */
19949682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 */
19959682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
19969682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				pand_r2r(mm7, mm6); /* mm6 & MASKBLUE -> mm6[000b 000b 000b 000b] */
19979682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
19989682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				por_r2r(mm6, mm1); /* save new blues in dsts */
19999682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
20009682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				movq_r2m(mm1, *dstp); /* mm1 -> 4 dst pixels */
20019682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
20029682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				srcp += 4;
20039682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				dstp += 4;
20049682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			}, width);
20059682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			srcp += srcskip; /* step over the padding to the start of the next row */
20069682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			dstp += dstskip;
20079682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		}
20089682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		emms(); /* presumably issues EMMS to release the MMX state -- macro is defined outside this view */
20099682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	}
20109682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall}
20119682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
20129682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall/* fast RGB555->RGB555 blending with surface alpha
 *
 * Blends the source pixels described by `info` onto the destination using
 * the single per-surface alpha value read from info->src->alpha.
 *
 * alpha == 128 is handed to Blit16to16SurfaceAlpha128() with the mask 0xfbde.
 * For every other value only the top five bits of alpha are used, and each
 * row is blended through DUFFS_LOOP_QUATRO2, which is given three bodies
 * that handle one, two and four pixels respectively; the four-pixel body is
 * the MMX path.
 * NOTE(review): presumably the macro runs the one- and two-pixel bodies for
 * the pixels left over when width is not a multiple of four -- confirm
 * against the macro's definition.
 *
 * MMX registers loaded once before the row loop and reused by every
 * four-pixel step: mm0 = alpha << 3 in each 16-bit word, mm4 = green mask
 * (temporarily shifted into the red mask and back inside the MMX body),
 * mm7 = blue mask.
 */
20139682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hallstatic void Blit555to555SurfaceAlphaMMX(SDL_BlitInfo *info)
20149682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall{
20159682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	unsigned alpha = info->src->alpha; /* per-surface alpha; reduced to 5 bits further down */
20169682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	if(alpha == 128) {
20179682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		Blit16to16SurfaceAlpha128(info, 0xfbde);
20189682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	} else {
20199682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		int width = info->d_width;
20209682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		int height = info->d_height;
20219682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		Uint16 *srcp = (Uint16 *)info->s_pixels;
20229682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		int srcskip = info->s_skip >> 1; /* halved because it is added to a Uint16 pointer (s_skip presumably counts bytes) */
20239682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		Uint16 *dstp = (Uint16 *)info->d_pixels;
20249682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		int dstskip = info->d_skip >> 1; /* halved for the same reason as srcskip */
20259682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		Uint32 s, d; /* scratch pixels for the scalar bodies */
20269682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		Uint64 load; /* staging variable for the 64-bit values moved into MMX registers */
20279682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
20289682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		alpha &= ~(1+2+4);		/* clear the low 3 bits so the MMX path uses the same 5-bit alpha as the scalar bodies */
20299682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		load = alpha;
20309682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		alpha >>= 3;		/* downscale alpha to 5 bits */
20319682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
20329682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		movq_m2r(load, mm0); /* alpha(0000000A) -> mm0 */
20339682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		punpcklwd_r2r(mm0, mm0); /* 00000A0A -> mm0 */
20349682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		punpcklwd_r2r(mm0, mm0); /* 0A0A0A0A -> mm0 */
20359682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		/* position alpha to allow for mullo and mulhi on diff channels
20369682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		   to reduce the number of operations */
20379682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		psllq_i2r(3, mm0); /* mm0 = alpha << 3 in every word */
20389682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
20399682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		/* Setup the 555 color channel masks */
20409682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		load = 0x03E003E003E003E0ULL;
20419682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		movq_m2r(load, mm4); /* MASKGREEN -> mm4 */
20429682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		load = 0x001F001F001F001FULL;
20439682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		movq_m2r(load, mm7); /* MASKBLUE -> mm7 */
20449682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		while(height--) {
20459682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			DUFFS_LOOP_QUATRO2(
20469682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			{ /* body 1: blend one pixel with scalar code */
20479682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				s = *srcp++;
20489682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				d = *dstp;
20499682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				/*
20509682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				 * shift out the middle component (green) to
20519682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				 * the high 16 bits, and process all three RGB
20529682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				 * components at the same time.
20539682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				 */
20549682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				s = (s | s << 16) & 0x03e07c1f;
20559682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				d = (d | d << 16) & 0x03e07c1f;
20569682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				d += (s - d) * alpha >> 5; /* d += ((s - d) * alpha) >> 5 on all three fields at once */
20579682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				d &= 0x03e07c1f; /* keep only the three channel fields */
20589682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				*dstp++ = d | d >> 16; /* fold green back into the low 16 bits and store */
20599682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			},{ /* body 2: the same scalar blend applied to two consecutive pixels */
20609682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				s = *srcp++;
20619682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				d = *dstp;
20629682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				/*
20639682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				 * shift out the middle component (green) to
20649682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				 * the high 16 bits, and process all three RGB
20659682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				 * components at the same time.
20669682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				 */
20679682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				s = (s | s << 16) & 0x03e07c1f;
20689682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				d = (d | d << 16) & 0x03e07c1f;
20699682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				d += (s - d) * alpha >> 5;
20709682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				d &= 0x03e07c1f;
20719682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				*dstp++ = d | d >> 16;
20729682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			        s = *srcp++;
20739682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				d = *dstp;
20749682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				/*
20759682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				 * shift out the middle component (green) to
20769682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				 * the high 16 bits, and process all three RGB
20779682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				 * components at the same time.
20789682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				 */
20799682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				s = (s | s << 16) & 0x03e07c1f;
20809682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				d = (d | d << 16) & 0x03e07c1f;
20819682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				d += (s - d) * alpha >> 5;
20829682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				d &= 0x03e07c1f;
20839682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				*dstp++ = d | d >> 16;
20849682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			},{ /* body 3: blend four pixels at once with MMX */
20859682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				movq_m2r((*srcp), mm2);/* 4 src pixels -> mm2 */
20869682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				movq_m2r((*dstp), mm3);/* 4 dst pixels -> mm3 */
20879682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
20889682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				/* red -- process the bits in place */
20899682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				psllq_i2r(5, mm4); /* turn MASKGREEN into MASKRED */
20909682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall					/* by reusing the GREEN mask we free up another mmx
20919682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall					   register to accumulate the result */
20929682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
20939682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				movq_r2r(mm2, mm5); /* src -> mm5 */
20949682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				movq_r2r(mm3, mm6); /* dst -> mm6 */
20959682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				pand_r2r(mm4, mm5); /* src & MASKRED -> mm5 */
20969682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				pand_r2r(mm4, mm6); /* dst & MASKRED -> mm6 */
20979682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
20989682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				/* blend */
20999682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
21009682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				pmulhw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
21019682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				/* 11 + 15 - 16 = 10 bits, uninteresting bits will be
21029682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				   cleared by a MASK below */
21039682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				psllw_i2r(5, mm5); /* mm5 << 5 -> mm5 */
21049682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
21059682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				pand_r2r(mm4, mm6); /* mm6 & MASKRED -> mm6 */
21069682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
21079682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				psrlq_i2r(5, mm4); /* turn MASKRED back into MASKGREEN */
21089682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
21099682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				movq_r2r(mm6, mm1); /* save new reds in dsts */
21109682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
21119682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				/* green -- process the bits in place */
21129682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				movq_r2r(mm2, mm5); /* src -> mm5 */
21139682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				movq_r2r(mm3, mm6); /* dst -> mm6 */
21149682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				pand_r2r(mm4, mm5); /* src & MASKGREEN -> mm5 */
21159682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				pand_r2r(mm4, mm6); /* dst & MASKGREEN -> mm6 */
21169682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
21179682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				/* blend */
21189682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
21199682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				pmulhw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
21209682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				/* 11 + 10 - 16 = 5 bits,  so all the lower uninteresting
21219682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				   bits are gone and the sign bits present */
21229682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				psllw_i2r(5, mm5); /* mm5 << 5 -> mm5 */
21239682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
21249682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
21259682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				por_r2r(mm6, mm1); /* save new greens in dsts */
21269682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
21279682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				/* blue */
21289682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				movq_r2r(mm2, mm5); /* src -> mm5 */
21299682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				movq_r2r(mm3, mm6); /* dst -> mm6 */
21309682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				pand_r2r(mm7, mm5); /* src & MASKBLUE -> mm5[000b 000b 000b 000b] */
21319682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				pand_r2r(mm7, mm6); /* dst & MASKBLUE -> mm6[000b 000b 000b 000b] */
21329682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
21339682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				/* blend */
21349682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
21359682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
21369682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				/* 11 + 5 = 16 bits, so the sign bits are lost and
21379682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				   the interesting bits will need to be MASKed */
21389682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 */
21399682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
21409682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				pand_r2r(mm7, mm6); /* mm6 & MASKBLUE -> mm6[000b 000b 000b 000b] */
21419682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
21429682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				por_r2r(mm6, mm1); /* save new blues in dsts */
21439682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
21449682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				movq_r2m(mm1, *dstp);/* mm1 -> 4 dst pixels */
21459682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
21469682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				srcp += 4;
21479682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				dstp += 4;
21489682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			}, width);
21499682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			srcp += srcskip; /* step over the padding to the start of the next row */
21509682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			dstp += dstskip;
21519682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		}
21529682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		emms(); /* presumably issues EMMS to release the MMX state -- macro is defined outside this view */
21539682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	}
21549682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall}
21559682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall/* End GCC_ASMBLIT */
21569682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
21579682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall#elif MSVC_ASMBLIT
21589682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall/* fast RGB565->RGB565 blending with surface alpha */
21599682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hallstatic void Blit565to565SurfaceAlphaMMX(SDL_BlitInfo *info)
21609682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall{
21619682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	unsigned alpha = info->src->alpha;
21629682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	if(alpha == 128) {
21639682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		Blit16to16SurfaceAlpha128(info, 0xf7de);
21649682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	} else {
21659682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		int width = info->d_width;
21669682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		int height = info->d_height;
21679682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		Uint16 *srcp = (Uint16 *)info->s_pixels;
21689682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		int srcskip = info->s_skip >> 1;
21699682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		Uint16 *dstp = (Uint16 *)info->d_pixels;
21709682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		int dstskip = info->d_skip >> 1;
21719682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		Uint32 s, d;
21729682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
21739682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		__m64 src1, dst1, src2, dst2, gmask, bmask, mm_res, mm_alpha;
21749682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
21759682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		alpha &= ~(1+2+4);		/* cut alpha to get the exact same behaviour */
21769682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		mm_alpha = _mm_set_pi32(0, alpha); /* 0000000A -> mm_alpha */
21779682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		alpha >>= 3;		/* downscale alpha to 5 bits */
21789682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
21799682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
21809682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
21819682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		/* position alpha to allow for mullo and mulhi on diff channels
21829682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		   to reduce the number of operations */
21839682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		mm_alpha = _mm_slli_si64(mm_alpha, 3);
21849682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
21859682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		/* Setup the 565 color channel masks */
21869682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		gmask = _mm_set_pi32(0x07E007E0, 0x07E007E0); /* MASKGREEN -> gmask */
21879682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		bmask = _mm_set_pi32(0x001F001F, 0x001F001F); /* MASKBLUE -> bmask */
21889682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
21899682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		while(height--) {
21909682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			DUFFS_LOOP_QUATRO2(
21919682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			{
21929682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				s = *srcp++;
21939682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				d = *dstp;
21949682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				/*
21959682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				 * shift out the middle component (green) to
21969682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				 * the high 16 bits, and process all three RGB
21979682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				 * components at the same time.
21989682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				 */
21999682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				s = (s | s << 16) & 0x07e0f81f;
22009682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				d = (d | d << 16) & 0x07e0f81f;
22019682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				d += (s - d) * alpha >> 5;
22029682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				d &= 0x07e0f81f;
22039682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				*dstp++ = (Uint16)(d | d >> 16);
22049682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			},{
22059682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				s = *srcp++;
22069682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				d = *dstp;
22079682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				/*
22089682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				 * shift out the middle component (green) to
22099682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				 * the high 16 bits, and process all three RGB
22109682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				 * components at the same time.
22119682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				 */
22129682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				s = (s | s << 16) & 0x07e0f81f;
22139682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				d = (d | d << 16) & 0x07e0f81f;
22149682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				d += (s - d) * alpha >> 5;
22159682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				d &= 0x07e0f81f;
22169682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				*dstp++ = (Uint16)(d | d >> 16);
22179682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				s = *srcp++;
22189682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				d = *dstp;
22199682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				/*
22209682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				 * shift out the middle component (green) to
22219682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				 * the high 16 bits, and process all three RGB
22229682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				 * components at the same time.
22239682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				 */
22249682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				s = (s | s << 16) & 0x07e0f81f;
22259682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				d = (d | d << 16) & 0x07e0f81f;
22269682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				d += (s - d) * alpha >> 5;
22279682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				d &= 0x07e0f81f;
22289682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				*dstp++ = (Uint16)(d | d >> 16);
22299682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			},{
22309682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				src1 = *(__m64*)srcp; /* 4 src pixels -> src1 */
22319682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				dst1 = *(__m64*)dstp; /* 4 dst pixels -> dst1 */
22329682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
22339682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				/* red */
22349682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				src2 = src1;
22359682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 [000r 000r 000r 000r] */
22369682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
22379682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				dst2 = dst1;
22389682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				dst2 = _mm_srli_pi16(dst2, 11); /* dst2 >> 11 -> dst2 [000r 000r 000r 000r] */
22399682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
22409682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				/* blend */
22419682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
22429682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
22439682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
22449682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
22459682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				dst2 = _mm_slli_pi16(dst2, 11); /* dst2 << 11 -> dst2 */
22469682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
22479682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				mm_res = dst2; /* RED -> mm_res */
22489682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
22499682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				/* green -- process the bits in place */
22509682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				src2 = src1;
22519682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				src2 = _mm_and_si64(src2, gmask); /* src & MASKGREEN -> src2 */
22529682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
22539682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				dst2 = dst1;
22549682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				dst2 = _mm_and_si64(dst2, gmask); /* dst & MASKGREEN -> dst2 */
22559682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
22569682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				/* blend */
22579682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
22589682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
22599682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
22609682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
22619682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
22629682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN -> mm_res */
22639682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
22649682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				/* blue */
22659682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				src2 = src1;
22669682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				src2 = _mm_and_si64(src2, bmask); /* src & MASKBLUE -> src2[000b 000b 000b 000b] */
22679682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
22689682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				dst2 = dst1;
22699682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				dst2 = _mm_and_si64(dst2, bmask); /* dst & MASKBLUE -> dst2[000b 000b 000b 000b] */
22709682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
22719682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				/* blend */
22729682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
22739682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
22749682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
22759682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
22769682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				dst2 = _mm_and_si64(dst2, bmask); /* dst2 & MASKBLUE -> dst2 */
22779682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
22789682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN | BLUE -> mm_res */
22799682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
22809682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				*(__m64*)dstp = mm_res; /* mm_res -> 4 dst pixels */
22819682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
22829682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				srcp += 4;
22839682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				dstp += 4;
22849682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			}, width);
22859682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			srcp += srcskip;
22869682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			dstp += dstskip;
22879682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		}
22889682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		_mm_empty();
22899682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	}
22909682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall}
22919682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
22929682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall/* fast RGB555->RGB555 blending with surface alpha */
22939682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hallstatic void Blit555to555SurfaceAlphaMMX(SDL_BlitInfo *info)
22949682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall{
22959682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	unsigned alpha = info->src->alpha;
22969682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	if(alpha == 128) {
22979682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		Blit16to16SurfaceAlpha128(info, 0xfbde);
22989682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	} else {
22999682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		int width = info->d_width;
23009682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		int height = info->d_height;
23019682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		Uint16 *srcp = (Uint16 *)info->s_pixels;
23029682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		int srcskip = info->s_skip >> 1;
23039682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		Uint16 *dstp = (Uint16 *)info->d_pixels;
23049682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		int dstskip = info->d_skip >> 1;
23059682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		Uint32 s, d;
23069682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
23079682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		__m64 src1, dst1, src2, dst2, rmask, gmask, bmask, mm_res, mm_alpha;
23089682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
23099682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		alpha &= ~(1+2+4);		/* cut alpha to get the exact same behaviour */
23109682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		mm_alpha = _mm_set_pi32(0, alpha); /* 0000000A -> mm_alpha */
23119682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		alpha >>= 3;		/* downscale alpha to 5 bits */
23129682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
23139682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
23149682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
23159682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		/* position alpha to allow for mullo and mulhi on diff channels
23169682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		   to reduce the number of operations */
23179682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		mm_alpha = _mm_slli_si64(mm_alpha, 3);
23189682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
23199682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		/* Setup the 555 color channel masks */
23209682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		rmask = _mm_set_pi32(0x7C007C00, 0x7C007C00); /* MASKRED -> rmask */
23219682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		gmask = _mm_set_pi32(0x03E003E0, 0x03E003E0); /* MASKGREEN -> gmask */
23229682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		bmask = _mm_set_pi32(0x001F001F, 0x001F001F); /* MASKBLUE -> bmask */
23239682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
23249682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		while(height--) {
23259682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			DUFFS_LOOP_QUATRO2(
23269682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			{
23279682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				s = *srcp++;
23289682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				d = *dstp;
23299682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				/*
23309682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				 * shift out the middle component (green) to
23319682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				 * the high 16 bits, and process all three RGB
23329682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				 * components at the same time.
23339682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				 */
23349682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				s = (s | s << 16) & 0x03e07c1f;
23359682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				d = (d | d << 16) & 0x03e07c1f;
23369682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				d += (s - d) * alpha >> 5;
23379682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				d &= 0x03e07c1f;
23389682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				*dstp++ = (Uint16)(d | d >> 16);
23399682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			},{
23409682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				s = *srcp++;
23419682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				d = *dstp;
23429682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				/*
23439682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				 * shift out the middle component (green) to
23449682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				 * the high 16 bits, and process all three RGB
23459682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				 * components at the same time.
23469682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				 */
23479682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				s = (s | s << 16) & 0x03e07c1f;
23489682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				d = (d | d << 16) & 0x03e07c1f;
23499682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				d += (s - d) * alpha >> 5;
23509682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				d &= 0x03e07c1f;
23519682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				*dstp++ = (Uint16)(d | d >> 16);
23529682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			        s = *srcp++;
23539682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				d = *dstp;
23549682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				/*
23559682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				 * shift out the middle component (green) to
23569682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				 * the high 16 bits, and process all three RGB
23579682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				 * components at the same time.
23589682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				 */
23599682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				s = (s | s << 16) & 0x03e07c1f;
23609682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				d = (d | d << 16) & 0x03e07c1f;
23619682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				d += (s - d) * alpha >> 5;
23629682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				d &= 0x03e07c1f;
23639682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				*dstp++ = (Uint16)(d | d >> 16);
23649682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			},{
23659682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				src1 = *(__m64*)srcp; /* 4 src pixels -> src1 */
23669682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				dst1 = *(__m64*)dstp; /* 4 dst pixels -> dst1 */
23679682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
23689682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				/* red -- process the bits in place */
23699682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				src2 = src1;
23709682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				src2 = _mm_and_si64(src2, rmask); /* src & MASKRED -> src2 */
23719682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
23729682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				dst2 = dst1;
23739682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				dst2 = _mm_and_si64(dst2, rmask); /* dst & MASKRED -> dst2 */
23749682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
23759682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				/* blend */
23769682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
23779682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
23789682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
23799682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
23809682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				dst2 = _mm_and_si64(dst2, rmask); /* dst2 & MASKRED -> dst2 */
23819682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
23829682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				mm_res = dst2; /* RED -> mm_res */
23839682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
23849682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				/* green -- process the bits in place */
23859682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				src2 = src1;
23869682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				src2 = _mm_and_si64(src2, gmask); /* src & MASKGREEN -> src2 */
23879682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
23889682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				dst2 = dst1;
23899682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				dst2 = _mm_and_si64(dst2, gmask); /* dst & MASKGREEN -> dst2 */
23909682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
23919682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				/* blend */
23929682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
23939682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
23949682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
23959682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
23969682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
23979682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN -> mm_res */
23989682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
23999682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				/* blue */
24009682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				src2 = src1; /* src -> src2 */
24019682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				src2 = _mm_and_si64(src2, bmask); /* src & MASKBLUE -> src2[000b 000b 000b 000b] */
24029682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
24039682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				dst2 = dst1; /* dst -> dst2 */
24049682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				dst2 = _mm_and_si64(dst2, bmask); /* dst & MASKBLUE -> dst2[000b 000b 000b 000b] */
24059682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
24069682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				/* blend */
24079682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
24089682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
24099682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
24109682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
24119682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				dst2 = _mm_and_si64(dst2, bmask); /* dst2 & MASKBLUE -> dst2 */
24129682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
24139682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN | BLUE -> mm_res */
24149682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
24159682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				*(__m64*)dstp = mm_res; /* mm_res -> 4 dst pixels */
24169682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
24179682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				srcp += 4;
24189682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				dstp += 4;
24199682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			}, width);
24209682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			srcp += srcskip;
24219682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			dstp += dstskip;
24229682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		}
24239682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		_mm_empty();
24249682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	}
24259682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall}
24269682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall#endif /* GCC_ASMBLIT, MSVC_ASMBLIT */
24279682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
24289682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall/* fast RGB565->RGB565 blending with surface alpha */
24299682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hallstatic void Blit565to565SurfaceAlpha(SDL_BlitInfo *info)
24309682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall{
24319682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	unsigned alpha = info->src->alpha;
24329682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	if(alpha == 128) {
24339682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		Blit16to16SurfaceAlpha128(info, 0xf7de);
24349682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	} else {
24359682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		int width = info->d_width;
24369682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		int height = info->d_height;
24379682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		Uint16 *srcp = (Uint16 *)info->s_pixels;
24389682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		int srcskip = info->s_skip >> 1;
24399682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		Uint16 *dstp = (Uint16 *)info->d_pixels;
24409682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		int dstskip = info->d_skip >> 1;
24419682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		alpha >>= 3;	/* downscale alpha to 5 bits */
24429682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
24439682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		while(height--) {
24449682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			DUFFS_LOOP4({
24459682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				Uint32 s = *srcp++;
24469682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				Uint32 d = *dstp;
24479682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				/*
24489682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				 * shift out the middle component (green) to
24499682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				 * the high 16 bits, and process all three RGB
24509682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				 * components at the same time.
24519682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				 */
24529682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				s = (s | s << 16) & 0x07e0f81f;
24539682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				d = (d | d << 16) & 0x07e0f81f;
24549682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				d += (s - d) * alpha >> 5;
24559682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				d &= 0x07e0f81f;
24569682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				*dstp++ = (Uint16)(d | d >> 16);
24579682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			}, width);
24589682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			srcp += srcskip;
24599682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			dstp += dstskip;
24609682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		}
24619682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	}
24629682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall}
24639682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
24649682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall/* fast RGB555->RGB555 blending with surface alpha */
24659682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hallstatic void Blit555to555SurfaceAlpha(SDL_BlitInfo *info)
24669682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall{
24679682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	unsigned alpha = info->src->alpha; /* downscale alpha to 5 bits */
24689682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	if(alpha == 128) {
24699682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		Blit16to16SurfaceAlpha128(info, 0xfbde);
24709682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	} else {
24719682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		int width = info->d_width;
24729682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		int height = info->d_height;
24739682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		Uint16 *srcp = (Uint16 *)info->s_pixels;
24749682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		int srcskip = info->s_skip >> 1;
24759682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		Uint16 *dstp = (Uint16 *)info->d_pixels;
24769682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		int dstskip = info->d_skip >> 1;
24779682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		alpha >>= 3;		/* downscale alpha to 5 bits */
24789682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
24799682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		while(height--) {
24809682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			DUFFS_LOOP4({
24819682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				Uint32 s = *srcp++;
24829682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				Uint32 d = *dstp;
24839682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				/*
24849682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				 * shift out the middle component (green) to
24859682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				 * the high 16 bits, and process all three RGB
24869682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				 * components at the same time.
24879682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				 */
24889682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				s = (s | s << 16) & 0x03e07c1f;
24899682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				d = (d | d << 16) & 0x03e07c1f;
24909682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				d += (s - d) * alpha >> 5;
24919682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				d &= 0x03e07c1f;
24929682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				*dstp++ = (Uint16)(d | d >> 16);
24939682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			}, width);
24949682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			srcp += srcskip;
24959682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			dstp += dstskip;
24969682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		}
24979682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	}
24989682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall}
24999682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
25009682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall/* fast ARGB8888->RGB565 blending with pixel alpha */
25019682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hallstatic void BlitARGBto565PixelAlpha(SDL_BlitInfo *info)
25029682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall{
25039682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	int width = info->d_width;
25049682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	int height = info->d_height;
25059682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	Uint32 *srcp = (Uint32 *)info->s_pixels;
25069682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	int srcskip = info->s_skip >> 2;
25079682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	Uint16 *dstp = (Uint16 *)info->d_pixels;
25089682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	int dstskip = info->d_skip >> 1;
25099682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
25109682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	while(height--) {
25119682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	    DUFFS_LOOP4({
25129682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		Uint32 s = *srcp;
25139682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		unsigned alpha = s >> 27; /* downscale alpha to 5 bits */
25149682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		/* FIXME: Here we special-case opaque alpha since the
25159682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		   compositioning used (>>8 instead of /255) doesn't handle
25169682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		   it correctly. Also special-case alpha=0 for speed?
25179682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		   Benchmark this! */
25189682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		if(alpha) {
25199682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		  if(alpha == (SDL_ALPHA_OPAQUE >> 3)) {
25209682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		    *dstp = (Uint16)((s >> 8 & 0xf800) + (s >> 5 & 0x7e0) + (s >> 3  & 0x1f));
25219682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		  } else {
25229682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		    Uint32 d = *dstp;
25239682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		    /*
25249682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		     * convert source and destination to G0RAB65565
25259682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		     * and blend all components at the same time
25269682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		     */
25279682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		    s = ((s & 0xfc00) << 11) + (s >> 8 & 0xf800)
25289682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		      + (s >> 3 & 0x1f);
25299682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		    d = (d | d << 16) & 0x07e0f81f;
25309682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		    d += (s - d) * alpha >> 5;
25319682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		    d &= 0x07e0f81f;
25329682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		    *dstp = (Uint16)(d | d >> 16);
25339682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		  }
25349682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		}
25359682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		srcp++;
25369682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		dstp++;
25379682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	    }, width);
25389682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	    srcp += srcskip;
25399682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	    dstp += dstskip;
25409682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	}
25419682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall}
25429682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
25439682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall/* fast ARGB8888->RGB555 blending with pixel alpha */
25449682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hallstatic void BlitARGBto555PixelAlpha(SDL_BlitInfo *info)
25459682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall{
25469682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	int width = info->d_width;
25479682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	int height = info->d_height;
25489682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	Uint32 *srcp = (Uint32 *)info->s_pixels;
25499682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	int srcskip = info->s_skip >> 2;
25509682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	Uint16 *dstp = (Uint16 *)info->d_pixels;
25519682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	int dstskip = info->d_skip >> 1;
25529682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
25539682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	while(height--) {
25549682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	    DUFFS_LOOP4({
25559682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		unsigned alpha;
25569682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		Uint32 s = *srcp;
25579682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		alpha = s >> 27; /* downscale alpha to 5 bits */
25589682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		/* FIXME: Here we special-case opaque alpha since the
25599682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		   compositioning used (>>8 instead of /255) doesn't handle
25609682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		   it correctly. Also special-case alpha=0 for speed?
25619682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		   Benchmark this! */
25629682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		if(alpha) {
25639682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		  if(alpha == (SDL_ALPHA_OPAQUE >> 3)) {
25649682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		    *dstp = (Uint16)((s >> 9 & 0x7c00) + (s >> 6 & 0x3e0) + (s >> 3  & 0x1f));
25659682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		  } else {
25669682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		    Uint32 d = *dstp;
25679682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		    /*
25689682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		     * convert source and destination to G0RAB65565
25699682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		     * and blend all components at the same time
25709682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		     */
25719682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		    s = ((s & 0xf800) << 10) + (s >> 9 & 0x7c00)
25729682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		      + (s >> 3 & 0x1f);
25739682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		    d = (d | d << 16) & 0x03e07c1f;
25749682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		    d += (s - d) * alpha >> 5;
25759682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		    d &= 0x03e07c1f;
25769682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		    *dstp = (Uint16)(d | d >> 16);
25779682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		  }
25789682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		}
25799682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		srcp++;
25809682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		dstp++;
25819682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	    }, width);
25829682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	    srcp += srcskip;
25839682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	    dstp += dstskip;
25849682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	}
25859682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall}
25869682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
25879682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall/* General (slow) N->N blending with per-surface alpha */
25889682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hallstatic void BlitNtoNSurfaceAlpha(SDL_BlitInfo *info)
25899682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall{
25909682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	int width = info->d_width;
25919682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	int height = info->d_height;
25929682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	Uint8 *src = info->s_pixels;
25939682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	int srcskip = info->s_skip;
25949682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	Uint8 *dst = info->d_pixels;
25959682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	int dstskip = info->d_skip;
25969682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	SDL_PixelFormat *srcfmt = info->src;
25979682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	SDL_PixelFormat *dstfmt = info->dst;
25989682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	int srcbpp = srcfmt->BytesPerPixel;
25999682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	int dstbpp = dstfmt->BytesPerPixel;
26009682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	unsigned sA = srcfmt->alpha;
26019682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
26029682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
26039682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	if(sA) {
26049682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	  while ( height-- ) {
26059682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	    DUFFS_LOOP4(
26069682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	    {
26079682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		Uint32 Pixel;
26089682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		unsigned sR;
26099682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		unsigned sG;
26109682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		unsigned sB;
26119682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		unsigned dR;
26129682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		unsigned dG;
26139682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		unsigned dB;
26149682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
26159682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		DISEMBLE_RGB(dst, dstbpp, dstfmt, Pixel, dR, dG, dB);
26169682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
26179682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
26189682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		src += srcbpp;
26199682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		dst += dstbpp;
26209682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	    },
26219682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	    width);
26229682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	    src += srcskip;
26239682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	    dst += dstskip;
26249682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	  }
26259682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	}
26269682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall}
26279682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
26289682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall/* General (slow) colorkeyed N->N blending with per-surface alpha */
26299682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hallstatic void BlitNtoNSurfaceAlphaKey(SDL_BlitInfo *info)
26309682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall{
26319682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	int width = info->d_width;
26329682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	int height = info->d_height;
26339682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	Uint8 *src = info->s_pixels;
26349682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	int srcskip = info->s_skip;
26359682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	Uint8 *dst = info->d_pixels;
26369682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	int dstskip = info->d_skip;
26379682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	SDL_PixelFormat *srcfmt = info->src;
26389682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	SDL_PixelFormat *dstfmt = info->dst;
26399682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	Uint32 ckey = srcfmt->colorkey;
26409682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	int srcbpp = srcfmt->BytesPerPixel;
26419682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	int dstbpp = dstfmt->BytesPerPixel;
26429682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	unsigned sA = srcfmt->alpha;
26439682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
26449682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
26459682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	while ( height-- ) {
26469682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	    DUFFS_LOOP4(
26479682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	    {
26489682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		Uint32 Pixel;
26499682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		unsigned sR;
26509682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		unsigned sG;
26519682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		unsigned sB;
26529682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		unsigned dR;
26539682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		unsigned dG;
26549682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		unsigned dB;
26559682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		RETRIEVE_RGB_PIXEL(src, srcbpp, Pixel);
26569682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		if(sA && Pixel != ckey) {
26579682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		    RGB_FROM_PIXEL(Pixel, srcfmt, sR, sG, sB);
26589682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		    DISEMBLE_RGB(dst, dstbpp, dstfmt, Pixel, dR, dG, dB);
26599682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		    ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
26609682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		    ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
26619682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		}
26629682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		src += srcbpp;
26639682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		dst += dstbpp;
26649682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	    },
26659682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	    width);
26669682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	    src += srcskip;
26679682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	    dst += dstskip;
26689682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	}
26699682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall}
26709682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
26719682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall/* General (slow) N->N blending with pixel alpha */
26729682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hallstatic void BlitNtoNPixelAlpha(SDL_BlitInfo *info)
26739682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall{
26749682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	int width = info->d_width;
26759682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	int height = info->d_height;
26769682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	Uint8 *src = info->s_pixels;
26779682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	int srcskip = info->s_skip;
26789682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	Uint8 *dst = info->d_pixels;
26799682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	int dstskip = info->d_skip;
26809682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	SDL_PixelFormat *srcfmt = info->src;
26819682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	SDL_PixelFormat *dstfmt = info->dst;
26829682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
26839682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	int  srcbpp;
26849682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	int  dstbpp;
26859682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
26869682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	/* Set up some basic variables */
26879682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	srcbpp = srcfmt->BytesPerPixel;
26889682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	dstbpp = dstfmt->BytesPerPixel;
26899682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
26909682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	/* FIXME: for 8bpp source alpha, this doesn't get opaque values
26919682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	   quite right. for <8bpp source alpha, it gets them very wrong
26929682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	   (check all macros!)
26939682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	   It is unclear whether there is a good general solution that doesn't
26949682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	   need a branch (or a divide). */
26959682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	while ( height-- ) {
26969682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	    DUFFS_LOOP4(
26979682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	    {
26989682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		Uint32 Pixel;
26999682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		unsigned sR;
27009682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		unsigned sG;
27019682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		unsigned sB;
27029682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		unsigned dR;
27039682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		unsigned dG;
27049682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		unsigned dB;
27059682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		unsigned sA;
27069682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		unsigned dA;
27079682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		DISEMBLE_RGBA(src, srcbpp, srcfmt, Pixel, sR, sG, sB, sA);
27089682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		if(sA) {
27099682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		  DISEMBLE_RGBA(dst, dstbpp, dstfmt, Pixel, dR, dG, dB, dA);
27109682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		  ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
27119682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		  ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
27129682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		}
27139682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		src += srcbpp;
27149682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		dst += dstbpp;
27159682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	    },
27169682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	    width);
27179682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	    src += srcskip;
27189682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	    dst += dstskip;
27199682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	}
27209682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall}
27219682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
27229682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
27239682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse HallSDL_loblit SDL_CalculateAlphaBlit(SDL_Surface *surface, int blit_index)
27249682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall{
27259682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    SDL_PixelFormat *sf = surface->format;
27269682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    SDL_PixelFormat *df = surface->map->dst->format;
27279682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
27289682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    if(sf->Amask == 0) {
27299682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	if((surface->flags & SDL_SRCCOLORKEY) == SDL_SRCCOLORKEY) {
27309682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	    if(df->BytesPerPixel == 1)
27319682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		return BlitNto1SurfaceAlphaKey;
27329682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	    else
27339682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall#if SDL_ALTIVEC_BLITTERS
27349682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	if (sf->BytesPerPixel == 4 && df->BytesPerPixel == 4 &&
27359682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	    !(surface->map->dst->flags & SDL_HWSURFACE) && SDL_HasAltiVec())
27369682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall            return Blit32to32SurfaceAlphaKeyAltivec;
27379682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall        else
27389682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall#endif
27399682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall            return BlitNtoNSurfaceAlphaKey;
27409682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	} else {
27419682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	    /* Per-surface alpha blits */
27429682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	    switch(df->BytesPerPixel) {
27439682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	    case 1:
27449682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		return BlitNto1SurfaceAlpha;
27459682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
27469682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	    case 2:
27479682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		if(surface->map->identity) {
27489682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		    if(df->Gmask == 0x7e0)
27499682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		    {
27509682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall#if MMX_ASMBLIT
27519682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		if(SDL_HasMMX())
27529682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			return Blit565to565SurfaceAlphaMMX;
27539682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		else
27549682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall#endif
27559682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			return Blit565to565SurfaceAlpha;
27569682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		    }
27579682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		    else if(df->Gmask == 0x3e0)
27589682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		    {
27599682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall#if MMX_ASMBLIT
27609682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		if(SDL_HasMMX())
27619682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			return Blit555to555SurfaceAlphaMMX;
27629682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		else
27639682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall#endif
27649682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			return Blit555to555SurfaceAlpha;
27659682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		    }
27669682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		}
27679682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		return BlitNtoNSurfaceAlpha;
27689682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
27699682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	    case 4:
27709682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		if(sf->Rmask == df->Rmask
27719682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		   && sf->Gmask == df->Gmask
27729682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		   && sf->Bmask == df->Bmask
27739682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		   && sf->BytesPerPixel == 4)
27749682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		{
27759682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall#if MMX_ASMBLIT
27769682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			if(sf->Rshift % 8 == 0
27779682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			   && sf->Gshift % 8 == 0
27789682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			   && sf->Bshift % 8 == 0
27799682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			   && SDL_HasMMX())
27809682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			    return BlitRGBtoRGBSurfaceAlphaMMX;
27819682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall#endif
27829682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			if((sf->Rmask | sf->Gmask | sf->Bmask) == 0xffffff)
27839682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			{
27849682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall#if SDL_ALTIVEC_BLITTERS
27859682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				if(!(surface->map->dst->flags & SDL_HWSURFACE)
27869682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall					&& SDL_HasAltiVec())
27879682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall					return BlitRGBtoRGBSurfaceAlphaAltivec;
27889682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall#endif
27899682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				return BlitRGBtoRGBSurfaceAlpha;
27909682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			}
27919682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		}
27929682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall#if SDL_ALTIVEC_BLITTERS
27939682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		if((sf->BytesPerPixel == 4) &&
27949682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		   !(surface->map->dst->flags & SDL_HWSURFACE) && SDL_HasAltiVec())
27959682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			return Blit32to32SurfaceAlphaAltivec;
27969682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		else
27979682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall#endif
27989682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			return BlitNtoNSurfaceAlpha;
27999682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
28009682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	    case 3:
28019682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	    default:
28029682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		return BlitNtoNSurfaceAlpha;
28039682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	    }
28049682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	}
28059682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    } else {
28069682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	/* Per-pixel alpha blits */
28079682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	switch(df->BytesPerPixel) {
28089682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	case 1:
28099682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	    return BlitNto1PixelAlpha;
28109682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
28119682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	case 2:
28129682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall#if SDL_ALTIVEC_BLITTERS
28139682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	if(sf->BytesPerPixel == 4 && !(surface->map->dst->flags & SDL_HWSURFACE) &&
28149682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall           df->Gmask == 0x7e0 &&
28159682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	   df->Bmask == 0x1f && SDL_HasAltiVec())
28169682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall            return Blit32to565PixelAlphaAltivec;
28179682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall        else
28189682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall#endif
28199682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	    if(sf->BytesPerPixel == 4 && sf->Amask == 0xff000000
28209682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	       && sf->Gmask == 0xff00
28219682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	       && ((sf->Rmask == 0xff && df->Rmask == 0x1f)
28229682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		   || (sf->Bmask == 0xff && df->Bmask == 0x1f))) {
28239682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		if(df->Gmask == 0x7e0)
28249682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		    return BlitARGBto565PixelAlpha;
28259682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		else if(df->Gmask == 0x3e0)
28269682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		    return BlitARGBto555PixelAlpha;
28279682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	    }
28289682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	    return BlitNtoNPixelAlpha;
28299682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
28309682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	case 4:
28319682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	    if(sf->Rmask == df->Rmask
28329682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	       && sf->Gmask == df->Gmask
28339682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	       && sf->Bmask == df->Bmask
28349682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	       && sf->BytesPerPixel == 4)
28359682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	    {
28369682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall#if MMX_ASMBLIT
28379682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		if(sf->Rshift % 8 == 0
28389682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		   && sf->Gshift % 8 == 0
28399682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		   && sf->Bshift % 8 == 0
28409682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		   && sf->Ashift % 8 == 0
28419682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		   && sf->Aloss == 0)
28429682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		{
28439682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			if(SDL_Has3DNow())
28449682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				return BlitRGBtoRGBPixelAlphaMMX3DNOW;
28459682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			if(SDL_HasMMX())
28469682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				return BlitRGBtoRGBPixelAlphaMMX;
28479682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		}
28489682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall#endif
28499682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		if(sf->Amask == 0xff000000)
28509682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		{
28519682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall#if SDL_ALTIVEC_BLITTERS
28529682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			if(!(surface->map->dst->flags & SDL_HWSURFACE)
28539682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				&& SDL_HasAltiVec())
28549682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall				return BlitRGBtoRGBPixelAlphaAltivec;
28559682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall#endif
28569682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall			return BlitRGBtoRGBPixelAlpha;
28579682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		}
28589682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	    }
28599682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall#if SDL_ALTIVEC_BLITTERS
28609682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	    if (sf->Amask && sf->BytesPerPixel == 4 &&
28619682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	        !(surface->map->dst->flags & SDL_HWSURFACE) && SDL_HasAltiVec())
28629682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		return Blit32to32PixelAlphaAltivec;
28639682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	    else
28649682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall#endif
28659682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall		return BlitNtoNPixelAlpha;
28669682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
28679682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	case 3:
28689682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	default:
28699682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	    return BlitNtoNPixelAlpha;
28709682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall	}
28719682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall    }
28729682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall}
28739682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
2874