1/*
2    SDL - Simple DirectMedia Layer
3    Copyright (C) 1997-2012 Sam Lantinga
4
5    This library is free software; you can redistribute it and/or
6    modify it under the terms of the GNU Lesser General Public
7    License as published by the Free Software Foundation; either
8    version 2.1 of the License, or (at your option) any later version.
9
10    This library is distributed in the hope that it will be useful,
11    but WITHOUT ANY WARRANTY; without even the implied warranty of
12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13    Lesser General Public License for more details.
14
15    You should have received a copy of the GNU Lesser General Public
16    License along with this library; if not, write to the Free Software
17    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
18
19    Sam Lantinga
20    slouken@libsdl.org
21*/
22#include "SDL_config.h"
23
24#include "SDL_video.h"
25#include "SDL_blit.h"
26
/*
  In Visual C, VC6 has mmintrin.h in the "Processor Pack" add-on.
   Checking if _mm_free is #defined in malloc.h is the only way to
   determine if the Processor Pack is installed, as far as I can tell.
*/
32
/*
 * Pick the hand-optimized blitter flavour, if any:
 *   - GCC on x86 / x86-64: MMX_ASMBLIT and GCC_ASMBLIT are both forced to 0.
 *   - MSVC on 32-bit x86: MMX_ASMBLIT and MSVC_ASMBLIT become 1 when
 *     HAVE_MMINTRIN_H ends up set.
 * This section defines nothing when SDL_ASSEMBLY_ROUTINES is off or when
 * neither compiler test matches.
 */
#if SDL_ASSEMBLY_ROUTINES

#if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))

/* forced MMX to 0...it breaks on most compilers now.  --ryan. */
#define MMX_ASMBLIT 0
#define GCC_ASMBLIT 0

#elif defined(_MSC_VER) && defined(_M_IX86)

#if (_MSC_VER <= 1200)
/* VC6 or older: look for _mm_free in malloc.h to detect mmintrin.h */
#include <malloc.h>
#if defined(_mm_free)
#define HAVE_MMINTRIN_H 1
#endif
#else
/* Visual Studio > VC6 always has mmintrin.h */
#define HAVE_MMINTRIN_H 1
#endif

#if HAVE_MMINTRIN_H
#define MMX_ASMBLIT 1
#define MSVC_ASMBLIT 1
#endif

#endif /* compiler selection */

#endif /* SDL_ASSEMBLY_ROUTINES */
53
54/* Function to check the CPU flags */
55#include "SDL_cpuinfo.h"
56#if GCC_ASMBLIT
57#include "mmx.h"
58#elif MSVC_ASMBLIT
59#include <mmintrin.h>
60#include <mm3dnow.h>
61#endif
62
63/* Functions to perform alpha blended blitting */
64
65/* N->1 blending with per-surface alpha */
66static void BlitNto1SurfaceAlpha(SDL_BlitInfo *info)
67{
68	int width = info->d_width;
69	int height = info->d_height;
70	Uint8 *src = info->s_pixels;
71	int srcskip = info->s_skip;
72	Uint8 *dst = info->d_pixels;
73	int dstskip = info->d_skip;
74	Uint8 *palmap = info->table;
75	SDL_PixelFormat *srcfmt = info->src;
76	SDL_PixelFormat *dstfmt = info->dst;
77	int srcbpp = srcfmt->BytesPerPixel;
78
79	const unsigned A = srcfmt->alpha;
80
81	while ( height-- ) {
82	    DUFFS_LOOP4(
83	    {
84		Uint32 Pixel;
85		unsigned sR;
86		unsigned sG;
87		unsigned sB;
88		unsigned dR;
89		unsigned dG;
90		unsigned dB;
91		DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
92		dR = dstfmt->palette->colors[*dst].r;
93		dG = dstfmt->palette->colors[*dst].g;
94		dB = dstfmt->palette->colors[*dst].b;
95		ALPHA_BLEND(sR, sG, sB, A, dR, dG, dB);
96		dR &= 0xff;
97		dG &= 0xff;
98		dB &= 0xff;
99		/* Pack RGB into 8bit pixel */
100		if ( palmap == NULL ) {
101		    *dst =((dR>>5)<<(3+2))|
102			  ((dG>>5)<<(2))|
103			  ((dB>>6)<<(0));
104		} else {
105		    *dst = palmap[((dR>>5)<<(3+2))|
106				  ((dG>>5)<<(2))  |
107				  ((dB>>6)<<(0))];
108		}
109		dst++;
110		src += srcbpp;
111	    },
112	    width);
113	    src += srcskip;
114	    dst += dstskip;
115	}
116}
117
118/* N->1 blending with pixel alpha */
119static void BlitNto1PixelAlpha(SDL_BlitInfo *info)
120{
121	int width = info->d_width;
122	int height = info->d_height;
123	Uint8 *src = info->s_pixels;
124	int srcskip = info->s_skip;
125	Uint8 *dst = info->d_pixels;
126	int dstskip = info->d_skip;
127	Uint8 *palmap = info->table;
128	SDL_PixelFormat *srcfmt = info->src;
129	SDL_PixelFormat *dstfmt = info->dst;
130	int srcbpp = srcfmt->BytesPerPixel;
131
132	/* FIXME: fix alpha bit field expansion here too? */
133	while ( height-- ) {
134	    DUFFS_LOOP4(
135	    {
136		Uint32 Pixel;
137		unsigned sR;
138		unsigned sG;
139		unsigned sB;
140		unsigned sA;
141		unsigned dR;
142		unsigned dG;
143		unsigned dB;
144		DISEMBLE_RGBA(src,srcbpp,srcfmt,Pixel,sR,sG,sB,sA);
145		dR = dstfmt->palette->colors[*dst].r;
146		dG = dstfmt->palette->colors[*dst].g;
147		dB = dstfmt->palette->colors[*dst].b;
148		ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
149		dR &= 0xff;
150		dG &= 0xff;
151		dB &= 0xff;
152		/* Pack RGB into 8bit pixel */
153		if ( palmap == NULL ) {
154		    *dst =((dR>>5)<<(3+2))|
155			  ((dG>>5)<<(2))|
156			  ((dB>>6)<<(0));
157		} else {
158		    *dst = palmap[((dR>>5)<<(3+2))|
159				  ((dG>>5)<<(2))  |
160				  ((dB>>6)<<(0))  ];
161		}
162		dst++;
163		src += srcbpp;
164	    },
165	    width);
166	    src += srcskip;
167	    dst += dstskip;
168	}
169}
170
171/* colorkeyed N->1 blending with per-surface alpha */
172static void BlitNto1SurfaceAlphaKey(SDL_BlitInfo *info)
173{
174	int width = info->d_width;
175	int height = info->d_height;
176	Uint8 *src = info->s_pixels;
177	int srcskip = info->s_skip;
178	Uint8 *dst = info->d_pixels;
179	int dstskip = info->d_skip;
180	Uint8 *palmap = info->table;
181	SDL_PixelFormat *srcfmt = info->src;
182	SDL_PixelFormat *dstfmt = info->dst;
183	int srcbpp = srcfmt->BytesPerPixel;
184	Uint32 ckey = srcfmt->colorkey;
185
186	const int A = srcfmt->alpha;
187
188	while ( height-- ) {
189	    DUFFS_LOOP(
190	    {
191		Uint32 Pixel;
192		unsigned sR;
193		unsigned sG;
194		unsigned sB;
195		unsigned dR;
196		unsigned dG;
197		unsigned dB;
198		DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
199		if ( Pixel != ckey ) {
200		    dR = dstfmt->palette->colors[*dst].r;
201		    dG = dstfmt->palette->colors[*dst].g;
202		    dB = dstfmt->palette->colors[*dst].b;
203		    ALPHA_BLEND(sR, sG, sB, A, dR, dG, dB);
204		    dR &= 0xff;
205		    dG &= 0xff;
206		    dB &= 0xff;
207		    /* Pack RGB into 8bit pixel */
208		    if ( palmap == NULL ) {
209			*dst =((dR>>5)<<(3+2))|
210			      ((dG>>5)<<(2)) |
211			      ((dB>>6)<<(0));
212		    } else {
213			*dst = palmap[((dR>>5)<<(3+2))|
214				      ((dG>>5)<<(2))  |
215				      ((dB>>6)<<(0))  ];
216		    }
217		}
218		dst++;
219		src += srcbpp;
220	    },
221	    width);
222	    src += srcskip;
223	    dst += dstskip;
224	}
225}
226
227#if GCC_ASMBLIT
228/* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
229static void BlitRGBtoRGBSurfaceAlpha128MMX(SDL_BlitInfo *info)
230{
231	int width = info->d_width;
232	int height = info->d_height;
233	Uint32 *srcp = (Uint32 *)info->s_pixels;
234	int srcskip = info->s_skip >> 2;
235	Uint32 *dstp = (Uint32 *)info->d_pixels;
236	int dstskip = info->d_skip >> 2;
237	Uint32 dalpha = info->dst->Amask;
238	Uint64 load;
239
240	load = 0x00fefefe00fefefeULL;/* alpha128 mask */
241	movq_m2r(load, mm4); /* alpha128 mask -> mm4 */
242	load = 0x0001010100010101ULL;/* !alpha128 mask */
243	movq_m2r(load, mm3); /* !alpha128 mask -> mm3 */
244	movd_m2r(dalpha, mm7); /* dst alpha mask */
245	punpckldq_r2r(mm7, mm7); /* dst alpha mask | dst alpha mask -> mm7 */
246	while(height--) {
247		DUFFS_LOOP_DOUBLE2(
248		{
249			Uint32 s = *srcp++;
250			Uint32 d = *dstp;
251			*dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
252				   + (s & d & 0x00010101)) | dalpha;
253		},{
254			movq_m2r((*dstp), mm2);/* 2 x dst -> mm2(ARGBARGB) */
255			movq_r2r(mm2, mm6); /* 2 x dst -> mm6(ARGBARGB) */
256
257			movq_m2r((*srcp), mm1);/* 2 x src -> mm1(ARGBARGB) */
258			movq_r2r(mm1, mm5); /* 2 x src -> mm5(ARGBARGB) */
259
260			pand_r2r(mm4, mm6); /* dst & mask -> mm6 */
261			pand_r2r(mm4, mm5); /* src & mask -> mm5 */
262			paddd_r2r(mm6, mm5); /* mm6 + mm5 -> mm5 */
263			pand_r2r(mm1, mm2); /* src & dst -> mm2 */
264			psrld_i2r(1, mm5); /* mm5 >> 1 -> mm5 */
265			pand_r2r(mm3, mm2); /* mm2 & !mask -> mm2 */
266			paddd_r2r(mm5, mm2); /* mm5 + mm2 -> mm2 */
267
268			por_r2r(mm7, mm2); /* mm7(full alpha) | mm2 -> mm2 */
269			movq_r2m(mm2, (*dstp));/* mm2 -> 2 x dst pixels */
270			dstp += 2;
271			srcp += 2;
272		}, width);
273		srcp += srcskip;
274		dstp += dstskip;
275	}
276	emms();
277}
278
279/* fast RGB888->(A)RGB888 blending with surface alpha */
280static void BlitRGBtoRGBSurfaceAlphaMMX(SDL_BlitInfo *info)
281{
282	SDL_PixelFormat* df = info->dst;
283	unsigned alpha = info->src->alpha;
284
285	if (alpha == 128 && (df->Rmask | df->Gmask | df->Bmask) == 0x00FFFFFF) {
286			/* only call a128 version when R,G,B occupy lower bits */
287		BlitRGBtoRGBSurfaceAlpha128MMX(info);
288	} else {
289		int width = info->d_width;
290		int height = info->d_height;
291		Uint32 *srcp = (Uint32 *)info->s_pixels;
292		int srcskip = info->s_skip >> 2;
293		Uint32 *dstp = (Uint32 *)info->d_pixels;
294		int dstskip = info->d_skip >> 2;
295
296		pxor_r2r(mm5, mm5); /* 0 -> mm5 */
297		/* form the alpha mult */
298		movd_m2r(alpha, mm4); /* 0000000A -> mm4 */
299		punpcklwd_r2r(mm4, mm4); /* 00000A0A -> mm4 */
300		punpckldq_r2r(mm4, mm4); /* 0A0A0A0A -> mm4 */
301		alpha = (0xff << df->Rshift) | (0xff << df->Gshift) | (0xff << df->Bshift);
302		movd_m2r(alpha, mm0); /* 00000FFF -> mm0 */
303		punpcklbw_r2r(mm0, mm0); /* 00FFFFFF -> mm0 */
304		pand_r2r(mm0, mm4); /* 0A0A0A0A -> mm4, minus 1 chan */
305			/* at this point mm4 can be 000A0A0A or 0A0A0A00 or another combo */
306		movd_m2r(df->Amask, mm7); /* dst alpha mask */
307		punpckldq_r2r(mm7, mm7); /* dst alpha mask | dst alpha mask -> mm7 */
308
309		while(height--) {
310			DUFFS_LOOP_DOUBLE2({
311				/* One Pixel Blend */
312				movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/
313				movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/
314				punpcklbw_r2r(mm5, mm1); /* 0A0R0G0B -> mm1(src) */
315				punpcklbw_r2r(mm5, mm2); /* 0A0R0G0B -> mm2(dst) */
316
317				psubw_r2r(mm2, mm1);/* src - dst -> mm1 */
318				pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */
319				psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1 */
320				paddb_r2r(mm1, mm2); /* mm1 + mm2(dst) -> mm2 */
321
322				packuswb_r2r(mm5, mm2);  /* ARGBARGB -> mm2 */
323				por_r2r(mm7, mm2); /* mm7(full alpha) | mm2 -> mm2 */
324				movd_r2m(mm2, *dstp);/* mm2 -> pixel */
325				++srcp;
326				++dstp;
327			},{
328				/* Two Pixels Blend */
329				movq_m2r((*srcp), mm0);/* 2 x src -> mm0(ARGBARGB)*/
330				movq_m2r((*dstp), mm2);/* 2 x dst -> mm2(ARGBARGB) */
331				movq_r2r(mm0, mm1); /* 2 x src -> mm1(ARGBARGB) */
332				movq_r2r(mm2, mm6); /* 2 x dst -> mm6(ARGBARGB) */
333
334				punpcklbw_r2r(mm5, mm0); /* low - 0A0R0G0B -> mm0(src1) */
335				punpckhbw_r2r(mm5, mm1); /* high - 0A0R0G0B -> mm1(src2) */
336				punpcklbw_r2r(mm5, mm2); /* low - 0A0R0G0B -> mm2(dst1) */
337				punpckhbw_r2r(mm5, mm6); /* high - 0A0R0G0B -> mm6(dst2) */
338
339				psubw_r2r(mm2, mm0);/* src1 - dst1 -> mm0 */
340				pmullw_r2r(mm4, mm0); /* mm0 * alpha -> mm0 */
341				psrlw_i2r(8, mm0); /* mm0 >> 8 -> mm1 */
342				paddb_r2r(mm0, mm2); /* mm0 + mm2(dst1) -> mm2 */
343
344				psubw_r2r(mm6, mm1);/* src2 - dst2 -> mm1 */
345				pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */
346				psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1 */
347				paddb_r2r(mm1, mm6); /* mm1 + mm6(dst2) -> mm6 */
348
349				packuswb_r2r(mm6, mm2);  /* ARGBARGB -> mm2 */
350				por_r2r(mm7, mm2); /* mm7(dst alpha) | mm2 -> mm2 */
351
352				movq_r2m(mm2, *dstp);/* mm2 -> 2 x pixel */
353
354  				srcp += 2;
355  				dstp += 2;
356  			}, width);
357			srcp += srcskip;
358			dstp += dstskip;
359		}
360		emms();
361	}
362}
363
364/* fast ARGB888->(A)RGB888 blending with pixel alpha */
365static void BlitRGBtoRGBPixelAlphaMMX(SDL_BlitInfo *info)
366{
367	int width = info->d_width;
368	int height = info->d_height;
369	Uint32 *srcp = (Uint32 *)info->s_pixels;
370	int srcskip = info->s_skip >> 2;
371	Uint32 *dstp = (Uint32 *)info->d_pixels;
372	int dstskip = info->d_skip >> 2;
373	SDL_PixelFormat* sf = info->src;
374	Uint32 amask = sf->Amask;
375
376	pxor_r2r(mm6, mm6); /* 0 -> mm6 */
377	/* form multiplication mask */
378	movd_m2r(sf->Amask, mm7); /* 0000F000 -> mm7 */
379	punpcklbw_r2r(mm7, mm7); /* FF000000 -> mm7 */
380	pcmpeqb_r2r(mm0, mm0); /* FFFFFFFF -> mm0 */
381	movq_r2r(mm0, mm3); /* FFFFFFFF -> mm3 (for later) */
382	pxor_r2r(mm0, mm7); /* 00FFFFFF -> mm7 (mult mask) */
383	/* form channel masks */
384	movq_r2r(mm7, mm0); /* 00FFFFFF -> mm0 */
385	packsswb_r2r(mm6, mm0); /* 00000FFF -> mm0 (channel mask) */
386	packsswb_r2r(mm6, mm3); /* 0000FFFF -> mm3 */
387	pxor_r2r(mm0, mm3); /* 0000F000 -> mm3 (~channel mask) */
388	/* get alpha channel shift */
389	__asm__ __volatile__ (
390		"movd %0, %%mm5"
391		: : "rm" ((Uint32) sf->Ashift) ); /* Ashift -> mm5 */
392
393	while(height--) {
394	    DUFFS_LOOP4({
395		Uint32 alpha = *srcp & amask;
396		/* FIXME: Here we special-case opaque alpha since the
397			compositioning used (>>8 instead of /255) doesn't handle
398			it correctly. Also special-case alpha=0 for speed?
399			Benchmark this! */
400		if(alpha == 0) {
401			/* do nothing */
402		} else if(alpha == amask) {
403			/* opaque alpha -- copy RGB, keep dst alpha */
404			/* using MMX here to free up regular registers for other things */
405			movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/
406			movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/
407			pand_r2r(mm0, mm1); /* src & chanmask -> mm1 */
408			pand_r2r(mm3, mm2); /* dst & ~chanmask -> mm2 */
409			por_r2r(mm1, mm2); /* src | dst -> mm2 */
410			movd_r2m(mm2, (*dstp)); /* mm2 -> dst */
411		} else {
412			movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/
413			punpcklbw_r2r(mm6, mm1); /* 0A0R0G0B -> mm1 */
414
415			movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/
416			punpcklbw_r2r(mm6, mm2); /* 0A0R0G0B -> mm2 */
417
418			__asm__ __volatile__ (
419				"movd %0, %%mm4"
420				: : "r" (alpha) ); /* 0000A000 -> mm4 */
421			psrld_r2r(mm5, mm4); /* mm4 >> mm5 -> mm4 (0000000A) */
422			punpcklwd_r2r(mm4, mm4); /* 00000A0A -> mm4 */
423			punpcklwd_r2r(mm4, mm4); /* 0A0A0A0A -> mm4 */
424			pand_r2r(mm7, mm4); /* 000A0A0A -> mm4, preserve dst alpha on add */
425
426			/* blend */
427			psubw_r2r(mm2, mm1);/* src - dst -> mm1 */
428			pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */
429			psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1(000R0G0B) */
430			paddb_r2r(mm1, mm2); /* mm1 + mm2(dst) -> mm2 */
431
432			packuswb_r2r(mm6, mm2);  /* 0000ARGB -> mm2 */
433			movd_r2m(mm2, *dstp);/* mm2 -> dst */
434		}
435		++srcp;
436		++dstp;
437	    }, width);
438	    srcp += srcskip;
439	    dstp += dstskip;
440	}
441	emms();
442}
443/* End GCC_ASMBLIT */
444
445#elif MSVC_ASMBLIT
446/* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
447static void BlitRGBtoRGBSurfaceAlpha128MMX(SDL_BlitInfo *info)
448{
449	int width = info->d_width;
450	int height = info->d_height;
451	Uint32 *srcp = (Uint32 *)info->s_pixels;
452	int srcskip = info->s_skip >> 2;
453	Uint32 *dstp = (Uint32 *)info->d_pixels;
454	int dstskip = info->d_skip >> 2;
455	Uint32 dalpha = info->dst->Amask;
456
457	__m64 src1, src2, dst1, dst2, lmask, hmask, dsta;
458
459	hmask = _mm_set_pi32(0x00fefefe, 0x00fefefe); /* alpha128 mask -> hmask */
460	lmask = _mm_set_pi32(0x00010101, 0x00010101); /* !alpha128 mask -> lmask */
461	dsta = _mm_set_pi32(dalpha, dalpha); /* dst alpha mask -> dsta */
462
463	while (height--) {
464		int n = width;
465		if ( n & 1 ) {
466			Uint32 s = *srcp++;
467			Uint32 d = *dstp;
468			*dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
469				   + (s & d & 0x00010101)) | dalpha;
470			n--;
471		}
472
473		for (n >>= 1; n > 0; --n) {
474			dst1 = *(__m64*)dstp; /* 2 x dst -> dst1(ARGBARGB) */
475			dst2 = dst1;   /* 2 x dst -> dst2(ARGBARGB) */
476
477			src1 = *(__m64*)srcp; /* 2 x src -> src1(ARGBARGB) */
478			src2 = src1; /* 2 x src -> src2(ARGBARGB) */
479
480			dst2 = _mm_and_si64(dst2, hmask); /* dst & mask -> dst2 */
481			src2 = _mm_and_si64(src2, hmask); /* src & mask -> src2 */
482			src2 = _mm_add_pi32(src2, dst2); /* dst2 + src2 -> src2 */
483			src2 = _mm_srli_pi32(src2, 1); /* src2 >> 1 -> src2 */
484
485			dst1 = _mm_and_si64(dst1, src1); /* src & dst -> dst1 */
486			dst1 = _mm_and_si64(dst1, lmask); /* dst1 & !mask -> dst1 */
487			dst1 = _mm_add_pi32(dst1, src2); /* src2 + dst1 -> dst1 */
488			dst1 = _mm_or_si64(dst1, dsta); /* dsta(full alpha) | dst1 -> dst1 */
489
490			*(__m64*)dstp = dst1; /* dst1 -> 2 x dst pixels */
491			dstp += 2;
492			srcp += 2;
493		}
494
495		srcp += srcskip;
496		dstp += dstskip;
497	}
498	_mm_empty();
499}
500
501/* fast RGB888->(A)RGB888 blending with surface alpha */
502static void BlitRGBtoRGBSurfaceAlphaMMX(SDL_BlitInfo *info)
503{
504	SDL_PixelFormat* df = info->dst;
505	Uint32 chanmask = df->Rmask | df->Gmask | df->Bmask;
506	unsigned alpha = info->src->alpha;
507
508	if (alpha == 128 && (df->Rmask | df->Gmask | df->Bmask) == 0x00FFFFFF) {
509			/* only call a128 version when R,G,B occupy lower bits */
510		BlitRGBtoRGBSurfaceAlpha128MMX(info);
511	} else {
512		int width = info->d_width;
513		int height = info->d_height;
514		Uint32 *srcp = (Uint32 *)info->s_pixels;
515		int srcskip = info->s_skip >> 2;
516		Uint32 *dstp = (Uint32 *)info->d_pixels;
517		int dstskip = info->d_skip >> 2;
518		Uint32 dalpha = df->Amask;
519		Uint32 amult;
520
521		__m64 src1, src2, dst1, dst2, mm_alpha, mm_zero, dsta;
522
523		mm_zero = _mm_setzero_si64(); /* 0 -> mm_zero */
524		/* form the alpha mult */
525		amult = alpha | (alpha << 8);
526		amult = amult | (amult << 16);
527		chanmask = (0xff << df->Rshift) | (0xff << df->Gshift) | (0xff << df->Bshift);
528		mm_alpha = _mm_set_pi32(0, amult & chanmask); /* 0000AAAA -> mm_alpha, minus 1 chan */
529		mm_alpha = _mm_unpacklo_pi8(mm_alpha, mm_zero); /* 0A0A0A0A -> mm_alpha, minus 1 chan */
530			/* at this point mm_alpha can be 000A0A0A or 0A0A0A00 or another combo */
531		dsta = _mm_set_pi32(dalpha, dalpha); /* dst alpha mask -> dsta */
532
533		while (height--) {
534			int n = width;
535			if (n & 1) {
536				/* One Pixel Blend */
537				src2 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src2 (0000ARGB)*/
538				src2 = _mm_unpacklo_pi8(src2, mm_zero); /* 0A0R0G0B -> src2 */
539
540				dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB)*/
541				dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
542
543				src2 = _mm_sub_pi16(src2, dst1); /* src2 - dst2 -> src2 */
544				src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
545				src2 = _mm_srli_pi16(src2, 8); /* src2 >> 8 -> src2 */
546				dst1 = _mm_add_pi8(src2, dst1); /* src2 + dst1 -> dst1 */
547
548				dst1 = _mm_packs_pu16(dst1, mm_zero);  /* 0000ARGB -> dst1 */
549				dst1 = _mm_or_si64(dst1, dsta); /* dsta | dst1 -> dst1 */
550				*dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
551
552				++srcp;
553				++dstp;
554
555				n--;
556			}
557
558			for (n >>= 1; n > 0; --n) {
559				/* Two Pixels Blend */
560				src1 = *(__m64*)srcp; /* 2 x src -> src1(ARGBARGB)*/
561				src2 = src1; /* 2 x src -> src2(ARGBARGB) */
562				src1 = _mm_unpacklo_pi8(src1, mm_zero); /* low - 0A0R0G0B -> src1 */
563				src2 = _mm_unpackhi_pi8(src2, mm_zero); /* high - 0A0R0G0B -> src2 */
564
565				dst1 = *(__m64*)dstp;/* 2 x dst -> dst1(ARGBARGB) */
566				dst2 = dst1; /* 2 x dst -> dst2(ARGBARGB) */
567				dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* low - 0A0R0G0B -> dst1 */
568				dst2 = _mm_unpackhi_pi8(dst2, mm_zero); /* high - 0A0R0G0B -> dst2 */
569
570				src1 = _mm_sub_pi16(src1, dst1);/* src1 - dst1 -> src1 */
571				src1 = _mm_mullo_pi16(src1, mm_alpha); /* src1 * alpha -> src1 */
572				src1 = _mm_srli_pi16(src1, 8); /* src1 >> 8 -> src1 */
573				dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1(dst1) -> dst1 */
574
575				src2 = _mm_sub_pi16(src2, dst2);/* src2 - dst2 -> src2 */
576				src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
577				src2 = _mm_srli_pi16(src2, 8); /* src2 >> 8 -> src2 */
578				dst2 = _mm_add_pi8(src2, dst2); /* src2 + dst2(dst2) -> dst2 */
579
580				dst1 = _mm_packs_pu16(dst1, dst2); /* 0A0R0G0B(res1), 0A0R0G0B(res2) -> dst1(ARGBARGB) */
581				dst1 = _mm_or_si64(dst1, dsta); /* dsta | dst1 -> dst1 */
582
583				*(__m64*)dstp = dst1; /* dst1 -> 2 x pixel */
584
585				srcp += 2;
586				dstp += 2;
587			}
588			srcp += srcskip;
589			dstp += dstskip;
590		}
591		_mm_empty();
592	}
593}
594
595/* fast ARGB888->(A)RGB888 blending with pixel alpha */
596static void BlitRGBtoRGBPixelAlphaMMX(SDL_BlitInfo *info)
597{
598	int width = info->d_width;
599	int height = info->d_height;
600	Uint32 *srcp = (Uint32 *)info->s_pixels;
601	int srcskip = info->s_skip >> 2;
602	Uint32 *dstp = (Uint32 *)info->d_pixels;
603	int dstskip = info->d_skip >> 2;
604	SDL_PixelFormat* sf = info->src;
605	Uint32 chanmask = sf->Rmask | sf->Gmask | sf->Bmask;
606	Uint32 amask = sf->Amask;
607	Uint32 ashift = sf->Ashift;
608	Uint64 multmask;
609
610	__m64 src1, dst1, mm_alpha, mm_zero, dmask;
611
612	mm_zero = _mm_setzero_si64(); /* 0 -> mm_zero */
613	multmask = ~(0xFFFFi64 << (ashift * 2));
614	dmask = *(__m64*) &multmask; /* dst alpha mask -> dmask */
615
616	while(height--) {
617		DUFFS_LOOP4({
618		Uint32 alpha = *srcp & amask;
619		if (alpha == 0) {
620			/* do nothing */
621		} else if (alpha == amask) {
622			/* opaque alpha -- copy RGB, keep dst alpha */
623			*dstp = (*srcp & chanmask) | (*dstp & ~chanmask);
624		} else {
625			src1 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src1 (0000ARGB)*/
626			src1 = _mm_unpacklo_pi8(src1, mm_zero); /* 0A0R0G0B -> src1 */
627
628			dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB)*/
629			dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
630
631			mm_alpha = _mm_cvtsi32_si64(alpha); /* alpha -> mm_alpha (0000000A) */
632			mm_alpha = _mm_srli_si64(mm_alpha, ashift); /* mm_alpha >> ashift -> mm_alpha(0000000A) */
633			mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
634			mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
635			mm_alpha = _mm_and_si64(mm_alpha, dmask); /* 000A0A0A -> mm_alpha, preserve dst alpha on add */
636
637			/* blend */
638			src1 = _mm_sub_pi16(src1, dst1);/* src1 - dst1 -> src1 */
639			src1 = _mm_mullo_pi16(src1, mm_alpha); /* (src1 - dst1) * alpha -> src1 */
640			src1 = _mm_srli_pi16(src1, 8); /* src1 >> 8 -> src1(000R0G0B) */
641			dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1 -> dst1(0A0R0G0B) */
642			dst1 = _mm_packs_pu16(dst1, mm_zero);  /* 0000ARGB -> dst1 */
643
644			*dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
645		}
646		++srcp;
647		++dstp;
648	    }, width);
649	    srcp += srcskip;
650	    dstp += dstskip;
651	}
652	_mm_empty();
653}
654/* End MSVC_ASMBLIT */
655
656#endif /* GCC_ASMBLIT, MSVC_ASMBLIT */
657
658#if SDL_ALTIVEC_BLITTERS
659#if __MWERKS__
660#pragma altivec_model on
661#endif
662#if HAVE_ALTIVEC_H
663#include <altivec.h>
664#endif
665#include <assert.h>
666
/*
 * Vector literal helpers: Mac OS X builds with GCC older than 4 use the
 * parenthesized literal syntax, everything else uses the brace syntax.
 */
#if (defined(__MACOSX__) && (__GNUC__ < 4))
    #define VECUINT8_LITERAL(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p) \
        (vector unsigned char) ( a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p )
    #define VECUINT16_LITERAL(a,b,c,d,e,f,g,h) \
        (vector unsigned short) ( a,b,c,d,e,f,g,h )
#else
    #define VECUINT8_LITERAL(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p) \
        (vector unsigned char) { a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p }
    #define VECUINT16_LITERAL(a,b,c,d,e,f,g,h) \
        (vector unsigned short) { a,b,c,d,e,f,g,h }
#endif

/*
 * Low four bits of the address x; non-zero when x is not 16-byte aligned.
 * The argument is parenthesized so that the cast applies to the whole
 * expression (e.g. UNALIGNED_PTR(p + 1)), not just to its first operand.
 */
#define UNALIGNED_PTR(x) (((size_t) (x)) & 0x0000000F)
/* Debug helper: print the four 32-bit words of vector v, prefixed by msg. */
#define VECPRINT(msg, v) do { \
    vector unsigned int tmpvec = (vector unsigned int)(v); \
    unsigned int *vp = (unsigned int *)&tmpvec; \
    printf("%s = %08X %08X %08X %08X\n", (msg), vp[0], vp[1], vp[2], vp[3]); \
} while (0)

/* the permutation vector that takes the high bytes out of all the appropriate shorts
    (vector unsigned char)(
        0x00, 0x10, 0x02, 0x12,
        0x04, 0x14, 0x06, 0x16,
        0x08, 0x18, 0x0A, 0x1A,
        0x0C, 0x1C, 0x0E, 0x1E );
*/
#define VEC_MERGE_PERMUTE() (vec_add(vec_lvsl(0, (int*)NULL), (vector unsigned char)vec_splat_u16(0x0F)))
/* 24 in every 32-bit element (12 + 12) */
#define VEC_U32_24() (vec_add(vec_splat_u32(12), vec_splat_u32(12)))
/* all-ones shifted left by 24 in every 32-bit element */
#define VEC_ALPHA_MASK() ((vector unsigned char)vec_sl((vector unsigned int)vec_splat_s8(-1), VEC_U32_24()))
/* permute vector for loading from src; a different form is used when src is 16-byte aligned */
#define VEC_ALIGNER(src) ((UNALIGNED_PTR(src)) \
    ? vec_lvsl(0, (src)) \
    : vec_add(vec_lvsl(8, (src)), vec_splat_u8(8)))
699
700
/*
 * Blend the bytes of vs over the bytes of vd, weighting vs by valpha and
 * vd by the bitwise complement of valpha, and store the result in vd.
 * mergePermute selects the result bytes out of the two 16-bit product
 * vectors; v1_16 and v8_16 are added to / used as shift count for every
 * 16-bit element (callers visible here pass splats of 1 and 8).
 * The arguments are expanded several times, so pass plain variables.
 */
#define VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1_16, v8_16) do { \
    /* complemented weights, i.e. 255-alpha in every byte */ \
    vector unsigned char vma_inv = vec_nor(valpha, valpha); \
    /* source products, even bytes (AAGGAAGGAAGGAAGG for ARGB pixels) */ \
    vector unsigned short vma_even = vec_mule(vs, valpha); \
    /* source products, odd bytes (RRBBRRBBRRBBRRBB for ARGB pixels) */ \
    vector unsigned short vma_odd = vec_mulo(vs, valpha); \
    /* destination products with the complemented weights, even bytes */ \
    vector unsigned short vma_deven = vec_mule(vd, vma_inv); \
    /* destination products with the complemented weights, odd bytes */ \
    vector unsigned short vma_dodd = vec_mulo(vd, vma_inv); \
    /* weighted source + weighted destination */ \
    vma_even = vec_add(vma_even, vma_deven); \
    vma_odd = vec_add(vma_odd, vma_dodd); \
    /* even = (even + 1) + ((even + 1) >> 8) */ \
    vma_even = vec_add(vma_even, v1_16); \
    vma_even = vec_add(vma_even, vec_sr(vma_even, v8_16)); \
    /* odd = (odd + 1) + ((odd + 1) >> 8) */ \
    vma_odd = vec_add(vma_odd, v1_16); \
    vma_odd = vec_add(vma_odd, vec_sr(vma_odd, v8_16)); \
    /* (>>8) and get ARGBARGBARGBARGB */ \
    vd = (vector unsigned char)vec_perm(vma_even, vma_odd, mergePermute); \
} while (0)
726
727/* Calculate the permute vector used for 32->32 swizzling */
728static vector unsigned char calc_swizzle32(const SDL_PixelFormat *srcfmt,
729                                  const SDL_PixelFormat *dstfmt)
730{
731    /*
732     * We have to assume that the bits that aren't used by other
733     *  colors is alpha, and it's one complete byte, since some formats
734     *  leave alpha with a zero mask, but we should still swizzle the bits.
735     */
736    /* ARGB */
737    const static struct SDL_PixelFormat default_pixel_format = {
738        NULL, 0, 0,
739        0, 0, 0, 0,
740        16, 8, 0, 24,
741        0x00FF0000, 0x0000FF00, 0x000000FF, 0xFF000000,
742        0, 0};
743    if (!srcfmt) {
744        srcfmt = &default_pixel_format;
745    }
746    if (!dstfmt) {
747        dstfmt = &default_pixel_format;
748    }
749    const vector unsigned char plus = VECUINT8_LITERAL
750                                            ( 0x00, 0x00, 0x00, 0x00,
751                                              0x04, 0x04, 0x04, 0x04,
752                                              0x08, 0x08, 0x08, 0x08,
753                                              0x0C, 0x0C, 0x0C, 0x0C );
754    vector unsigned char vswiz;
755    vector unsigned int srcvec;
756#define RESHIFT(X) (3 - ((X) >> 3))
757    Uint32 rmask = RESHIFT(srcfmt->Rshift) << (dstfmt->Rshift);
758    Uint32 gmask = RESHIFT(srcfmt->Gshift) << (dstfmt->Gshift);
759    Uint32 bmask = RESHIFT(srcfmt->Bshift) << (dstfmt->Bshift);
760    Uint32 amask;
761    /* Use zero for alpha if either surface doesn't have alpha */
762    if (dstfmt->Amask) {
763        amask = ((srcfmt->Amask) ? RESHIFT(srcfmt->Ashift) : 0x10) << (dstfmt->Ashift);
764    } else {
765        amask = 0x10101010 & ((dstfmt->Rmask | dstfmt->Gmask | dstfmt->Bmask) ^ 0xFFFFFFFF);
766    }
767#undef RESHIFT
768    ((unsigned int *)(char*)&srcvec)[0] = (rmask | gmask | bmask | amask);
769    vswiz = vec_add(plus, (vector unsigned char)vec_splat(srcvec, 0));
770    return(vswiz);
771}
772
773static void Blit32to565PixelAlphaAltivec(SDL_BlitInfo *info)
774{
775    int height = info->d_height;
776    Uint8 *src = (Uint8 *)info->s_pixels;
777    int srcskip = info->s_skip;
778    Uint8 *dst = (Uint8 *)info->d_pixels;
779    int dstskip = info->d_skip;
780    SDL_PixelFormat *srcfmt = info->src;
781
782    vector unsigned char v0 = vec_splat_u8(0);
783    vector unsigned short v8_16 = vec_splat_u16(8);
784    vector unsigned short v1_16 = vec_splat_u16(1);
785    vector unsigned short v2_16 = vec_splat_u16(2);
786    vector unsigned short v3_16 = vec_splat_u16(3);
787    vector unsigned int v8_32 = vec_splat_u32(8);
788    vector unsigned int v16_32 = vec_add(v8_32, v8_32);
789    vector unsigned short v3f = VECUINT16_LITERAL(
790        0x003f, 0x003f, 0x003f, 0x003f,
791        0x003f, 0x003f, 0x003f, 0x003f);
792    vector unsigned short vfc = VECUINT16_LITERAL(
793        0x00fc, 0x00fc, 0x00fc, 0x00fc,
794        0x00fc, 0x00fc, 0x00fc, 0x00fc);
795
796    /*
797        0x10 - 0x1f is the alpha
798        0x00 - 0x0e evens are the red
799        0x01 - 0x0f odds are zero
800    */
801    vector unsigned char vredalpha1 = VECUINT8_LITERAL(
802        0x10, 0x00, 0x01, 0x01,
803        0x10, 0x02, 0x01, 0x01,
804        0x10, 0x04, 0x01, 0x01,
805        0x10, 0x06, 0x01, 0x01
806    );
807    vector unsigned char vredalpha2 = (vector unsigned char)(
808        vec_add((vector unsigned int)vredalpha1, vec_sl(v8_32, v16_32))
809    );
810    /*
811        0x00 - 0x0f is ARxx ARxx ARxx ARxx
812        0x11 - 0x0f odds are blue
813    */
814    vector unsigned char vblue1 = VECUINT8_LITERAL(
815        0x00, 0x01, 0x02, 0x11,
816        0x04, 0x05, 0x06, 0x13,
817        0x08, 0x09, 0x0a, 0x15,
818        0x0c, 0x0d, 0x0e, 0x17
819    );
820    vector unsigned char vblue2 = (vector unsigned char)(
821        vec_add((vector unsigned int)vblue1, v8_32)
822    );
823    /*
824        0x00 - 0x0f is ARxB ARxB ARxB ARxB
825        0x10 - 0x0e evens are green
826    */
827    vector unsigned char vgreen1 = VECUINT8_LITERAL(
828        0x00, 0x01, 0x10, 0x03,
829        0x04, 0x05, 0x12, 0x07,
830        0x08, 0x09, 0x14, 0x0b,
831        0x0c, 0x0d, 0x16, 0x0f
832    );
833    vector unsigned char vgreen2 = (vector unsigned char)(
834        vec_add((vector unsigned int)vgreen1, vec_sl(v8_32, v8_32))
835    );
836    vector unsigned char vgmerge = VECUINT8_LITERAL(
837        0x00, 0x02, 0x00, 0x06,
838        0x00, 0x0a, 0x00, 0x0e,
839        0x00, 0x12, 0x00, 0x16,
840        0x00, 0x1a, 0x00, 0x1e);
841    vector unsigned char mergePermute = VEC_MERGE_PERMUTE();
842    vector unsigned char vpermute = calc_swizzle32(srcfmt, NULL);
843    vector unsigned char valphaPermute = vec_and(vec_lvsl(0, (int *)NULL), vec_splat_u8(0xC));
844
845    vector unsigned short vf800 = (vector unsigned short)vec_splat_u8(-7);
846    vf800 = vec_sl(vf800, vec_splat_u16(8));
847
848    while(height--) {
849        int extrawidth;
850        vector unsigned char valigner;
851        vector unsigned char vsrc;
852        vector unsigned char voverflow;
853        int width = info->d_width;
854
855#define ONE_PIXEL_BLEND(condition, widthvar) \
856        while (condition) { \
857            Uint32 Pixel; \
858            unsigned sR, sG, sB, dR, dG, dB, sA; \
859            DISEMBLE_RGBA(src, 4, srcfmt, Pixel, sR, sG, sB, sA); \
860            if(sA) { \
861                unsigned short dstpixel = *((unsigned short *)dst); \
862                dR = (dstpixel >> 8) & 0xf8; \
863                dG = (dstpixel >> 3) & 0xfc; \
864                dB = (dstpixel << 3) & 0xf8; \
865                ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
866                *((unsigned short *)dst) = ( \
867                    ((dR & 0xf8) << 8) | ((dG & 0xfc) << 3) | (dB >> 3) \
868                ); \
869            } \
870            src += 4; \
871            dst += 2; \
872            widthvar--; \
873        }
874        ONE_PIXEL_BLEND((UNALIGNED_PTR(dst)) && (width), width);
875        extrawidth = (width % 8);
876        valigner = VEC_ALIGNER(src);
877        vsrc = (vector unsigned char)vec_ld(0, src);
878        width -= extrawidth;
879        while (width) {
880            vector unsigned char valpha;
881            vector unsigned char vsrc1, vsrc2;
882            vector unsigned char vdst1, vdst2;
883            vector unsigned short vR, vG, vB;
884            vector unsigned short vpixel, vrpixel, vgpixel, vbpixel;
885
886            /* Load 8 pixels from src as ARGB */
887            voverflow = (vector unsigned char)vec_ld(15, src);
888            vsrc = vec_perm(vsrc, voverflow, valigner);
889            vsrc1 = vec_perm(vsrc, vsrc, vpermute);
890            src += 16;
891            vsrc = (vector unsigned char)vec_ld(15, src);
892            voverflow = vec_perm(voverflow, vsrc, valigner);
893            vsrc2 = vec_perm(voverflow, voverflow, vpermute);
894            src += 16;
895
896            /* Load 8 pixels from dst as XRGB */
897            voverflow = vec_ld(0, dst);
898            vR = vec_and((vector unsigned short)voverflow, vf800);
899            vB = vec_sl((vector unsigned short)voverflow, v3_16);
900            vG = vec_sl(vB, v2_16);
901            vdst1 = (vector unsigned char)vec_perm((vector unsigned char)vR, (vector unsigned char)vR, vredalpha1);
902            vdst1 = vec_perm(vdst1, (vector unsigned char)vB, vblue1);
903            vdst1 = vec_perm(vdst1, (vector unsigned char)vG, vgreen1);
904            vdst2 = (vector unsigned char)vec_perm((vector unsigned char)vR, (vector unsigned char)vR, vredalpha2);
905            vdst2 = vec_perm(vdst2, (vector unsigned char)vB, vblue2);
906            vdst2 = vec_perm(vdst2, (vector unsigned char)vG, vgreen2);
907
908            /* Alpha blend 8 pixels as ARGB */
909            valpha = vec_perm(vsrc1, v0, valphaPermute);
910            VEC_MULTIPLY_ALPHA(vsrc1, vdst1, valpha, mergePermute, v1_16, v8_16);
911            valpha = vec_perm(vsrc2, v0, valphaPermute);
912            VEC_MULTIPLY_ALPHA(vsrc2, vdst2, valpha, mergePermute, v1_16, v8_16);
913
914            /* Convert 8 pixels to 565 */
915            vpixel = (vector unsigned short)vec_packpx((vector unsigned int)vdst1, (vector unsigned int)vdst2);
916            vgpixel = (vector unsigned short)vec_perm(vdst1, vdst2, vgmerge);
917            vgpixel = vec_and(vgpixel, vfc);
918            vgpixel = vec_sl(vgpixel, v3_16);
919            vrpixel = vec_sl(vpixel, v1_16);
920            vrpixel = vec_and(vrpixel, vf800);
921            vbpixel = vec_and(vpixel, v3f);
922            vdst1 = vec_or((vector unsigned char)vrpixel, (vector unsigned char)vgpixel);
923            vdst1 = vec_or(vdst1, (vector unsigned char)vbpixel);
924
925            /* Store 8 pixels */
926            vec_st(vdst1, 0, dst);
927
928            width -= 8;
929            dst += 16;
930        }
931        ONE_PIXEL_BLEND((extrawidth), extrawidth);
932#undef ONE_PIXEL_BLEND
933        src += srcskip;
934        dst += dstskip;
935    }
936}
937
938static void Blit32to32SurfaceAlphaKeyAltivec(SDL_BlitInfo *info)
939{
940    unsigned alpha = info->src->alpha;
941    int height = info->d_height;
942    Uint32 *srcp = (Uint32 *)info->s_pixels;
943    int srcskip = info->s_skip >> 2;
944    Uint32 *dstp = (Uint32 *)info->d_pixels;
945    int dstskip = info->d_skip >> 2;
946    SDL_PixelFormat *srcfmt = info->src;
947    SDL_PixelFormat *dstfmt = info->dst;
948    unsigned sA = srcfmt->alpha;
949    unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
950    Uint32 rgbmask = srcfmt->Rmask | srcfmt->Gmask | srcfmt->Bmask;
951    Uint32 ckey = info->src->colorkey;
952    vector unsigned char mergePermute;
953    vector unsigned char vsrcPermute;
954    vector unsigned char vdstPermute;
955    vector unsigned char vsdstPermute;
956    vector unsigned char valpha;
957    vector unsigned char valphamask;
958    vector unsigned char vbits;
959    vector unsigned char v0;
960    vector unsigned short v1;
961    vector unsigned short v8;
962    vector unsigned int vckey;
963    vector unsigned int vrgbmask;
964
965    mergePermute = VEC_MERGE_PERMUTE();
966    v0 = vec_splat_u8(0);
967    v1 = vec_splat_u16(1);
968    v8 = vec_splat_u16(8);
969
970    /* set the alpha to 255 on the destination surf */
971    valphamask = VEC_ALPHA_MASK();
972
973    vsrcPermute = calc_swizzle32(srcfmt, NULL);
974    vdstPermute = calc_swizzle32(NULL, dstfmt);
975    vsdstPermute = calc_swizzle32(dstfmt, NULL);
976
977    /* set a vector full of alpha and 255-alpha */
978    ((unsigned char *)&valpha)[0] = alpha;
979    valpha = vec_splat(valpha, 0);
980    vbits = (vector unsigned char)vec_splat_s8(-1);
981
982    ckey &= rgbmask;
983    ((unsigned int *)(char*)&vckey)[0] = ckey;
984    vckey = vec_splat(vckey, 0);
985    ((unsigned int *)(char*)&vrgbmask)[0] = rgbmask;
986    vrgbmask = vec_splat(vrgbmask, 0);
987
988    while(height--) {
989        int width = info->d_width;
990#define ONE_PIXEL_BLEND(condition, widthvar) \
991        while (condition) { \
992            Uint32 Pixel; \
993            unsigned sR, sG, sB, dR, dG, dB; \
994            RETRIEVE_RGB_PIXEL(((Uint8 *)srcp), 4, Pixel); \
995            if(sA && Pixel != ckey) { \
996                RGB_FROM_PIXEL(Pixel, srcfmt, sR, sG, sB); \
997                DISEMBLE_RGB(((Uint8 *)dstp), 4, dstfmt, Pixel, dR, dG, dB); \
998                ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
999                ASSEMBLE_RGBA(((Uint8 *)dstp), 4, dstfmt, dR, dG, dB, dA); \
1000            } \
1001            dstp++; \
1002            srcp++; \
1003            widthvar--; \
1004        }
1005        ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
1006        if (width > 0) {
1007            int extrawidth = (width % 4);
1008            vector unsigned char valigner = VEC_ALIGNER(srcp);
1009            vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
1010            width -= extrawidth;
1011            while (width) {
1012                vector unsigned char vsel;
1013                vector unsigned char voverflow;
1014                vector unsigned char vd;
1015                vector unsigned char vd_orig;
1016
1017                /* s = *srcp */
1018                voverflow = (vector unsigned char)vec_ld(15, srcp);
1019                vs = vec_perm(vs, voverflow, valigner);
1020
1021                /* vsel is set for items that match the key */
1022                vsel = (vector unsigned char)vec_and((vector unsigned int)vs, vrgbmask);
1023                vsel = (vector unsigned char)vec_cmpeq((vector unsigned int)vsel, vckey);
1024
1025                /* permute to source format */
1026                vs = vec_perm(vs, valpha, vsrcPermute);
1027
1028                /* d = *dstp */
1029                vd = (vector unsigned char)vec_ld(0, dstp);
1030                vd_orig = vd = vec_perm(vd, v0, vsdstPermute);
1031
1032                VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
1033
1034                /* set the alpha channel to full on */
1035                vd = vec_or(vd, valphamask);
1036
1037                /* mask out color key */
1038                vd = vec_sel(vd, vd_orig, vsel);
1039
1040                /* permute to dest format */
1041                vd = vec_perm(vd, vbits, vdstPermute);
1042
1043                /* *dstp = res */
1044                vec_st((vector unsigned int)vd, 0, dstp);
1045
1046                srcp += 4;
1047                dstp += 4;
1048                width -= 4;
1049                vs = voverflow;
1050            }
1051            ONE_PIXEL_BLEND((extrawidth), extrawidth);
1052        }
1053#undef ONE_PIXEL_BLEND
1054
1055        srcp += srcskip;
1056        dstp += dstskip;
1057    }
1058}
1059
1060
1061static void Blit32to32PixelAlphaAltivec(SDL_BlitInfo *info)
1062{
1063    int width = info->d_width;
1064    int height = info->d_height;
1065    Uint32 *srcp = (Uint32 *)info->s_pixels;
1066    int srcskip = info->s_skip >> 2;
1067    Uint32 *dstp = (Uint32 *)info->d_pixels;
1068    int dstskip = info->d_skip >> 2;
1069    SDL_PixelFormat *srcfmt = info->src;
1070    SDL_PixelFormat *dstfmt = info->dst;
1071    vector unsigned char mergePermute;
1072    vector unsigned char valphaPermute;
1073    vector unsigned char vsrcPermute;
1074    vector unsigned char vdstPermute;
1075    vector unsigned char vsdstPermute;
1076    vector unsigned char valphamask;
1077    vector unsigned char vpixelmask;
1078    vector unsigned char v0;
1079    vector unsigned short v1;
1080    vector unsigned short v8;
1081
1082    v0 = vec_splat_u8(0);
1083    v1 = vec_splat_u16(1);
1084    v8 = vec_splat_u16(8);
1085    mergePermute = VEC_MERGE_PERMUTE();
1086    valphamask = VEC_ALPHA_MASK();
1087    valphaPermute = vec_and(vec_lvsl(0, (int *)NULL), vec_splat_u8(0xC));
1088    vpixelmask = vec_nor(valphamask, v0);
1089    vsrcPermute = calc_swizzle32(srcfmt, NULL);
1090    vdstPermute = calc_swizzle32(NULL, dstfmt);
1091    vsdstPermute = calc_swizzle32(dstfmt, NULL);
1092
1093	while ( height-- ) {
1094        width = info->d_width;
1095#define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \
1096            Uint32 Pixel; \
1097            unsigned sR, sG, sB, dR, dG, dB, sA, dA; \
1098            DISEMBLE_RGBA((Uint8 *)srcp, 4, srcfmt, Pixel, sR, sG, sB, sA); \
1099            if(sA) { \
1100              DISEMBLE_RGBA((Uint8 *)dstp, 4, dstfmt, Pixel, dR, dG, dB, dA); \
1101              ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
1102              ASSEMBLE_RGBA((Uint8 *)dstp, 4, dstfmt, dR, dG, dB, dA); \
1103            } \
1104            ++srcp; \
1105            ++dstp; \
1106            widthvar--; \
1107        }
1108        ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
1109        if (width > 0) {
1110            /* vsrcPermute */
1111            /* vdstPermute */
1112            int extrawidth = (width % 4);
1113            vector unsigned char valigner = VEC_ALIGNER(srcp);
1114            vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
1115            width -= extrawidth;
1116            while (width) {
1117                vector unsigned char voverflow;
1118                vector unsigned char vd;
1119                vector unsigned char valpha;
1120                vector unsigned char vdstalpha;
1121                /* s = *srcp */
1122                voverflow = (vector unsigned char)vec_ld(15, srcp);
1123                vs = vec_perm(vs, voverflow, valigner);
1124                vs = vec_perm(vs, v0, vsrcPermute);
1125
1126                valpha = vec_perm(vs, v0, valphaPermute);
1127
1128                /* d = *dstp */
1129                vd = (vector unsigned char)vec_ld(0, dstp);
1130                vd = vec_perm(vd, v0, vsdstPermute);
1131                vdstalpha = vec_and(vd, valphamask);
1132
1133                VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
1134
1135                /* set the alpha to the dest alpha */
1136                vd = vec_and(vd, vpixelmask);
1137                vd = vec_or(vd, vdstalpha);
1138                vd = vec_perm(vd, v0, vdstPermute);
1139
1140                /* *dstp = res */
1141                vec_st((vector unsigned int)vd, 0, dstp);
1142
1143                srcp += 4;
1144                dstp += 4;
1145                width -= 4;
1146                vs = voverflow;
1147
1148            }
1149            ONE_PIXEL_BLEND((extrawidth), extrawidth);
1150        }
1151	    srcp += srcskip;
1152	    dstp += dstskip;
1153#undef ONE_PIXEL_BLEND
1154	}
1155}
1156
1157/* fast ARGB888->(A)RGB888 blending with pixel alpha */
1158static void BlitRGBtoRGBPixelAlphaAltivec(SDL_BlitInfo *info)
1159{
1160	int width = info->d_width;
1161	int height = info->d_height;
1162	Uint32 *srcp = (Uint32 *)info->s_pixels;
1163	int srcskip = info->s_skip >> 2;
1164	Uint32 *dstp = (Uint32 *)info->d_pixels;
1165	int dstskip = info->d_skip >> 2;
1166    vector unsigned char mergePermute;
1167    vector unsigned char valphaPermute;
1168    vector unsigned char valphamask;
1169    vector unsigned char vpixelmask;
1170    vector unsigned char v0;
1171    vector unsigned short v1;
1172    vector unsigned short v8;
1173    v0 = vec_splat_u8(0);
1174    v1 = vec_splat_u16(1);
1175    v8 = vec_splat_u16(8);
1176    mergePermute = VEC_MERGE_PERMUTE();
1177    valphamask = VEC_ALPHA_MASK();
1178    valphaPermute = vec_and(vec_lvsl(0, (int *)NULL), vec_splat_u8(0xC));
1179
1180
1181    vpixelmask = vec_nor(valphamask, v0);
1182	while(height--) {
1183        width = info->d_width;
1184#define ONE_PIXEL_BLEND(condition, widthvar) \
1185        while ((condition)) { \
1186            Uint32 dalpha; \
1187            Uint32 d; \
1188            Uint32 s1; \
1189            Uint32 d1; \
1190            Uint32 s = *srcp; \
1191            Uint32 alpha = s >> 24; \
1192            if(alpha) { \
1193              if(alpha == SDL_ALPHA_OPAQUE) { \
1194                *dstp = (s & 0x00ffffff) | (*dstp & 0xff000000); \
1195              } else { \
1196                d = *dstp; \
1197                dalpha = d & 0xff000000; \
1198                s1 = s & 0xff00ff; \
1199                d1 = d & 0xff00ff; \
1200                d1 = (d1 + ((s1 - d1) * alpha >> 8)) & 0xff00ff; \
1201                s &= 0xff00; \
1202                d &= 0xff00; \
1203                d = (d + ((s - d) * alpha >> 8)) & 0xff00; \
1204                *dstp = d1 | d | dalpha; \
1205              } \
1206            } \
1207            ++srcp; \
1208            ++dstp; \
1209            widthvar--; \
1210	    }
1211        ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
1212        if (width > 0) {
1213            int extrawidth = (width % 4);
1214            vector unsigned char valigner = VEC_ALIGNER(srcp);
1215            vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
1216            width -= extrawidth;
1217            while (width) {
1218                vector unsigned char voverflow;
1219                vector unsigned char vd;
1220                vector unsigned char valpha;
1221                vector unsigned char vdstalpha;
1222                /* s = *srcp */
1223                voverflow = (vector unsigned char)vec_ld(15, srcp);
1224                vs = vec_perm(vs, voverflow, valigner);
1225
1226                valpha = vec_perm(vs, v0, valphaPermute);
1227
1228                /* d = *dstp */
1229                vd = (vector unsigned char)vec_ld(0, dstp);
1230                vdstalpha = vec_and(vd, valphamask);
1231
1232                VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
1233
1234                /* set the alpha to the dest alpha */
1235                vd = vec_and(vd, vpixelmask);
1236                vd = vec_or(vd, vdstalpha);
1237
1238                /* *dstp = res */
1239                vec_st((vector unsigned int)vd, 0, dstp);
1240
1241                srcp += 4;
1242                dstp += 4;
1243                width -= 4;
1244                vs = voverflow;
1245            }
1246            ONE_PIXEL_BLEND((extrawidth), extrawidth);
1247        }
1248	    srcp += srcskip;
1249	    dstp += dstskip;
1250	}
1251#undef ONE_PIXEL_BLEND
1252}
1253
1254static void Blit32to32SurfaceAlphaAltivec(SDL_BlitInfo *info)
1255{
1256    /* XXX : 6 */
1257	unsigned alpha = info->src->alpha;
1258    int height = info->d_height;
1259    Uint32 *srcp = (Uint32 *)info->s_pixels;
1260    int srcskip = info->s_skip >> 2;
1261    Uint32 *dstp = (Uint32 *)info->d_pixels;
1262    int dstskip = info->d_skip >> 2;
1263    SDL_PixelFormat *srcfmt = info->src;
1264    SDL_PixelFormat *dstfmt = info->dst;
1265	unsigned sA = srcfmt->alpha;
1266	unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
1267    vector unsigned char mergePermute;
1268    vector unsigned char vsrcPermute;
1269    vector unsigned char vdstPermute;
1270    vector unsigned char vsdstPermute;
1271    vector unsigned char valpha;
1272    vector unsigned char valphamask;
1273    vector unsigned char vbits;
1274    vector unsigned short v1;
1275    vector unsigned short v8;
1276
1277    mergePermute = VEC_MERGE_PERMUTE();
1278    v1 = vec_splat_u16(1);
1279    v8 = vec_splat_u16(8);
1280
1281    /* set the alpha to 255 on the destination surf */
1282    valphamask = VEC_ALPHA_MASK();
1283
1284    vsrcPermute = calc_swizzle32(srcfmt, NULL);
1285    vdstPermute = calc_swizzle32(NULL, dstfmt);
1286    vsdstPermute = calc_swizzle32(dstfmt, NULL);
1287
1288    /* set a vector full of alpha and 255-alpha */
1289    ((unsigned char *)&valpha)[0] = alpha;
1290    valpha = vec_splat(valpha, 0);
1291    vbits = (vector unsigned char)vec_splat_s8(-1);
1292
1293    while(height--) {
1294        int width = info->d_width;
1295#define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \
1296            Uint32 Pixel; \
1297            unsigned sR, sG, sB, dR, dG, dB; \
1298            DISEMBLE_RGB(((Uint8 *)srcp), 4, srcfmt, Pixel, sR, sG, sB); \
1299            DISEMBLE_RGB(((Uint8 *)dstp), 4, dstfmt, Pixel, dR, dG, dB); \
1300            ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
1301            ASSEMBLE_RGBA(((Uint8 *)dstp), 4, dstfmt, dR, dG, dB, dA); \
1302            ++srcp; \
1303            ++dstp; \
1304            widthvar--; \
1305        }
1306        ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
1307        if (width > 0) {
1308            int extrawidth = (width % 4);
1309            vector unsigned char valigner = VEC_ALIGNER(srcp);
1310            vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
1311            width -= extrawidth;
1312            while (width) {
1313                vector unsigned char voverflow;
1314                vector unsigned char vd;
1315
1316                /* s = *srcp */
1317                voverflow = (vector unsigned char)vec_ld(15, srcp);
1318                vs = vec_perm(vs, voverflow, valigner);
1319                vs = vec_perm(vs, valpha, vsrcPermute);
1320
1321                /* d = *dstp */
1322                vd = (vector unsigned char)vec_ld(0, dstp);
1323                vd = vec_perm(vd, vd, vsdstPermute);
1324
1325                VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
1326
1327                /* set the alpha channel to full on */
1328                vd = vec_or(vd, valphamask);
1329                vd = vec_perm(vd, vbits, vdstPermute);
1330
1331                /* *dstp = res */
1332                vec_st((vector unsigned int)vd, 0, dstp);
1333
1334                srcp += 4;
1335                dstp += 4;
1336                width -= 4;
1337                vs = voverflow;
1338            }
1339            ONE_PIXEL_BLEND((extrawidth), extrawidth);
1340        }
1341#undef ONE_PIXEL_BLEND
1342
1343        srcp += srcskip;
1344        dstp += dstskip;
1345    }
1346
1347}
1348
1349
1350/* fast RGB888->(A)RGB888 blending */
1351static void BlitRGBtoRGBSurfaceAlphaAltivec(SDL_BlitInfo *info)
1352{
1353	unsigned alpha = info->src->alpha;
1354    int height = info->d_height;
1355    Uint32 *srcp = (Uint32 *)info->s_pixels;
1356    int srcskip = info->s_skip >> 2;
1357    Uint32 *dstp = (Uint32 *)info->d_pixels;
1358    int dstskip = info->d_skip >> 2;
1359    vector unsigned char mergePermute;
1360    vector unsigned char valpha;
1361    vector unsigned char valphamask;
1362    vector unsigned short v1;
1363    vector unsigned short v8;
1364
1365    mergePermute = VEC_MERGE_PERMUTE();
1366    v1 = vec_splat_u16(1);
1367    v8 = vec_splat_u16(8);
1368
1369    /* set the alpha to 255 on the destination surf */
1370    valphamask = VEC_ALPHA_MASK();
1371
1372    /* set a vector full of alpha and 255-alpha */
1373    ((unsigned char *)&valpha)[0] = alpha;
1374    valpha = vec_splat(valpha, 0);
1375
1376    while(height--) {
1377        int width = info->d_width;
1378#define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \
1379            Uint32 s = *srcp; \
1380            Uint32 d = *dstp; \
1381            Uint32 s1 = s & 0xff00ff; \
1382            Uint32 d1 = d & 0xff00ff; \
1383            d1 = (d1 + ((s1 - d1) * alpha >> 8)) \
1384                 & 0xff00ff; \
1385            s &= 0xff00; \
1386            d &= 0xff00; \
1387            d = (d + ((s - d) * alpha >> 8)) & 0xff00; \
1388            *dstp = d1 | d | 0xff000000; \
1389            ++srcp; \
1390            ++dstp; \
1391            widthvar--; \
1392        }
1393        ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
1394        if (width > 0) {
1395            int extrawidth = (width % 4);
1396            vector unsigned char valigner = VEC_ALIGNER(srcp);
1397            vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
1398            width -= extrawidth;
1399            while (width) {
1400                vector unsigned char voverflow;
1401                vector unsigned char vd;
1402
1403                /* s = *srcp */
1404                voverflow = (vector unsigned char)vec_ld(15, srcp);
1405                vs = vec_perm(vs, voverflow, valigner);
1406
1407                /* d = *dstp */
1408                vd = (vector unsigned char)vec_ld(0, dstp);
1409
1410                VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
1411
1412                /* set the alpha channel to full on */
1413                vd = vec_or(vd, valphamask);
1414
1415                /* *dstp = res */
1416                vec_st((vector unsigned int)vd, 0, dstp);
1417
1418                srcp += 4;
1419                dstp += 4;
1420                width -= 4;
1421                vs = voverflow;
1422            }
1423            ONE_PIXEL_BLEND((extrawidth), extrawidth);
1424        }
1425#undef ONE_PIXEL_BLEND
1426
1427        srcp += srcskip;
1428        dstp += dstskip;
1429    }
1430}
1431#if __MWERKS__
1432#pragma altivec_model off
1433#endif
1434#endif /* SDL_ALTIVEC_BLITTERS */
1435
1436/* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
1437static void BlitRGBtoRGBSurfaceAlpha128(SDL_BlitInfo *info)
1438{
1439	int width = info->d_width;
1440	int height = info->d_height;
1441	Uint32 *srcp = (Uint32 *)info->s_pixels;
1442	int srcskip = info->s_skip >> 2;
1443	Uint32 *dstp = (Uint32 *)info->d_pixels;
1444	int dstskip = info->d_skip >> 2;
1445
1446	while(height--) {
1447	    DUFFS_LOOP4({
1448		    Uint32 s = *srcp++;
1449		    Uint32 d = *dstp;
1450		    *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
1451			       + (s & d & 0x00010101)) | 0xff000000;
1452	    }, width);
1453	    srcp += srcskip;
1454	    dstp += dstskip;
1455	}
1456}
1457
1458/* fast RGB888->(A)RGB888 blending with surface alpha */
1459static void BlitRGBtoRGBSurfaceAlpha(SDL_BlitInfo *info)
1460{
1461	unsigned alpha = info->src->alpha;
1462	if(alpha == 128) {
1463		BlitRGBtoRGBSurfaceAlpha128(info);
1464	} else {
1465		int width = info->d_width;
1466		int height = info->d_height;
1467		Uint32 *srcp = (Uint32 *)info->s_pixels;
1468		int srcskip = info->s_skip >> 2;
1469		Uint32 *dstp = (Uint32 *)info->d_pixels;
1470		int dstskip = info->d_skip >> 2;
1471		Uint32 s;
1472		Uint32 d;
1473		Uint32 s1;
1474		Uint32 d1;
1475
1476		while(height--) {
1477			DUFFS_LOOP_DOUBLE2({
1478				/* One Pixel Blend */
1479				s = *srcp;
1480				d = *dstp;
1481				s1 = s & 0xff00ff;
1482				d1 = d & 0xff00ff;
1483				d1 = (d1 + ((s1 - d1) * alpha >> 8))
1484				     & 0xff00ff;
1485				s &= 0xff00;
1486				d &= 0xff00;
1487				d = (d + ((s - d) * alpha >> 8)) & 0xff00;
1488				*dstp = d1 | d | 0xff000000;
1489				++srcp;
1490				++dstp;
1491			},{
1492			        /* Two Pixels Blend */
1493				s = *srcp;
1494				d = *dstp;
1495				s1 = s & 0xff00ff;
1496				d1 = d & 0xff00ff;
1497				d1 += (s1 - d1) * alpha >> 8;
1498				d1 &= 0xff00ff;
1499
1500				s = ((s & 0xff00) >> 8) |
1501					((srcp[1] & 0xff00) << 8);
1502				d = ((d & 0xff00) >> 8) |
1503					((dstp[1] & 0xff00) << 8);
1504				d += (s - d) * alpha >> 8;
1505				d &= 0x00ff00ff;
1506
1507				*dstp++ = d1 | ((d << 8) & 0xff00) | 0xff000000;
1508				++srcp;
1509
1510			        s1 = *srcp;
1511				d1 = *dstp;
1512				s1 &= 0xff00ff;
1513				d1 &= 0xff00ff;
1514				d1 += (s1 - d1) * alpha >> 8;
1515				d1 &= 0xff00ff;
1516
1517				*dstp = d1 | ((d >> 8) & 0xff00) | 0xff000000;
1518				++srcp;
1519				++dstp;
1520			}, width);
1521			srcp += srcskip;
1522			dstp += dstskip;
1523		}
1524	}
1525}
1526
1527/* fast ARGB888->(A)RGB888 blending with pixel alpha */
1528static void BlitRGBtoRGBPixelAlpha(SDL_BlitInfo *info)
1529{
1530	int width = info->d_width;
1531	int height = info->d_height;
1532	Uint32 *srcp = (Uint32 *)info->s_pixels;
1533	int srcskip = info->s_skip >> 2;
1534	Uint32 *dstp = (Uint32 *)info->d_pixels;
1535	int dstskip = info->d_skip >> 2;
1536
1537	while(height--) {
1538	    DUFFS_LOOP4({
1539		Uint32 dalpha;
1540		Uint32 d;
1541		Uint32 s1;
1542		Uint32 d1;
1543		Uint32 s = *srcp;
1544		Uint32 alpha = s >> 24;
1545		/* FIXME: Here we special-case opaque alpha since the
1546		   compositioning used (>>8 instead of /255) doesn't handle
1547		   it correctly. Also special-case alpha=0 for speed?
1548		   Benchmark this! */
1549		if(alpha) {
1550		  if(alpha == SDL_ALPHA_OPAQUE) {
1551		    *dstp = (s & 0x00ffffff) | (*dstp & 0xff000000);
1552		  } else {
1553		    /*
1554		     * take out the middle component (green), and process
1555		     * the other two in parallel. One multiply less.
1556		     */
1557		    d = *dstp;
1558		    dalpha = d & 0xff000000;
1559		    s1 = s & 0xff00ff;
1560		    d1 = d & 0xff00ff;
1561		    d1 = (d1 + ((s1 - d1) * alpha >> 8)) & 0xff00ff;
1562		    s &= 0xff00;
1563		    d &= 0xff00;
1564		    d = (d + ((s - d) * alpha >> 8)) & 0xff00;
1565		    *dstp = d1 | d | dalpha;
1566		  }
1567		}
1568		++srcp;
1569		++dstp;
1570	    }, width);
1571	    srcp += srcskip;
1572	    dstp += dstskip;
1573	}
1574}
1575
1576#if GCC_ASMBLIT
1577/* fast (as in MMX with prefetch) ARGB888->(A)RGB888 blending with pixel alpha */
1578static void BlitRGBtoRGBPixelAlphaMMX3DNOW(SDL_BlitInfo *info)
1579{
1580	int width = info->d_width;
1581	int height = info->d_height;
1582	Uint32 *srcp = (Uint32 *)info->s_pixels;
1583	int srcskip = info->s_skip >> 2;
1584	Uint32 *dstp = (Uint32 *)info->d_pixels;
1585	int dstskip = info->d_skip >> 2;
1586	SDL_PixelFormat* sf = info->src;
1587	Uint32 amask = sf->Amask;
1588
1589	__asm__ (
1590	/* make mm6 all zeros. */
1591	"pxor       %%mm6, %%mm6\n"
1592
1593	/* Make a mask to preserve the alpha. */
1594	"movd      %0, %%mm7\n\t"           /* 0000F000 -> mm7 */
1595	"punpcklbw %%mm7, %%mm7\n\t"        /* FF000000 -> mm7 */
1596	"pcmpeqb   %%mm4, %%mm4\n\t"        /* FFFFFFFF -> mm4 */
1597	"movq      %%mm4, %%mm3\n\t"        /* FFFFFFFF -> mm3 (for later) */
1598	"pxor      %%mm4, %%mm7\n\t"        /* 00FFFFFF -> mm7 (mult mask) */
1599
1600	/* form channel masks */
1601	"movq      %%mm7, %%mm4\n\t"        /* 00FFFFFF -> mm4 */
1602	"packsswb  %%mm6, %%mm4\n\t"        /* 00000FFF -> mm4 (channel mask) */
1603	"packsswb  %%mm6, %%mm3\n\t"        /* 0000FFFF -> mm3 */
1604	"pxor      %%mm4, %%mm3\n\t"        /* 0000F000 -> mm3 (~channel mask) */
1605
1606	/* get alpha channel shift */
1607	"movd      %1, %%mm5\n\t" /* Ashift -> mm5 */
1608
1609	  : /* nothing */ : "rm" (amask), "rm" ((Uint32) sf->Ashift) );
1610
1611	while(height--) {
1612
1613	    DUFFS_LOOP4({
1614		Uint32 alpha;
1615
1616		__asm__ (
1617		"prefetch 64(%0)\n"
1618		"prefetch 64(%1)\n"
1619			: : "r" (srcp), "r" (dstp) );
1620
1621		alpha = *srcp & amask;
1622		/* FIXME: Here we special-case opaque alpha since the
1623		   compositioning used (>>8 instead of /255) doesn't handle
1624		   it correctly. Also special-case alpha=0 for speed?
1625		   Benchmark this! */
1626		if(alpha == 0) {
1627		    /* do nothing */
1628		}
1629		else if(alpha == amask) {
1630			/* opaque alpha -- copy RGB, keep dst alpha */
1631		    /* using MMX here to free up regular registers for other things */
1632			    __asm__ (
1633		    "movd      (%0),  %%mm0\n\t" /* src(ARGB) -> mm0 (0000ARGB)*/
1634		    "movd      (%1),  %%mm1\n\t" /* dst(ARGB) -> mm1 (0000ARGB)*/
1635		    "pand      %%mm4, %%mm0\n\t" /* src & chanmask -> mm0 */
1636		    "pand      %%mm3, %%mm1\n\t" /* dst & ~chanmask -> mm2 */
1637		    "por       %%mm0, %%mm1\n\t" /* src | dst -> mm1 */
1638		    "movd      %%mm1, (%1) \n\t" /* mm1 -> dst */
1639
1640		     : : "r" (srcp), "r" (dstp) );
1641		}
1642
1643		else {
1644			    __asm__ (
1645		    /* load in the source, and dst. */
1646		    "movd      (%0), %%mm0\n"		    /* mm0(s) = 0 0 0 0 | As Rs Gs Bs */
1647		    "movd      (%1), %%mm1\n"		    /* mm1(d) = 0 0 0 0 | Ad Rd Gd Bd */
1648
1649		    /* Move the src alpha into mm2 */
1650
1651		    /* if supporting pshufw */
1652		    /*"pshufw     $0x55, %%mm0, %%mm2\n" */ /* mm2 = 0 As 0 As |  0 As  0  As */
1653		    /*"psrlw     $8, %%mm2\n" */
1654
1655		    /* else: */
1656		    "movd       %2,    %%mm2\n"
1657		    "psrld      %%mm5, %%mm2\n"                /* mm2 = 0 0 0 0 | 0  0  0  As */
1658		    "punpcklwd	%%mm2, %%mm2\n"	            /* mm2 = 0 0 0 0 |  0 As  0  As */
1659		    "punpckldq	%%mm2, %%mm2\n"             /* mm2 = 0 As 0 As |  0 As  0  As */
1660		    "pand       %%mm7, %%mm2\n"              /* to preserve dest alpha */
1661
1662		    /* move the colors into words. */
1663		    "punpcklbw %%mm6, %%mm0\n"		    /* mm0 = 0 As 0 Rs | 0 Gs 0 Bs */
1664		    "punpcklbw %%mm6, %%mm1\n"              /* mm0 = 0 Ad 0 Rd | 0 Gd 0 Bd */
1665
1666		    /* src - dst */
1667		    "psubw    %%mm1, %%mm0\n"		    /* mm0 = As-Ad Rs-Rd | Gs-Gd  Bs-Bd */
1668
1669		    /* A * (src-dst) */
1670		    "pmullw    %%mm2, %%mm0\n"		    /* mm0 = 0*As-d As*Rs-d | As*Gs-d  As*Bs-d */
1671		    "psrlw     $8,    %%mm0\n"		    /* mm0 = 0>>8 Rc>>8 | Gc>>8  Bc>>8 */
1672		    "paddb     %%mm1, %%mm0\n"		    /* mm0 = 0+Ad Rc+Rd | Gc+Gd  Bc+Bd */
1673
1674		    "packuswb  %%mm0, %%mm0\n"              /* mm0 =             | Ac Rc Gc Bc */
1675
1676		    "movd      %%mm0, (%1)\n"               /* result in mm0 */
1677
1678		     : : "r" (srcp), "r" (dstp), "r" (alpha) );
1679
1680		}
1681		++srcp;
1682		++dstp;
1683	    }, width);
1684	    srcp += srcskip;
1685	    dstp += dstskip;
1686	}
1687
1688	__asm__ (
1689	"emms\n"
1690		:   );
1691}
1692/* End GCC_ASMBLIT*/
1693
1694#elif MSVC_ASMBLIT
1695/* fast (as in MMX with prefetch) ARGB888->(A)RGB888 blending with pixel alpha */
1696static void BlitRGBtoRGBPixelAlphaMMX3DNOW(SDL_BlitInfo *info)
1697{
1698	int width = info->d_width;
1699	int height = info->d_height;
1700	Uint32 *srcp = (Uint32 *)info->s_pixels;
1701	int srcskip = info->s_skip >> 2;
1702	Uint32 *dstp = (Uint32 *)info->d_pixels;
1703	int dstskip = info->d_skip >> 2;
1704	SDL_PixelFormat* sf = info->src;
1705	Uint32 chanmask = sf->Rmask | sf->Gmask | sf->Bmask;
1706	Uint32 amask = sf->Amask;
1707	Uint32 ashift = sf->Ashift;
1708	Uint64 multmask;
1709
1710	__m64 src1, dst1, mm_alpha, mm_zero, dmask;
1711
1712	mm_zero = _mm_setzero_si64(); /* 0 -> mm_zero */
1713	multmask = ~(0xFFFFi64 << (ashift * 2));
1714	dmask = *(__m64*) &multmask; /* dst alpha mask -> dmask */
1715
1716	while(height--) {
1717	    DUFFS_LOOP4({
1718		Uint32 alpha;
1719
1720		_m_prefetch(srcp + 16);
1721		_m_prefetch(dstp + 16);
1722
1723		alpha = *srcp & amask;
1724		if (alpha == 0) {
1725			/* do nothing */
1726		} else if (alpha == amask) {
1727			/* copy RGB, keep dst alpha */
1728			*dstp = (*srcp & chanmask) | (*dstp & ~chanmask);
1729		} else {
1730			src1 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src1 (0000ARGB)*/
1731			src1 = _mm_unpacklo_pi8(src1, mm_zero); /* 0A0R0G0B -> src1 */
1732
1733			dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB)*/
1734			dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
1735
1736			mm_alpha = _mm_cvtsi32_si64(alpha); /* alpha -> mm_alpha (0000000A) */
1737			mm_alpha = _mm_srli_si64(mm_alpha, ashift); /* mm_alpha >> ashift -> mm_alpha(0000000A) */
1738			mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
1739			mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
1740			mm_alpha = _mm_and_si64(mm_alpha, dmask); /* 000A0A0A -> mm_alpha, preserve dst alpha on add */
1741
1742			/* blend */
1743			src1 = _mm_sub_pi16(src1, dst1);/* src - dst -> src1 */
1744			src1 = _mm_mullo_pi16(src1, mm_alpha); /* (src - dst) * alpha -> src1 */
1745			src1 = _mm_srli_pi16(src1, 8); /* src1 >> 8 -> src1(000R0G0B) */
1746			dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1(dst) -> dst1(0A0R0G0B) */
1747			dst1 = _mm_packs_pu16(dst1, mm_zero);  /* 0000ARGB -> dst1 */
1748
1749			*dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
1750		}
1751		++srcp;
1752		++dstp;
1753	    }, width);
1754	    srcp += srcskip;
1755	    dstp += dstskip;
1756	}
1757	_mm_empty();
1758}
1759/* End MSVC_ASMBLIT */
1760
1761#endif /* GCC_ASMBLIT, MSVC_ASMBLIT */
1762
1763/* 16bpp special case for per-surface alpha=50%: blend 2 pixels in parallel */
1764
/*
 * Blend a single 16 bit pixel at 50%.
 *
 * d, s  - destination and source pixel values
 * mask  - channel mask with the lowest bit of every channel cleared
 *         (the callers in this file pass 0xf7de for 565, 0xfbde for 555)
 *
 * The masked halves are averaged with one add and one shift; the term
 * (s & d & ~mask) puts back the low channel bits that both inputs had set.
 * Every argument is parenthesized so expressions can be passed safely;
 * note that each argument is evaluated more than once.
 */
#define BLEND16_50(d, s, mask)						\
	(((((s) & (mask)) + ((d) & (mask))) >> 1) +			\
	 ((s) & (d) & (~(mask) & 0xffff)))

/*
 * Blend two 16 bit pixels, packed in one 32 bit word, at 50%.
 *
 * The 16 bit mask is replicated into both halves of the word.  It is
 * converted to unsigned before shifting: a Uint16 argument would otherwise
 * be promoted to signed int, and shifting e.g. 0xf7de left by 16 overflows
 * a 32-bit int, which is undefined behaviour.
 */
#define BLEND2x16_50(d, s, mask)					     \
	((((s) & ((unsigned)(mask) | (unsigned)(mask) << 16)) >> 1)	     \
	 + (((d) & ((unsigned)(mask) | (unsigned)(mask) << 16)) >> 1)	     \
	 + ((s) & (d) & ~((unsigned)(mask) | (unsigned)(mask) << 16)))
1773
1774static void Blit16to16SurfaceAlpha128(SDL_BlitInfo *info, Uint16 mask)
1775{
1776	int width = info->d_width;
1777	int height = info->d_height;
1778	Uint16 *srcp = (Uint16 *)info->s_pixels;
1779	int srcskip = info->s_skip >> 1;
1780	Uint16 *dstp = (Uint16 *)info->d_pixels;
1781	int dstskip = info->d_skip >> 1;
1782
1783	while(height--) {
1784		if(((uintptr_t)srcp ^ (uintptr_t)dstp) & 2) {
1785			/*
1786			 * Source and destination not aligned, pipeline it.
1787			 * This is mostly a win for big blits but no loss for
1788			 * small ones
1789			 */
1790			Uint32 prev_sw;
1791			int w = width;
1792
1793			/* handle odd destination */
1794			if((uintptr_t)dstp & 2) {
1795				Uint16 d = *dstp, s = *srcp;
1796				*dstp = BLEND16_50(d, s, mask);
1797				dstp++;
1798				srcp++;
1799				w--;
1800			}
1801			srcp++;	/* srcp is now 32-bit aligned */
1802
1803			/* bootstrap pipeline with first halfword */
1804			prev_sw = ((Uint32 *)srcp)[-1];
1805
1806			while(w > 1) {
1807				Uint32 sw, dw, s;
1808				sw = *(Uint32 *)srcp;
1809				dw = *(Uint32 *)dstp;
1810#if SDL_BYTEORDER == SDL_BIG_ENDIAN
1811				s = (prev_sw << 16) + (sw >> 16);
1812#else
1813				s = (prev_sw >> 16) + (sw << 16);
1814#endif
1815				prev_sw = sw;
1816				*(Uint32 *)dstp = BLEND2x16_50(dw, s, mask);
1817				dstp += 2;
1818				srcp += 2;
1819				w -= 2;
1820			}
1821
1822			/* final pixel if any */
1823			if(w) {
1824				Uint16 d = *dstp, s;
1825#if SDL_BYTEORDER == SDL_BIG_ENDIAN
1826				s = (Uint16)prev_sw;
1827#else
1828				s = (Uint16)(prev_sw >> 16);
1829#endif
1830				*dstp = BLEND16_50(d, s, mask);
1831				srcp++;
1832				dstp++;
1833			}
1834			srcp += srcskip - 1;
1835			dstp += dstskip;
1836		} else {
1837			/* source and destination are aligned */
1838			int w = width;
1839
1840			/* first odd pixel? */
1841			if((uintptr_t)srcp & 2) {
1842				Uint16 d = *dstp, s = *srcp;
1843				*dstp = BLEND16_50(d, s, mask);
1844				srcp++;
1845				dstp++;
1846				w--;
1847			}
1848			/* srcp and dstp are now 32-bit aligned */
1849
1850			while(w > 1) {
1851				Uint32 sw = *(Uint32 *)srcp;
1852				Uint32 dw = *(Uint32 *)dstp;
1853				*(Uint32 *)dstp = BLEND2x16_50(dw, sw, mask);
1854				srcp += 2;
1855				dstp += 2;
1856				w -= 2;
1857			}
1858
1859			/* last odd pixel? */
1860			if(w) {
1861				Uint16 d = *dstp, s = *srcp;
1862				*dstp = BLEND16_50(d, s, mask);
1863				srcp++;
1864				dstp++;
1865			}
1866			srcp += srcskip;
1867			dstp += dstskip;
1868		}
1869	}
1870}
1871
1872#if GCC_ASMBLIT
1873/* fast RGB565->RGB565 blending with surface alpha */
1874static void Blit565to565SurfaceAlphaMMX(SDL_BlitInfo *info)
1875{
1876	unsigned alpha = info->src->alpha; /* downscale alpha to 5 bits */
1877	if(alpha == 128) {
1878		Blit16to16SurfaceAlpha128(info, 0xf7de);
1879	} else {
1880		int width = info->d_width;
1881		int height = info->d_height;
1882		Uint16 *srcp = (Uint16 *)info->s_pixels;
1883		int srcskip = info->s_skip >> 1;
1884		Uint16 *dstp = (Uint16 *)info->d_pixels;
1885		int dstskip = info->d_skip >> 1;
1886		Uint32 s, d;
1887		Uint64 load;
1888
1889		alpha &= ~(1+2+4);		/* cut alpha to get the exact same behaviour */
1890		load = alpha;
1891		alpha >>= 3;		/* downscale alpha to 5 bits */
1892
1893		movq_m2r(load, mm0); /* alpha(0000000A) -> mm0 */
1894		punpcklwd_r2r(mm0, mm0); /* 00000A0A -> mm0 */
1895		punpcklwd_r2r(mm0, mm0); /* 0A0A0A0A -> mm0 */
1896		/* position alpha to allow for mullo and mulhi on diff channels
1897		   to reduce the number of operations */
1898		psllq_i2r(3, mm0);
1899
1900		/* Setup the 565 color channel masks */
1901		load = 0x07E007E007E007E0ULL;
1902		movq_m2r(load, mm4); /* MASKGREEN -> mm4 */
1903		load = 0x001F001F001F001FULL;
1904		movq_m2r(load, mm7); /* MASKBLUE -> mm7 */
1905		while(height--) {
1906			DUFFS_LOOP_QUATRO2(
1907			{
1908				s = *srcp++;
1909				d = *dstp;
1910				/*
1911				 * shift out the middle component (green) to
1912				 * the high 16 bits, and process all three RGB
1913				 * components at the same time.
1914				 */
1915				s = (s | s << 16) & 0x07e0f81f;
1916				d = (d | d << 16) & 0x07e0f81f;
1917				d += (s - d) * alpha >> 5;
1918				d &= 0x07e0f81f;
1919				*dstp++ = d | d >> 16;
1920			},{
1921				s = *srcp++;
1922				d = *dstp;
1923				/*
1924				 * shift out the middle component (green) to
1925				 * the high 16 bits, and process all three RGB
1926				 * components at the same time.
1927				 */
1928				s = (s | s << 16) & 0x07e0f81f;
1929				d = (d | d << 16) & 0x07e0f81f;
1930				d += (s - d) * alpha >> 5;
1931				d &= 0x07e0f81f;
1932				*dstp++ = d | d >> 16;
1933				s = *srcp++;
1934				d = *dstp;
1935				/*
1936				 * shift out the middle component (green) to
1937				 * the high 16 bits, and process all three RGB
1938				 * components at the same time.
1939				 */
1940				s = (s | s << 16) & 0x07e0f81f;
1941				d = (d | d << 16) & 0x07e0f81f;
1942				d += (s - d) * alpha >> 5;
1943				d &= 0x07e0f81f;
1944				*dstp++ = d | d >> 16;
1945			},{
1946				movq_m2r((*srcp), mm2);/* 4 src pixels -> mm2 */
1947				movq_m2r((*dstp), mm3);/* 4 dst pixels -> mm3 */
1948
1949				/* red -- does not need a mask since the right shift clears
1950				   the uninteresting bits */
1951				movq_r2r(mm2, mm5); /* src -> mm5 */
1952				movq_r2r(mm3, mm6); /* dst -> mm6 */
1953				psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 [000r 000r 000r 000r] */
1954				psrlw_i2r(11, mm6); /* mm6 >> 11 -> mm6 [000r 000r 000r 000r] */
1955
1956				/* blend */
1957				psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
1958				pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
1959				/* alpha used is actually 11 bits
1960				   11 + 5 = 16 bits, so the sign bits are lost */
1961				psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 */
1962				paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
1963				psllw_i2r(11, mm6); /* mm6 << 11 -> mm6 */
1964
1965				movq_r2r(mm6, mm1); /* save new reds in dsts */
1966
1967				/* green -- process the bits in place */
1968				movq_r2r(mm2, mm5); /* src -> mm5 */
1969				movq_r2r(mm3, mm6); /* dst -> mm6 */
1970				pand_r2r(mm4, mm5); /* src & MASKGREEN -> mm5 */
1971				pand_r2r(mm4, mm6); /* dst & MASKGREEN -> mm6 */
1972
1973				/* blend */
1974				psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
1975				pmulhw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
1976				/* 11 + 11 - 16 = 6 bits, so all the lower uninteresting
1977				   bits are gone and the sign bits present */
1978				psllw_i2r(5, mm5); /* mm5 << 5 -> mm5 */
1979				paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
1980
1981				por_r2r(mm6, mm1); /* save new greens in dsts */
1982
1983				/* blue */
1984				movq_r2r(mm2, mm5); /* src -> mm5 */
1985				movq_r2r(mm3, mm6); /* dst -> mm6 */
1986				pand_r2r(mm7, mm5); /* src & MASKBLUE -> mm5[000b 000b 000b 000b] */
1987				pand_r2r(mm7, mm6); /* dst & MASKBLUE -> mm6[000b 000b 000b 000b] */
1988
1989				/* blend */
1990				psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
1991				pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
1992				/* 11 + 5 = 16 bits, so the sign bits are lost and
1993				   the interesting bits will need to be MASKed */
1994				psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 */
1995				paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
1996				pand_r2r(mm7, mm6); /* mm6 & MASKBLUE -> mm6[000b 000b 000b 000b] */
1997
1998				por_r2r(mm6, mm1); /* save new blues in dsts */
1999
2000				movq_r2m(mm1, *dstp); /* mm1 -> 4 dst pixels */
2001
2002				srcp += 4;
2003				dstp += 4;
2004			}, width);
2005			srcp += srcskip;
2006			dstp += dstskip;
2007		}
2008		emms();
2009	}
2010}
2011
2012/* fast RGB555->RGB555 blending with surface alpha */
2013static void Blit555to555SurfaceAlphaMMX(SDL_BlitInfo *info)
2014{
2015	unsigned alpha = info->src->alpha; /* downscale alpha to 5 bits */
2016	if(alpha == 128) {
2017		Blit16to16SurfaceAlpha128(info, 0xfbde);
2018	} else {
2019		int width = info->d_width;
2020		int height = info->d_height;
2021		Uint16 *srcp = (Uint16 *)info->s_pixels;
2022		int srcskip = info->s_skip >> 1;
2023		Uint16 *dstp = (Uint16 *)info->d_pixels;
2024		int dstskip = info->d_skip >> 1;
2025		Uint32 s, d;
2026		Uint64 load;
2027
2028		alpha &= ~(1+2+4);		/* cut alpha to get the exact same behaviour */
2029		load = alpha;
2030		alpha >>= 3;		/* downscale alpha to 5 bits */
2031
2032		movq_m2r(load, mm0); /* alpha(0000000A) -> mm0 */
2033		punpcklwd_r2r(mm0, mm0); /* 00000A0A -> mm0 */
2034		punpcklwd_r2r(mm0, mm0); /* 0A0A0A0A -> mm0 */
2035		/* position alpha to allow for mullo and mulhi on diff channels
2036		   to reduce the number of operations */
2037		psllq_i2r(3, mm0);
2038
2039		/* Setup the 555 color channel masks */
2040		load = 0x03E003E003E003E0ULL;
2041		movq_m2r(load, mm4); /* MASKGREEN -> mm4 */
2042		load = 0x001F001F001F001FULL;
2043		movq_m2r(load, mm7); /* MASKBLUE -> mm7 */
2044		while(height--) {
2045			DUFFS_LOOP_QUATRO2(
2046			{
2047				s = *srcp++;
2048				d = *dstp;
2049				/*
2050				 * shift out the middle component (green) to
2051				 * the high 16 bits, and process all three RGB
2052				 * components at the same time.
2053				 */
2054				s = (s | s << 16) & 0x03e07c1f;
2055				d = (d | d << 16) & 0x03e07c1f;
2056				d += (s - d) * alpha >> 5;
2057				d &= 0x03e07c1f;
2058				*dstp++ = d | d >> 16;
2059			},{
2060				s = *srcp++;
2061				d = *dstp;
2062				/*
2063				 * shift out the middle component (green) to
2064				 * the high 16 bits, and process all three RGB
2065				 * components at the same time.
2066				 */
2067				s = (s | s << 16) & 0x03e07c1f;
2068				d = (d | d << 16) & 0x03e07c1f;
2069				d += (s - d) * alpha >> 5;
2070				d &= 0x03e07c1f;
2071				*dstp++ = d | d >> 16;
2072			        s = *srcp++;
2073				d = *dstp;
2074				/*
2075				 * shift out the middle component (green) to
2076				 * the high 16 bits, and process all three RGB
2077				 * components at the same time.
2078				 */
2079				s = (s | s << 16) & 0x03e07c1f;
2080				d = (d | d << 16) & 0x03e07c1f;
2081				d += (s - d) * alpha >> 5;
2082				d &= 0x03e07c1f;
2083				*dstp++ = d | d >> 16;
2084			},{
2085				movq_m2r((*srcp), mm2);/* 4 src pixels -> mm2 */
2086				movq_m2r((*dstp), mm3);/* 4 dst pixels -> mm3 */
2087
2088				/* red -- process the bits in place */
2089				psllq_i2r(5, mm4); /* turn MASKGREEN into MASKRED */
2090					/* by reusing the GREEN mask we free up another mmx
2091					   register to accumulate the result */
2092
2093				movq_r2r(mm2, mm5); /* src -> mm5 */
2094				movq_r2r(mm3, mm6); /* dst -> mm6 */
2095				pand_r2r(mm4, mm5); /* src & MASKRED -> mm5 */
2096				pand_r2r(mm4, mm6); /* dst & MASKRED -> mm6 */
2097
2098				/* blend */
2099				psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
2100				pmulhw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
2101				/* 11 + 15 - 16 = 10 bits, uninteresting bits will be
2102				   cleared by a MASK below */
2103				psllw_i2r(5, mm5); /* mm5 << 5 -> mm5 */
2104				paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
2105				pand_r2r(mm4, mm6); /* mm6 & MASKRED -> mm6 */
2106
2107				psrlq_i2r(5, mm4); /* turn MASKRED back into MASKGREEN */
2108
2109				movq_r2r(mm6, mm1); /* save new reds in dsts */
2110
2111				/* green -- process the bits in place */
2112				movq_r2r(mm2, mm5); /* src -> mm5 */
2113				movq_r2r(mm3, mm6); /* dst -> mm6 */
2114				pand_r2r(mm4, mm5); /* src & MASKGREEN -> mm5 */
2115				pand_r2r(mm4, mm6); /* dst & MASKGREEN -> mm6 */
2116
2117				/* blend */
2118				psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
2119				pmulhw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
2120				/* 11 + 10 - 16 = 5 bits,  so all the lower uninteresting
2121				   bits are gone and the sign bits present */
2122				psllw_i2r(5, mm5); /* mm5 << 5 -> mm5 */
2123				paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
2124
2125				por_r2r(mm6, mm1); /* save new greens in dsts */
2126
2127				/* blue */
2128				movq_r2r(mm2, mm5); /* src -> mm5 */
2129				movq_r2r(mm3, mm6); /* dst -> mm6 */
2130				pand_r2r(mm7, mm5); /* src & MASKBLUE -> mm5[000b 000b 000b 000b] */
2131				pand_r2r(mm7, mm6); /* dst & MASKBLUE -> mm6[000b 000b 000b 000b] */
2132
2133				/* blend */
2134				psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
2135				pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
2136				/* 11 + 5 = 16 bits, so the sign bits are lost and
2137				   the interesting bits will need to be MASKed */
2138				psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 */
2139				paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
2140				pand_r2r(mm7, mm6); /* mm6 & MASKBLUE -> mm6[000b 000b 000b 000b] */
2141
2142				por_r2r(mm6, mm1); /* save new blues in dsts */
2143
2144				movq_r2m(mm1, *dstp);/* mm1 -> 4 dst pixels */
2145
2146				srcp += 4;
2147				dstp += 4;
2148			}, width);
2149			srcp += srcskip;
2150			dstp += dstskip;
2151		}
2152		emms();
2153	}
2154}
2155/* End GCC_ASMBLIT */
2156
2157#elif MSVC_ASMBLIT
2158/* fast RGB565->RGB565 blending with surface alpha */
2159static void Blit565to565SurfaceAlphaMMX(SDL_BlitInfo *info)
2160{
2161	unsigned alpha = info->src->alpha;
2162	if(alpha == 128) {
2163		Blit16to16SurfaceAlpha128(info, 0xf7de);
2164	} else {
2165		int width = info->d_width;
2166		int height = info->d_height;
2167		Uint16 *srcp = (Uint16 *)info->s_pixels;
2168		int srcskip = info->s_skip >> 1;
2169		Uint16 *dstp = (Uint16 *)info->d_pixels;
2170		int dstskip = info->d_skip >> 1;
2171		Uint32 s, d;
2172
2173		__m64 src1, dst1, src2, dst2, gmask, bmask, mm_res, mm_alpha;
2174
2175		alpha &= ~(1+2+4);		/* cut alpha to get the exact same behaviour */
2176		mm_alpha = _mm_set_pi32(0, alpha); /* 0000000A -> mm_alpha */
2177		alpha >>= 3;		/* downscale alpha to 5 bits */
2178
2179		mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
2180		mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
2181		/* position alpha to allow for mullo and mulhi on diff channels
2182		   to reduce the number of operations */
2183		mm_alpha = _mm_slli_si64(mm_alpha, 3);
2184
2185		/* Setup the 565 color channel masks */
2186		gmask = _mm_set_pi32(0x07E007E0, 0x07E007E0); /* MASKGREEN -> gmask */
2187		bmask = _mm_set_pi32(0x001F001F, 0x001F001F); /* MASKBLUE -> bmask */
2188
2189		while(height--) {
2190			DUFFS_LOOP_QUATRO2(
2191			{
2192				s = *srcp++;
2193				d = *dstp;
2194				/*
2195				 * shift out the middle component (green) to
2196				 * the high 16 bits, and process all three RGB
2197				 * components at the same time.
2198				 */
2199				s = (s | s << 16) & 0x07e0f81f;
2200				d = (d | d << 16) & 0x07e0f81f;
2201				d += (s - d) * alpha >> 5;
2202				d &= 0x07e0f81f;
2203				*dstp++ = (Uint16)(d | d >> 16);
2204			},{
2205				s = *srcp++;
2206				d = *dstp;
2207				/*
2208				 * shift out the middle component (green) to
2209				 * the high 16 bits, and process all three RGB
2210				 * components at the same time.
2211				 */
2212				s = (s | s << 16) & 0x07e0f81f;
2213				d = (d | d << 16) & 0x07e0f81f;
2214				d += (s - d) * alpha >> 5;
2215				d &= 0x07e0f81f;
2216				*dstp++ = (Uint16)(d | d >> 16);
2217				s = *srcp++;
2218				d = *dstp;
2219				/*
2220				 * shift out the middle component (green) to
2221				 * the high 16 bits, and process all three RGB
2222				 * components at the same time.
2223				 */
2224				s = (s | s << 16) & 0x07e0f81f;
2225				d = (d | d << 16) & 0x07e0f81f;
2226				d += (s - d) * alpha >> 5;
2227				d &= 0x07e0f81f;
2228				*dstp++ = (Uint16)(d | d >> 16);
2229			},{
2230				src1 = *(__m64*)srcp; /* 4 src pixels -> src1 */
2231				dst1 = *(__m64*)dstp; /* 4 dst pixels -> dst1 */
2232
2233				/* red */
2234				src2 = src1;
2235				src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 [000r 000r 000r 000r] */
2236
2237				dst2 = dst1;
2238				dst2 = _mm_srli_pi16(dst2, 11); /* dst2 >> 11 -> dst2 [000r 000r 000r 000r] */
2239
2240				/* blend */
2241				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
2242				src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
2243				src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
2244				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
2245				dst2 = _mm_slli_pi16(dst2, 11); /* dst2 << 11 -> dst2 */
2246
2247				mm_res = dst2; /* RED -> mm_res */
2248
2249				/* green -- process the bits in place */
2250				src2 = src1;
2251				src2 = _mm_and_si64(src2, gmask); /* src & MASKGREEN -> src2 */
2252
2253				dst2 = dst1;
2254				dst2 = _mm_and_si64(dst2, gmask); /* dst & MASKGREEN -> dst2 */
2255
2256				/* blend */
2257				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
2258				src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
2259				src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
2260				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
2261
2262				mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN -> mm_res */
2263
2264				/* blue */
2265				src2 = src1;
2266				src2 = _mm_and_si64(src2, bmask); /* src & MASKBLUE -> src2[000b 000b 000b 000b] */
2267
2268				dst2 = dst1;
2269				dst2 = _mm_and_si64(dst2, bmask); /* dst & MASKBLUE -> dst2[000b 000b 000b 000b] */
2270
2271				/* blend */
2272				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
2273				src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
2274				src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
2275				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
2276				dst2 = _mm_and_si64(dst2, bmask); /* dst2 & MASKBLUE -> dst2 */
2277
2278				mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN | BLUE -> mm_res */
2279
2280				*(__m64*)dstp = mm_res; /* mm_res -> 4 dst pixels */
2281
2282				srcp += 4;
2283				dstp += 4;
2284			}, width);
2285			srcp += srcskip;
2286			dstp += dstskip;
2287		}
2288		_mm_empty();
2289	}
2290}
2291
2292/* fast RGB555->RGB555 blending with surface alpha */
2293static void Blit555to555SurfaceAlphaMMX(SDL_BlitInfo *info)
2294{
2295	unsigned alpha = info->src->alpha;
2296	if(alpha == 128) {
2297		Blit16to16SurfaceAlpha128(info, 0xfbde);
2298	} else {
2299		int width = info->d_width;
2300		int height = info->d_height;
2301		Uint16 *srcp = (Uint16 *)info->s_pixels;
2302		int srcskip = info->s_skip >> 1;
2303		Uint16 *dstp = (Uint16 *)info->d_pixels;
2304		int dstskip = info->d_skip >> 1;
2305		Uint32 s, d;
2306
2307		__m64 src1, dst1, src2, dst2, rmask, gmask, bmask, mm_res, mm_alpha;
2308
2309		alpha &= ~(1+2+4);		/* cut alpha to get the exact same behaviour */
2310		mm_alpha = _mm_set_pi32(0, alpha); /* 0000000A -> mm_alpha */
2311		alpha >>= 3;		/* downscale alpha to 5 bits */
2312
2313		mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
2314		mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
2315		/* position alpha to allow for mullo and mulhi on diff channels
2316		   to reduce the number of operations */
2317		mm_alpha = _mm_slli_si64(mm_alpha, 3);
2318
2319		/* Setup the 555 color channel masks */
2320		rmask = _mm_set_pi32(0x7C007C00, 0x7C007C00); /* MASKRED -> rmask */
2321		gmask = _mm_set_pi32(0x03E003E0, 0x03E003E0); /* MASKGREEN -> gmask */
2322		bmask = _mm_set_pi32(0x001F001F, 0x001F001F); /* MASKBLUE -> bmask */
2323
2324		while(height--) {
2325			DUFFS_LOOP_QUATRO2(
2326			{
2327				s = *srcp++;
2328				d = *dstp;
2329				/*
2330				 * shift out the middle component (green) to
2331				 * the high 16 bits, and process all three RGB
2332				 * components at the same time.
2333				 */
2334				s = (s | s << 16) & 0x03e07c1f;
2335				d = (d | d << 16) & 0x03e07c1f;
2336				d += (s - d) * alpha >> 5;
2337				d &= 0x03e07c1f;
2338				*dstp++ = (Uint16)(d | d >> 16);
2339			},{
2340				s = *srcp++;
2341				d = *dstp;
2342				/*
2343				 * shift out the middle component (green) to
2344				 * the high 16 bits, and process all three RGB
2345				 * components at the same time.
2346				 */
2347				s = (s | s << 16) & 0x03e07c1f;
2348				d = (d | d << 16) & 0x03e07c1f;
2349				d += (s - d) * alpha >> 5;
2350				d &= 0x03e07c1f;
2351				*dstp++ = (Uint16)(d | d >> 16);
2352			        s = *srcp++;
2353				d = *dstp;
2354				/*
2355				 * shift out the middle component (green) to
2356				 * the high 16 bits, and process all three RGB
2357				 * components at the same time.
2358				 */
2359				s = (s | s << 16) & 0x03e07c1f;
2360				d = (d | d << 16) & 0x03e07c1f;
2361				d += (s - d) * alpha >> 5;
2362				d &= 0x03e07c1f;
2363				*dstp++ = (Uint16)(d | d >> 16);
2364			},{
2365				src1 = *(__m64*)srcp; /* 4 src pixels -> src1 */
2366				dst1 = *(__m64*)dstp; /* 4 dst pixels -> dst1 */
2367
2368				/* red -- process the bits in place */
2369				src2 = src1;
2370				src2 = _mm_and_si64(src2, rmask); /* src & MASKRED -> src2 */
2371
2372				dst2 = dst1;
2373				dst2 = _mm_and_si64(dst2, rmask); /* dst & MASKRED -> dst2 */
2374
2375				/* blend */
2376				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
2377				src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
2378				src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
2379				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
2380				dst2 = _mm_and_si64(dst2, rmask); /* dst2 & MASKRED -> dst2 */
2381
2382				mm_res = dst2; /* RED -> mm_res */
2383
2384				/* green -- process the bits in place */
2385				src2 = src1;
2386				src2 = _mm_and_si64(src2, gmask); /* src & MASKGREEN -> src2 */
2387
2388				dst2 = dst1;
2389				dst2 = _mm_and_si64(dst2, gmask); /* dst & MASKGREEN -> dst2 */
2390
2391				/* blend */
2392				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
2393				src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
2394				src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
2395				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
2396
2397				mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN -> mm_res */
2398
2399				/* blue */
2400				src2 = src1; /* src -> src2 */
2401				src2 = _mm_and_si64(src2, bmask); /* src & MASKBLUE -> src2[000b 000b 000b 000b] */
2402
2403				dst2 = dst1; /* dst -> dst2 */
2404				dst2 = _mm_and_si64(dst2, bmask); /* dst & MASKBLUE -> dst2[000b 000b 000b 000b] */
2405
2406				/* blend */
2407				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
2408				src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
2409				src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
2410				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
2411				dst2 = _mm_and_si64(dst2, bmask); /* dst2 & MASKBLUE -> dst2 */
2412
2413				mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN | BLUE -> mm_res */
2414
2415				*(__m64*)dstp = mm_res; /* mm_res -> 4 dst pixels */
2416
2417				srcp += 4;
2418				dstp += 4;
2419			}, width);
2420			srcp += srcskip;
2421			dstp += dstskip;
2422		}
2423		_mm_empty();
2424	}
2425}
2426#endif /* GCC_ASMBLIT, MSVC_ASMBLIT */
2427
2428/* fast RGB565->RGB565 blending with surface alpha */
2429static void Blit565to565SurfaceAlpha(SDL_BlitInfo *info)
2430{
2431	unsigned alpha = info->src->alpha;
2432	if(alpha == 128) {
2433		Blit16to16SurfaceAlpha128(info, 0xf7de);
2434	} else {
2435		int width = info->d_width;
2436		int height = info->d_height;
2437		Uint16 *srcp = (Uint16 *)info->s_pixels;
2438		int srcskip = info->s_skip >> 1;
2439		Uint16 *dstp = (Uint16 *)info->d_pixels;
2440		int dstskip = info->d_skip >> 1;
2441		alpha >>= 3;	/* downscale alpha to 5 bits */
2442
2443		while(height--) {
2444			DUFFS_LOOP4({
2445				Uint32 s = *srcp++;
2446				Uint32 d = *dstp;
2447				/*
2448				 * shift out the middle component (green) to
2449				 * the high 16 bits, and process all three RGB
2450				 * components at the same time.
2451				 */
2452				s = (s | s << 16) & 0x07e0f81f;
2453				d = (d | d << 16) & 0x07e0f81f;
2454				d += (s - d) * alpha >> 5;
2455				d &= 0x07e0f81f;
2456				*dstp++ = (Uint16)(d | d >> 16);
2457			}, width);
2458			srcp += srcskip;
2459			dstp += dstskip;
2460		}
2461	}
2462}
2463
2464/* fast RGB555->RGB555 blending with surface alpha */
2465static void Blit555to555SurfaceAlpha(SDL_BlitInfo *info)
2466{
2467	unsigned alpha = info->src->alpha; /* downscale alpha to 5 bits */
2468	if(alpha == 128) {
2469		Blit16to16SurfaceAlpha128(info, 0xfbde);
2470	} else {
2471		int width = info->d_width;
2472		int height = info->d_height;
2473		Uint16 *srcp = (Uint16 *)info->s_pixels;
2474		int srcskip = info->s_skip >> 1;
2475		Uint16 *dstp = (Uint16 *)info->d_pixels;
2476		int dstskip = info->d_skip >> 1;
2477		alpha >>= 3;		/* downscale alpha to 5 bits */
2478
2479		while(height--) {
2480			DUFFS_LOOP4({
2481				Uint32 s = *srcp++;
2482				Uint32 d = *dstp;
2483				/*
2484				 * shift out the middle component (green) to
2485				 * the high 16 bits, and process all three RGB
2486				 * components at the same time.
2487				 */
2488				s = (s | s << 16) & 0x03e07c1f;
2489				d = (d | d << 16) & 0x03e07c1f;
2490				d += (s - d) * alpha >> 5;
2491				d &= 0x03e07c1f;
2492				*dstp++ = (Uint16)(d | d >> 16);
2493			}, width);
2494			srcp += srcskip;
2495			dstp += dstskip;
2496		}
2497	}
2498}
2499
2500/* fast ARGB8888->RGB565 blending with pixel alpha */
2501static void BlitARGBto565PixelAlpha(SDL_BlitInfo *info)
2502{
2503	int width = info->d_width;
2504	int height = info->d_height;
2505	Uint32 *srcp = (Uint32 *)info->s_pixels;
2506	int srcskip = info->s_skip >> 2;
2507	Uint16 *dstp = (Uint16 *)info->d_pixels;
2508	int dstskip = info->d_skip >> 1;
2509
2510	while(height--) {
2511	    DUFFS_LOOP4({
2512		Uint32 s = *srcp;
2513		unsigned alpha = s >> 27; /* downscale alpha to 5 bits */
2514		/* FIXME: Here we special-case opaque alpha since the
2515		   compositioning used (>>8 instead of /255) doesn't handle
2516		   it correctly. Also special-case alpha=0 for speed?
2517		   Benchmark this! */
2518		if(alpha) {
2519		  if(alpha == (SDL_ALPHA_OPAQUE >> 3)) {
2520		    *dstp = (Uint16)((s >> 8 & 0xf800) + (s >> 5 & 0x7e0) + (s >> 3  & 0x1f));
2521		  } else {
2522		    Uint32 d = *dstp;
2523		    /*
2524		     * convert source and destination to G0RAB65565
2525		     * and blend all components at the same time
2526		     */
2527		    s = ((s & 0xfc00) << 11) + (s >> 8 & 0xf800)
2528		      + (s >> 3 & 0x1f);
2529		    d = (d | d << 16) & 0x07e0f81f;
2530		    d += (s - d) * alpha >> 5;
2531		    d &= 0x07e0f81f;
2532		    *dstp = (Uint16)(d | d >> 16);
2533		  }
2534		}
2535		srcp++;
2536		dstp++;
2537	    }, width);
2538	    srcp += srcskip;
2539	    dstp += dstskip;
2540	}
2541}
2542
2543/* fast ARGB8888->RGB555 blending with pixel alpha */
2544static void BlitARGBto555PixelAlpha(SDL_BlitInfo *info)
2545{
2546	int width = info->d_width;
2547	int height = info->d_height;
2548	Uint32 *srcp = (Uint32 *)info->s_pixels;
2549	int srcskip = info->s_skip >> 2;
2550	Uint16 *dstp = (Uint16 *)info->d_pixels;
2551	int dstskip = info->d_skip >> 1;
2552
2553	while(height--) {
2554	    DUFFS_LOOP4({
2555		unsigned alpha;
2556		Uint32 s = *srcp;
2557		alpha = s >> 27; /* downscale alpha to 5 bits */
2558		/* FIXME: Here we special-case opaque alpha since the
2559		   compositioning used (>>8 instead of /255) doesn't handle
2560		   it correctly. Also special-case alpha=0 for speed?
2561		   Benchmark this! */
2562		if(alpha) {
2563		  if(alpha == (SDL_ALPHA_OPAQUE >> 3)) {
2564		    *dstp = (Uint16)((s >> 9 & 0x7c00) + (s >> 6 & 0x3e0) + (s >> 3  & 0x1f));
2565		  } else {
2566		    Uint32 d = *dstp;
2567		    /*
2568		     * convert source and destination to G0RAB65565
2569		     * and blend all components at the same time
2570		     */
2571		    s = ((s & 0xf800) << 10) + (s >> 9 & 0x7c00)
2572		      + (s >> 3 & 0x1f);
2573		    d = (d | d << 16) & 0x03e07c1f;
2574		    d += (s - d) * alpha >> 5;
2575		    d &= 0x03e07c1f;
2576		    *dstp = (Uint16)(d | d >> 16);
2577		  }
2578		}
2579		srcp++;
2580		dstp++;
2581	    }, width);
2582	    srcp += srcskip;
2583	    dstp += dstskip;
2584	}
2585}
2586
2587/* General (slow) N->N blending with per-surface alpha */
2588static void BlitNtoNSurfaceAlpha(SDL_BlitInfo *info)
2589{
2590	int width = info->d_width;
2591	int height = info->d_height;
2592	Uint8 *src = info->s_pixels;
2593	int srcskip = info->s_skip;
2594	Uint8 *dst = info->d_pixels;
2595	int dstskip = info->d_skip;
2596	SDL_PixelFormat *srcfmt = info->src;
2597	SDL_PixelFormat *dstfmt = info->dst;
2598	int srcbpp = srcfmt->BytesPerPixel;
2599	int dstbpp = dstfmt->BytesPerPixel;
2600	unsigned sA = srcfmt->alpha;
2601	unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
2602
2603	if(sA) {
2604	  while ( height-- ) {
2605	    DUFFS_LOOP4(
2606	    {
2607		Uint32 Pixel;
2608		unsigned sR;
2609		unsigned sG;
2610		unsigned sB;
2611		unsigned dR;
2612		unsigned dG;
2613		unsigned dB;
2614		DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
2615		DISEMBLE_RGB(dst, dstbpp, dstfmt, Pixel, dR, dG, dB);
2616		ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
2617		ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
2618		src += srcbpp;
2619		dst += dstbpp;
2620	    },
2621	    width);
2622	    src += srcskip;
2623	    dst += dstskip;
2624	  }
2625	}
2626}
2627
2628/* General (slow) colorkeyed N->N blending with per-surface alpha */
2629static void BlitNtoNSurfaceAlphaKey(SDL_BlitInfo *info)
2630{
2631	int width = info->d_width;
2632	int height = info->d_height;
2633	Uint8 *src = info->s_pixels;
2634	int srcskip = info->s_skip;
2635	Uint8 *dst = info->d_pixels;
2636	int dstskip = info->d_skip;
2637	SDL_PixelFormat *srcfmt = info->src;
2638	SDL_PixelFormat *dstfmt = info->dst;
2639	Uint32 ckey = srcfmt->colorkey;
2640	int srcbpp = srcfmt->BytesPerPixel;
2641	int dstbpp = dstfmt->BytesPerPixel;
2642	unsigned sA = srcfmt->alpha;
2643	unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
2644
2645	while ( height-- ) {
2646	    DUFFS_LOOP4(
2647	    {
2648		Uint32 Pixel;
2649		unsigned sR;
2650		unsigned sG;
2651		unsigned sB;
2652		unsigned dR;
2653		unsigned dG;
2654		unsigned dB;
2655		RETRIEVE_RGB_PIXEL(src, srcbpp, Pixel);
2656		if(sA && Pixel != ckey) {
2657		    RGB_FROM_PIXEL(Pixel, srcfmt, sR, sG, sB);
2658		    DISEMBLE_RGB(dst, dstbpp, dstfmt, Pixel, dR, dG, dB);
2659		    ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
2660		    ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
2661		}
2662		src += srcbpp;
2663		dst += dstbpp;
2664	    },
2665	    width);
2666	    src += srcskip;
2667	    dst += dstskip;
2668	}
2669}
2670
2671/* General (slow) N->N blending with pixel alpha */
2672static void BlitNtoNPixelAlpha(SDL_BlitInfo *info)
2673{
2674	int width = info->d_width;
2675	int height = info->d_height;
2676	Uint8 *src = info->s_pixels;
2677	int srcskip = info->s_skip;
2678	Uint8 *dst = info->d_pixels;
2679	int dstskip = info->d_skip;
2680	SDL_PixelFormat *srcfmt = info->src;
2681	SDL_PixelFormat *dstfmt = info->dst;
2682
2683	int  srcbpp;
2684	int  dstbpp;
2685
2686	/* Set up some basic variables */
2687	srcbpp = srcfmt->BytesPerPixel;
2688	dstbpp = dstfmt->BytesPerPixel;
2689
2690	/* FIXME: for 8bpp source alpha, this doesn't get opaque values
2691	   quite right. for <8bpp source alpha, it gets them very wrong
2692	   (check all macros!)
2693	   It is unclear whether there is a good general solution that doesn't
2694	   need a branch (or a divide). */
2695	while ( height-- ) {
2696	    DUFFS_LOOP4(
2697	    {
2698		Uint32 Pixel;
2699		unsigned sR;
2700		unsigned sG;
2701		unsigned sB;
2702		unsigned dR;
2703		unsigned dG;
2704		unsigned dB;
2705		unsigned sA;
2706		unsigned dA;
2707		DISEMBLE_RGBA(src, srcbpp, srcfmt, Pixel, sR, sG, sB, sA);
2708		if(sA) {
2709		  DISEMBLE_RGBA(dst, dstbpp, dstfmt, Pixel, dR, dG, dB, dA);
2710		  ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
2711		  ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
2712		}
2713		src += srcbpp;
2714		dst += dstbpp;
2715	    },
2716	    width);
2717	    src += srcskip;
2718	    dst += dstskip;
2719	}
2720}
2721
2722
2723SDL_loblit SDL_CalculateAlphaBlit(SDL_Surface *surface, int blit_index)
2724{
2725    SDL_PixelFormat *sf = surface->format;
2726    SDL_PixelFormat *df = surface->map->dst->format;
2727
2728    if(sf->Amask == 0) {
2729	if((surface->flags & SDL_SRCCOLORKEY) == SDL_SRCCOLORKEY) {
2730	    if(df->BytesPerPixel == 1)
2731		return BlitNto1SurfaceAlphaKey;
2732	    else
2733#if SDL_ALTIVEC_BLITTERS
2734	if (sf->BytesPerPixel == 4 && df->BytesPerPixel == 4 &&
2735	    !(surface->map->dst->flags & SDL_HWSURFACE) && SDL_HasAltiVec())
2736            return Blit32to32SurfaceAlphaKeyAltivec;
2737        else
2738#endif
2739            return BlitNtoNSurfaceAlphaKey;
2740	} else {
2741	    /* Per-surface alpha blits */
2742	    switch(df->BytesPerPixel) {
2743	    case 1:
2744		return BlitNto1SurfaceAlpha;
2745
2746	    case 2:
2747		if(surface->map->identity) {
2748		    if(df->Gmask == 0x7e0)
2749		    {
2750#if MMX_ASMBLIT
2751		if(SDL_HasMMX())
2752			return Blit565to565SurfaceAlphaMMX;
2753		else
2754#endif
2755			return Blit565to565SurfaceAlpha;
2756		    }
2757		    else if(df->Gmask == 0x3e0)
2758		    {
2759#if MMX_ASMBLIT
2760		if(SDL_HasMMX())
2761			return Blit555to555SurfaceAlphaMMX;
2762		else
2763#endif
2764			return Blit555to555SurfaceAlpha;
2765		    }
2766		}
2767		return BlitNtoNSurfaceAlpha;
2768
2769	    case 4:
2770		if(sf->Rmask == df->Rmask
2771		   && sf->Gmask == df->Gmask
2772		   && sf->Bmask == df->Bmask
2773		   && sf->BytesPerPixel == 4)
2774		{
2775#if MMX_ASMBLIT
2776			if(sf->Rshift % 8 == 0
2777			   && sf->Gshift % 8 == 0
2778			   && sf->Bshift % 8 == 0
2779			   && SDL_HasMMX())
2780			    return BlitRGBtoRGBSurfaceAlphaMMX;
2781#endif
2782			if((sf->Rmask | sf->Gmask | sf->Bmask) == 0xffffff)
2783			{
2784#if SDL_ALTIVEC_BLITTERS
2785				if(!(surface->map->dst->flags & SDL_HWSURFACE)
2786					&& SDL_HasAltiVec())
2787					return BlitRGBtoRGBSurfaceAlphaAltivec;
2788#endif
2789				return BlitRGBtoRGBSurfaceAlpha;
2790			}
2791		}
2792#if SDL_ALTIVEC_BLITTERS
2793		if((sf->BytesPerPixel == 4) &&
2794		   !(surface->map->dst->flags & SDL_HWSURFACE) && SDL_HasAltiVec())
2795			return Blit32to32SurfaceAlphaAltivec;
2796		else
2797#endif
2798			return BlitNtoNSurfaceAlpha;
2799
2800	    case 3:
2801	    default:
2802		return BlitNtoNSurfaceAlpha;
2803	    }
2804	}
2805    } else {
2806	/* Per-pixel alpha blits */
2807	switch(df->BytesPerPixel) {
2808	case 1:
2809	    return BlitNto1PixelAlpha;
2810
2811	case 2:
2812#if SDL_ALTIVEC_BLITTERS
2813	if(sf->BytesPerPixel == 4 && !(surface->map->dst->flags & SDL_HWSURFACE) &&
2814           df->Gmask == 0x7e0 &&
2815	   df->Bmask == 0x1f && SDL_HasAltiVec())
2816            return Blit32to565PixelAlphaAltivec;
2817        else
2818#endif
2819	    if(sf->BytesPerPixel == 4 && sf->Amask == 0xff000000
2820	       && sf->Gmask == 0xff00
2821	       && ((sf->Rmask == 0xff && df->Rmask == 0x1f)
2822		   || (sf->Bmask == 0xff && df->Bmask == 0x1f))) {
2823		if(df->Gmask == 0x7e0)
2824		    return BlitARGBto565PixelAlpha;
2825		else if(df->Gmask == 0x3e0)
2826		    return BlitARGBto555PixelAlpha;
2827	    }
2828	    return BlitNtoNPixelAlpha;
2829
2830	case 4:
2831	    if(sf->Rmask == df->Rmask
2832	       && sf->Gmask == df->Gmask
2833	       && sf->Bmask == df->Bmask
2834	       && sf->BytesPerPixel == 4)
2835	    {
2836#if MMX_ASMBLIT
2837		if(sf->Rshift % 8 == 0
2838		   && sf->Gshift % 8 == 0
2839		   && sf->Bshift % 8 == 0
2840		   && sf->Ashift % 8 == 0
2841		   && sf->Aloss == 0)
2842		{
2843			if(SDL_Has3DNow())
2844				return BlitRGBtoRGBPixelAlphaMMX3DNOW;
2845			if(SDL_HasMMX())
2846				return BlitRGBtoRGBPixelAlphaMMX;
2847		}
2848#endif
2849		if(sf->Amask == 0xff000000)
2850		{
2851#if SDL_ALTIVEC_BLITTERS
2852			if(!(surface->map->dst->flags & SDL_HWSURFACE)
2853				&& SDL_HasAltiVec())
2854				return BlitRGBtoRGBPixelAlphaAltivec;
2855#endif
2856			return BlitRGBtoRGBPixelAlpha;
2857		}
2858	    }
2859#if SDL_ALTIVEC_BLITTERS
2860	    if (sf->Amask && sf->BytesPerPixel == 4 &&
2861	        !(surface->map->dst->flags & SDL_HWSURFACE) && SDL_HasAltiVec())
2862		return Blit32to32PixelAlphaAltivec;
2863	    else
2864#endif
2865		return BlitNtoNPixelAlpha;
2866
2867	case 3:
2868	default:
2869	    return BlitNtoNPixelAlpha;
2870	}
2871    }
2872}
2873
2874