1/*
2    SDL - Simple DirectMedia Layer
3    Copyright (C) 1997-2006 Sam Lantinga
4
5    This library is free software; you can redistribute it and/or
6    modify it under the terms of the GNU Lesser General Public
7    License as published by the Free Software Foundation; either
8    version 2.1 of the License, or (at your option) any later version.
9
10    This library is distributed in the hope that it will be useful,
11    but WITHOUT ANY WARRANTY; without even the implied warranty of
12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13    Lesser General Public License for more details.
14
15    You should have received a copy of the GNU Lesser General Public
16    License along with this library; if not, write to the Free Software
17    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
18
19    Sam Lantinga
20    slouken@libsdl.org
21*/
22#include "SDL_config.h"
23
24#include "SDL_video.h"
25#include "SDL_blit.h"
26
/*
  In Visual C, VC6 has mmintrin.h in the "Processor Pack" add-on.
   Checking if _mm_free is #defined in malloc.h is the only way to
   determine if the Processor Pack is installed, as far as I can tell.
*/
32
/*
 * Decide whether an MMX blitter can be built and which flavour to use.
 * MMX_ASMBLIT is defined whenever one is available; GCC_ASMBLIT selects the
 * GCC inline-assembly implementation (x86 / x86-64) and MSVC_ASMBLIT selects
 * the Visual C intrinsics implementation (32-bit x86 only).
 * Nothing is defined when SDL_ASSEMBLY_ROUTINES is off.
 */
#if SDL_ASSEMBLY_ROUTINES
#  if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
#    define MMX_ASMBLIT 1
#    define GCC_ASMBLIT 1
#  elif defined(_MSC_VER) && defined(_M_IX86)
#    if (_MSC_VER <= 1200)
       /* VC6 or older: mmintrin.h is only present if malloc.h defines _mm_free */
#      include <malloc.h>
#      if defined(_mm_free)
#          define HAVE_MMINTRIN_H 1
#      endif
#    else  /* Visual Studio > VC6 always has mmintrin.h */
#      define HAVE_MMINTRIN_H 1
#    endif
     /* An undefined HAVE_MMINTRIN_H evaluates to 0 here, disabling the MMX path */
#    if HAVE_MMINTRIN_H
#      define MMX_ASMBLIT 1
#      define MSVC_ASMBLIT 1
#    endif
#  endif
#endif /* SDL_ASSEMBLY_ROUTINES */
52
53/* Function to check the CPU flags */
54#include "SDL_cpuinfo.h"
55#if GCC_ASMBLIT
56#include "mmx.h"
57#elif MSVC_ASMBLIT
58#include <mmintrin.h>
59#include <mm3dnow.h>
60#endif
61
62/* Functions to perform alpha blended blitting */
63
64/* N->1 blending with per-surface alpha */
65static void BlitNto1SurfaceAlpha(SDL_BlitInfo *info)
66{
67	int width = info->d_width;
68	int height = info->d_height;
69	Uint8 *src = info->s_pixels;
70	int srcskip = info->s_skip;
71	Uint8 *dst = info->d_pixels;
72	int dstskip = info->d_skip;
73	Uint8 *palmap = info->table;
74	SDL_PixelFormat *srcfmt = info->src;
75	SDL_PixelFormat *dstfmt = info->dst;
76	int srcbpp = srcfmt->BytesPerPixel;
77
78	const unsigned A = srcfmt->alpha;
79
80	while ( height-- ) {
81	    DUFFS_LOOP4(
82	    {
83		Uint32 Pixel;
84		unsigned sR;
85		unsigned sG;
86		unsigned sB;
87		unsigned dR;
88		unsigned dG;
89		unsigned dB;
90		DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
91		dR = dstfmt->palette->colors[*dst].r;
92		dG = dstfmt->palette->colors[*dst].g;
93		dB = dstfmt->palette->colors[*dst].b;
94		ALPHA_BLEND(sR, sG, sB, A, dR, dG, dB);
95		dR &= 0xff;
96		dG &= 0xff;
97		dB &= 0xff;
98		/* Pack RGB into 8bit pixel */
99		if ( palmap == NULL ) {
100		    *dst =((dR>>5)<<(3+2))|
101			  ((dG>>5)<<(2))|
102			  ((dB>>6)<<(0));
103		} else {
104		    *dst = palmap[((dR>>5)<<(3+2))|
105				  ((dG>>5)<<(2))  |
106				  ((dB>>6)<<(0))];
107		}
108		dst++;
109		src += srcbpp;
110	    },
111	    width);
112	    src += srcskip;
113	    dst += dstskip;
114	}
115}
116
117/* N->1 blending with pixel alpha */
118static void BlitNto1PixelAlpha(SDL_BlitInfo *info)
119{
120	int width = info->d_width;
121	int height = info->d_height;
122	Uint8 *src = info->s_pixels;
123	int srcskip = info->s_skip;
124	Uint8 *dst = info->d_pixels;
125	int dstskip = info->d_skip;
126	Uint8 *palmap = info->table;
127	SDL_PixelFormat *srcfmt = info->src;
128	SDL_PixelFormat *dstfmt = info->dst;
129	int srcbpp = srcfmt->BytesPerPixel;
130
131	/* FIXME: fix alpha bit field expansion here too? */
132	while ( height-- ) {
133	    DUFFS_LOOP4(
134	    {
135		Uint32 Pixel;
136		unsigned sR;
137		unsigned sG;
138		unsigned sB;
139		unsigned sA;
140		unsigned dR;
141		unsigned dG;
142		unsigned dB;
143		DISEMBLE_RGBA(src,srcbpp,srcfmt,Pixel,sR,sG,sB,sA);
144		dR = dstfmt->palette->colors[*dst].r;
145		dG = dstfmt->palette->colors[*dst].g;
146		dB = dstfmt->palette->colors[*dst].b;
147		ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
148		dR &= 0xff;
149		dG &= 0xff;
150		dB &= 0xff;
151		/* Pack RGB into 8bit pixel */
152		if ( palmap == NULL ) {
153		    *dst =((dR>>5)<<(3+2))|
154			  ((dG>>5)<<(2))|
155			  ((dB>>6)<<(0));
156		} else {
157		    *dst = palmap[((dR>>5)<<(3+2))|
158				  ((dG>>5)<<(2))  |
159				  ((dB>>6)<<(0))  ];
160		}
161		dst++;
162		src += srcbpp;
163	    },
164	    width);
165	    src += srcskip;
166	    dst += dstskip;
167	}
168}
169
170/* colorkeyed N->1 blending with per-surface alpha */
171static void BlitNto1SurfaceAlphaKey(SDL_BlitInfo *info)
172{
173	int width = info->d_width;
174	int height = info->d_height;
175	Uint8 *src = info->s_pixels;
176	int srcskip = info->s_skip;
177	Uint8 *dst = info->d_pixels;
178	int dstskip = info->d_skip;
179	Uint8 *palmap = info->table;
180	SDL_PixelFormat *srcfmt = info->src;
181	SDL_PixelFormat *dstfmt = info->dst;
182	int srcbpp = srcfmt->BytesPerPixel;
183	Uint32 ckey = srcfmt->colorkey;
184
185	const int A = srcfmt->alpha;
186
187	while ( height-- ) {
188	    DUFFS_LOOP(
189	    {
190		Uint32 Pixel;
191		unsigned sR;
192		unsigned sG;
193		unsigned sB;
194		unsigned dR;
195		unsigned dG;
196		unsigned dB;
197		DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
198		if ( Pixel != ckey ) {
199		    dR = dstfmt->palette->colors[*dst].r;
200		    dG = dstfmt->palette->colors[*dst].g;
201		    dB = dstfmt->palette->colors[*dst].b;
202		    ALPHA_BLEND(sR, sG, sB, A, dR, dG, dB);
203		    dR &= 0xff;
204		    dG &= 0xff;
205		    dB &= 0xff;
206		    /* Pack RGB into 8bit pixel */
207		    if ( palmap == NULL ) {
208			*dst =((dR>>5)<<(3+2))|
209			      ((dG>>5)<<(2)) |
210			      ((dB>>6)<<(0));
211		    } else {
212			*dst = palmap[((dR>>5)<<(3+2))|
213				      ((dG>>5)<<(2))  |
214				      ((dB>>6)<<(0))  ];
215		    }
216		}
217		dst++;
218		src += srcbpp;
219	    },
220	    width);
221	    src += srcskip;
222	    dst += dstskip;
223	}
224}
225
226#if GCC_ASMBLIT
227/* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
228static void BlitRGBtoRGBSurfaceAlpha128MMX(SDL_BlitInfo *info)
229{
230	int width = info->d_width;
231	int height = info->d_height;
232	Uint32 *srcp = (Uint32 *)info->s_pixels;
233	int srcskip = info->s_skip >> 2;
234	Uint32 *dstp = (Uint32 *)info->d_pixels;
235	int dstskip = info->d_skip >> 2;
236	Uint32 dalpha = info->dst->Amask;
237	Uint8 load[8];
238
239	*(Uint64 *)load = 0x00fefefe00fefefeULL;/* alpha128 mask */
240	movq_m2r(*load, mm4); /* alpha128 mask -> mm4 */
241	*(Uint64 *)load = 0x0001010100010101ULL;/* !alpha128 mask */
242	movq_m2r(*load, mm3); /* !alpha128 mask -> mm3 */
243	movd_m2r(dalpha, mm7); /* dst alpha mask */
244	punpckldq_r2r(mm7, mm7); /* dst alpha mask | dst alpha mask -> mm7 */
245	while(height--) {
246		DUFFS_LOOP_DOUBLE2(
247		{
248			Uint32 s = *srcp++;
249			Uint32 d = *dstp;
250			*dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
251				   + (s & d & 0x00010101)) | dalpha;
252		},{
253			movq_m2r((*dstp), mm2);/* 2 x dst -> mm2(ARGBARGB) */
254			movq_r2r(mm2, mm6); /* 2 x dst -> mm6(ARGBARGB) */
255
256			movq_m2r((*srcp), mm1);/* 2 x src -> mm1(ARGBARGB) */
257			movq_r2r(mm1, mm5); /* 2 x src -> mm5(ARGBARGB) */
258
259			pand_r2r(mm4, mm6); /* dst & mask -> mm6 */
260			pand_r2r(mm4, mm5); /* src & mask -> mm5 */
261			paddd_r2r(mm6, mm5); /* mm6 + mm5 -> mm5 */
262			pand_r2r(mm1, mm2); /* src & dst -> mm2 */
263			psrld_i2r(1, mm5); /* mm5 >> 1 -> mm5 */
264			pand_r2r(mm3, mm2); /* mm2 & !mask -> mm2 */
265			paddd_r2r(mm5, mm2); /* mm5 + mm2 -> mm2 */
266
267			por_r2r(mm7, mm2); /* mm7(full alpha) | mm2 -> mm2 */
268			movq_r2m(mm2, (*dstp));/* mm2 -> 2 x dst pixels */
269			dstp += 2;
270			srcp += 2;
271		}, width);
272		srcp += srcskip;
273		dstp += dstskip;
274	}
275	emms();
276}
277
278/* fast RGB888->(A)RGB888 blending with surface alpha */
279static void BlitRGBtoRGBSurfaceAlphaMMX(SDL_BlitInfo *info)
280{
281	SDL_PixelFormat* df = info->dst;
282	unsigned alpha = info->src->alpha;
283
284	if (alpha == 128 && (df->Rmask | df->Gmask | df->Bmask) == 0x00FFFFFF) {
285			/* only call a128 version when R,G,B occupy lower bits */
286		BlitRGBtoRGBSurfaceAlpha128MMX(info);
287	} else {
288		int width = info->d_width;
289		int height = info->d_height;
290		Uint32 *srcp = (Uint32 *)info->s_pixels;
291		int srcskip = info->s_skip >> 2;
292		Uint32 *dstp = (Uint32 *)info->d_pixels;
293		int dstskip = info->d_skip >> 2;
294
295		pxor_r2r(mm5, mm5); /* 0 -> mm5 */
296		/* form the alpha mult */
297		movd_m2r(alpha, mm4); /* 0000000A -> mm4 */
298		punpcklwd_r2r(mm4, mm4); /* 00000A0A -> mm4 */
299		punpckldq_r2r(mm4, mm4); /* 0A0A0A0A -> mm4 */
300		alpha = (0xff << df->Rshift) | (0xff << df->Gshift) | (0xff << df->Bshift);
301		movd_m2r(alpha, mm0); /* 00000FFF -> mm0 */
302		punpcklbw_r2r(mm0, mm0); /* 00FFFFFF -> mm0 */
303		pand_r2r(mm0, mm4); /* 0A0A0A0A -> mm4, minus 1 chan */
304			/* at this point mm4 can be 000A0A0A or 0A0A0A00 or another combo */
305		movd_m2r(df->Amask, mm7); /* dst alpha mask */
306		punpckldq_r2r(mm7, mm7); /* dst alpha mask | dst alpha mask -> mm7 */
307
308		while(height--) {
309			DUFFS_LOOP_DOUBLE2({
310				/* One Pixel Blend */
311				movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/
312				movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/
313				punpcklbw_r2r(mm5, mm1); /* 0A0R0G0B -> mm1(src) */
314				punpcklbw_r2r(mm5, mm2); /* 0A0R0G0B -> mm2(dst) */
315
316				psubw_r2r(mm2, mm1);/* src - dst -> mm1 */
317				pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */
318				psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1 */
319				paddb_r2r(mm1, mm2); /* mm1 + mm2(dst) -> mm2 */
320
321				packuswb_r2r(mm5, mm2);  /* ARGBARGB -> mm2 */
322				por_r2r(mm7, mm2); /* mm7(full alpha) | mm2 -> mm2 */
323				movd_r2m(mm2, *dstp);/* mm2 -> pixel */
324				++srcp;
325				++dstp;
326			},{
327				/* Two Pixels Blend */
328				movq_m2r((*srcp), mm0);/* 2 x src -> mm0(ARGBARGB)*/
329				movq_m2r((*dstp), mm2);/* 2 x dst -> mm2(ARGBARGB) */
330				movq_r2r(mm0, mm1); /* 2 x src -> mm1(ARGBARGB) */
331				movq_r2r(mm2, mm6); /* 2 x dst -> mm6(ARGBARGB) */
332
333				punpcklbw_r2r(mm5, mm0); /* low - 0A0R0G0B -> mm0(src1) */
334				punpckhbw_r2r(mm5, mm1); /* high - 0A0R0G0B -> mm1(src2) */
335				punpcklbw_r2r(mm5, mm2); /* low - 0A0R0G0B -> mm2(dst1) */
336				punpckhbw_r2r(mm5, mm6); /* high - 0A0R0G0B -> mm6(dst2) */
337
338				psubw_r2r(mm2, mm0);/* src1 - dst1 -> mm0 */
339				pmullw_r2r(mm4, mm0); /* mm0 * alpha -> mm0 */
340				psrlw_i2r(8, mm0); /* mm0 >> 8 -> mm1 */
341				paddb_r2r(mm0, mm2); /* mm0 + mm2(dst1) -> mm2 */
342
343				psubw_r2r(mm6, mm1);/* src2 - dst2 -> mm1 */
344				pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */
345				psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1 */
346				paddb_r2r(mm1, mm6); /* mm1 + mm6(dst2) -> mm6 */
347
348				packuswb_r2r(mm6, mm2);  /* ARGBARGB -> mm2 */
349				por_r2r(mm7, mm2); /* mm7(dst alpha) | mm2 -> mm2 */
350
351				movq_r2m(mm2, *dstp);/* mm2 -> 2 x pixel */
352
353  				srcp += 2;
354  				dstp += 2;
355  			}, width);
356			srcp += srcskip;
357			dstp += dstskip;
358		}
359		emms();
360	}
361}
362
363/* fast ARGB888->(A)RGB888 blending with pixel alpha */
/*
 * GCC/MMX blit of 32-bit pixels with per-pixel alpha onto a 32-bit
 * destination.
 *
 * Per source pixel:
 *   - alpha bits all zero: destination is left untouched;
 *   - alpha bits all set:  source colour channels replace the destination's,
 *                          the destination's remaining byte is kept;
 *   - otherwise:           dst + (((src - dst) * alpha) >> 8) per channel,
 *                          with the alpha lane multiplied by zero so the
 *                          destination alpha byte survives the add.
 *
 * Register roles set up before the loop and relied on inside it:
 *   mm6 = zero, mm7 = multiplication mask, mm0 = channel mask,
 *   mm3 = ~channel mask, mm5 = source alpha shift.
 */
static void BlitRGBtoRGBPixelAlphaMMX(SDL_BlitInfo *info)
{
	int width = info->d_width;
	int height = info->d_height;
	Uint32 *srcp = (Uint32 *)info->s_pixels;
	int srcskip = info->s_skip >> 2; /* skip expressed in 32-bit pixels */
	Uint32 *dstp = (Uint32 *)info->d_pixels;
	int dstskip = info->d_skip >> 2; /* skip expressed in 32-bit pixels */
	SDL_PixelFormat* sf = info->src;
	Uint32 amask = sf->Amask;

	pxor_r2r(mm6, mm6); /* 0 -> mm6 */
	/* form multiplication mask */
	movd_m2r(sf->Amask, mm7); /* 0000F000 -> mm7 */
	punpcklbw_r2r(mm7, mm7); /* FF000000 -> mm7 */
	pcmpeqb_r2r(mm0, mm0); /* FFFFFFFF -> mm0 */
	movq_r2r(mm0, mm3); /* FFFFFFFF -> mm3 (for later) */
	pxor_r2r(mm0, mm7); /* 00FFFFFF -> mm7 (mult mask) */
	/* form channel masks */
	movq_r2r(mm7, mm0); /* 00FFFFFF -> mm0 */
	packsswb_r2r(mm6, mm0); /* 00000FFF -> mm0 (channel mask) */
	packsswb_r2r(mm6, mm3); /* 0000FFFF -> mm3 */
	pxor_r2r(mm0, mm3); /* 0000F000 -> mm3 (~channel mask) */
	/* get alpha channel shift */
	__asm__ __volatile__ (
		"movd %0, %%mm5"
		: : "rm" ((Uint32) sf->Ashift) ); /* Ashift -> mm5 */

	while(height--) {
	    DUFFS_LOOP4({
		/* still positioned at the alpha bits, not shifted down yet */
		Uint32 alpha = *srcp & amask;
		/* FIXME: Here we special-case opaque alpha since the
			compositing used (>>8 instead of /255) doesn't handle
			it correctly. Also special-case alpha=0 for speed?
			Benchmark this! */
		if(alpha == 0) {
			/* do nothing */
		} else if(alpha == amask) {
			/* opaque alpha -- copy RGB, keep dst alpha */
			/* using MMX here to free up regular registers for other things */
			movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/
			movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/
			pand_r2r(mm0, mm1); /* src & chanmask -> mm1 */
			pand_r2r(mm3, mm2); /* dst & ~chanmask -> mm2 */
			por_r2r(mm1, mm2); /* src | dst -> mm2 */
			movd_r2m(mm2, (*dstp)); /* mm2 -> dst */
		} else {
			movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/
			punpcklbw_r2r(mm6, mm1); /* 0A0R0G0B -> mm1 */

			movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/
			punpcklbw_r2r(mm6, mm2); /* 0A0R0G0B -> mm2 */

			/* broadcast this pixel's alpha into all four 16-bit lanes */
			__asm__ __volatile__ (
				"movd %0, %%mm4"
				: : "r" (alpha) ); /* 0000A000 -> mm4 */
			psrld_r2r(mm5, mm4); /* mm4 >> mm5 -> mm4 (0000000A) */
			punpcklwd_r2r(mm4, mm4); /* 00000A0A -> mm4 */
			punpcklwd_r2r(mm4, mm4); /* 0A0A0A0A -> mm4 */
			pand_r2r(mm7, mm4); /* 000A0A0A -> mm4, preserve dst alpha on add */

			/* blend */
			psubw_r2r(mm2, mm1);/* src - dst -> mm1 */
			pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */
			psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1(000R0G0B) */
			paddb_r2r(mm1, mm2); /* mm1 + mm2(dst) -> mm2 */

			packuswb_r2r(mm6, mm2);  /* 0000ARGB -> mm2 */
			movd_r2m(mm2, *dstp);/* mm2 -> dst */
		}
		++srcp;
		++dstp;
	    }, width);
	    srcp += srcskip;
	    dstp += dstskip;
	}
	/* leave MMX state so later x87 floating point code works */
	emms();
}
442/* End GCC_ASMBLIT */
443
444#elif MSVC_ASMBLIT
445/* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
446static void BlitRGBtoRGBSurfaceAlpha128MMX(SDL_BlitInfo *info)
447{
448	int width = info->d_width;
449	int height = info->d_height;
450	Uint32 *srcp = (Uint32 *)info->s_pixels;
451	int srcskip = info->s_skip >> 2;
452	Uint32 *dstp = (Uint32 *)info->d_pixels;
453	int dstskip = info->d_skip >> 2;
454	Uint32 dalpha = info->dst->Amask;
455
456	__m64 src1, src2, dst1, dst2, lmask, hmask, dsta;
457
458	hmask = _mm_set_pi32(0x00fefefe, 0x00fefefe); /* alpha128 mask -> hmask */
459	lmask = _mm_set_pi32(0x00010101, 0x00010101); /* !alpha128 mask -> lmask */
460	dsta = _mm_set_pi32(dalpha, dalpha); /* dst alpha mask -> dsta */
461
462	while (height--) {
463		int n = width;
464		if ( n & 1 ) {
465			Uint32 s = *srcp++;
466			Uint32 d = *dstp;
467			*dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
468				   + (s & d & 0x00010101)) | dalpha;
469			n--;
470		}
471
472		for (n >>= 1; n > 0; --n) {
473			dst1 = *(__m64*)dstp; /* 2 x dst -> dst1(ARGBARGB) */
474			dst2 = dst1;   /* 2 x dst -> dst2(ARGBARGB) */
475
476			src1 = *(__m64*)srcp; /* 2 x src -> src1(ARGBARGB) */
477			src2 = src1; /* 2 x src -> src2(ARGBARGB) */
478
479			dst2 = _mm_and_si64(dst2, hmask); /* dst & mask -> dst2 */
480			src2 = _mm_and_si64(src2, hmask); /* src & mask -> src2 */
481			src2 = _mm_add_pi32(src2, dst2); /* dst2 + src2 -> src2 */
482			src2 = _mm_srli_pi32(src2, 1); /* src2 >> 1 -> src2 */
483
484			dst1 = _mm_and_si64(dst1, src1); /* src & dst -> dst1 */
485			dst1 = _mm_and_si64(dst1, lmask); /* dst1 & !mask -> dst1 */
486			dst1 = _mm_add_pi32(dst1, src2); /* src2 + dst1 -> dst1 */
487			dst1 = _mm_or_si64(dst1, dsta); /* dsta(full alpha) | dst1 -> dst1 */
488
489			*(__m64*)dstp = dst1; /* dst1 -> 2 x dst pixels */
490			dstp += 2;
491			srcp += 2;
492		}
493
494		srcp += srcskip;
495		dstp += dstskip;
496	}
497	_mm_empty();
498}
499
500/* fast RGB888->(A)RGB888 blending with surface alpha */
501static void BlitRGBtoRGBSurfaceAlphaMMX(SDL_BlitInfo *info)
502{
503	SDL_PixelFormat* df = info->dst;
504	Uint32 chanmask = df->Rmask | df->Gmask | df->Bmask;
505	unsigned alpha = info->src->alpha;
506
507	if (alpha == 128 && (df->Rmask | df->Gmask | df->Bmask) == 0x00FFFFFF) {
508			/* only call a128 version when R,G,B occupy lower bits */
509		BlitRGBtoRGBSurfaceAlpha128MMX(info);
510	} else {
511		int width = info->d_width;
512		int height = info->d_height;
513		Uint32 *srcp = (Uint32 *)info->s_pixels;
514		int srcskip = info->s_skip >> 2;
515		Uint32 *dstp = (Uint32 *)info->d_pixels;
516		int dstskip = info->d_skip >> 2;
517		Uint32 dalpha = df->Amask;
518		Uint32 amult;
519
520		__m64 src1, src2, dst1, dst2, mm_alpha, mm_zero, dsta;
521
522		mm_zero = _mm_setzero_si64(); /* 0 -> mm_zero */
523		/* form the alpha mult */
524		amult = alpha | (alpha << 8);
525		amult = amult | (amult << 16);
526		chanmask = (0xff << df->Rshift) | (0xff << df->Gshift) | (0xff << df->Bshift);
527		mm_alpha = _mm_set_pi32(0, amult & chanmask); /* 0000AAAA -> mm_alpha, minus 1 chan */
528		mm_alpha = _mm_unpacklo_pi8(mm_alpha, mm_zero); /* 0A0A0A0A -> mm_alpha, minus 1 chan */
529			/* at this point mm_alpha can be 000A0A0A or 0A0A0A00 or another combo */
530		dsta = _mm_set_pi32(dalpha, dalpha); /* dst alpha mask -> dsta */
531
532		while (height--) {
533			int n = width;
534			if (n & 1) {
535				/* One Pixel Blend */
536				src2 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src2 (0000ARGB)*/
537				src2 = _mm_unpacklo_pi8(src2, mm_zero); /* 0A0R0G0B -> src2 */
538
539				dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB)*/
540				dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
541
542				src2 = _mm_sub_pi16(src2, dst1); /* src2 - dst2 -> src2 */
543				src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
544				src2 = _mm_srli_pi16(src2, 8); /* src2 >> 8 -> src2 */
545				dst1 = _mm_add_pi8(src2, dst1); /* src2 + dst1 -> dst1 */
546
547				dst1 = _mm_packs_pu16(dst1, mm_zero);  /* 0000ARGB -> dst1 */
548				dst1 = _mm_or_si64(dst1, dsta); /* dsta | dst1 -> dst1 */
549				*dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
550
551				++srcp;
552				++dstp;
553
554				n--;
555			}
556
557			for (n >>= 1; n > 0; --n) {
558				/* Two Pixels Blend */
559				src1 = *(__m64*)srcp; /* 2 x src -> src1(ARGBARGB)*/
560				src2 = src1; /* 2 x src -> src2(ARGBARGB) */
561				src1 = _mm_unpacklo_pi8(src1, mm_zero); /* low - 0A0R0G0B -> src1 */
562				src2 = _mm_unpackhi_pi8(src2, mm_zero); /* high - 0A0R0G0B -> src2 */
563
564				dst1 = *(__m64*)dstp;/* 2 x dst -> dst1(ARGBARGB) */
565				dst2 = dst1; /* 2 x dst -> dst2(ARGBARGB) */
566				dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* low - 0A0R0G0B -> dst1 */
567				dst2 = _mm_unpackhi_pi8(dst2, mm_zero); /* high - 0A0R0G0B -> dst2 */
568
569				src1 = _mm_sub_pi16(src1, dst1);/* src1 - dst1 -> src1 */
570				src1 = _mm_mullo_pi16(src1, mm_alpha); /* src1 * alpha -> src1 */
571				src1 = _mm_srli_pi16(src1, 8); /* src1 >> 8 -> src1 */
572				dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1(dst1) -> dst1 */
573
574				src2 = _mm_sub_pi16(src2, dst2);/* src2 - dst2 -> src2 */
575				src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
576				src2 = _mm_srli_pi16(src2, 8); /* src2 >> 8 -> src2 */
577				dst2 = _mm_add_pi8(src2, dst2); /* src2 + dst2(dst2) -> dst2 */
578
579				dst1 = _mm_packs_pu16(dst1, dst2); /* 0A0R0G0B(res1), 0A0R0G0B(res2) -> dst1(ARGBARGB) */
580				dst1 = _mm_or_si64(dst1, dsta); /* dsta | dst1 -> dst1 */
581
582				*(__m64*)dstp = dst1; /* dst1 -> 2 x pixel */
583
584				srcp += 2;
585				dstp += 2;
586			}
587			srcp += srcskip;
588			dstp += dstskip;
589		}
590		_mm_empty();
591	}
592}
593
594/* fast ARGB888->(A)RGB888 blending with pixel alpha */
/*
 * MSVC/MMX-intrinsics blit of 32-bit pixels with per-pixel alpha onto a
 * 32-bit destination.
 *
 * Per source pixel:
 *   - alpha bits all zero: destination is left untouched;
 *   - alpha bits all set:  source colour channels replace the destination's,
 *                          the destination's remaining bits are kept;
 *   - otherwise:           dst + (((src - dst) * alpha) >> 8) per channel,
 *                          with the alpha lane multiplied by zero so the
 *                          destination alpha byte survives the add.
 */
static void BlitRGBtoRGBPixelAlphaMMX(SDL_BlitInfo *info)
{
	int width = info->d_width;
	int height = info->d_height;
	Uint32 *srcp = (Uint32 *)info->s_pixels;
	int srcskip = info->s_skip >> 2; /* skip expressed in 32-bit pixels */
	Uint32 *dstp = (Uint32 *)info->d_pixels;
	int dstskip = info->d_skip >> 2; /* skip expressed in 32-bit pixels */
	SDL_PixelFormat* sf = info->src;
	Uint32 chanmask = sf->Rmask | sf->Gmask | sf->Bmask;
	Uint32 amask = sf->Amask;
	Uint32 ashift = sf->Ashift;
	Uint64 multmask;

	__m64 src1, dst1, mm_alpha, mm_zero, dmask;

	mm_zero = _mm_setzero_si64(); /* 0 -> mm_zero */
	/* every 16-bit lane all ones except the lane that holds alpha after unpacking */
	multmask = ~(0xFFFFi64 << (ashift * 2));
	dmask = *(__m64*) &multmask; /* dst alpha mask -> dmask */

	while(height--) {
		DUFFS_LOOP4({
		/* still positioned at the alpha bits, not shifted down yet */
		Uint32 alpha = *srcp & amask;
		if (alpha == 0) {
			/* do nothing */
		} else if (alpha == amask) {
			/* opaque alpha -- copy RGB, keep dst alpha */
			*dstp = (*srcp & chanmask) | (*dstp & ~chanmask);
		} else {
			src1 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src1 (0000ARGB)*/
			src1 = _mm_unpacklo_pi8(src1, mm_zero); /* 0A0R0G0B -> src1 */

			dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB)*/
			dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */

			mm_alpha = _mm_cvtsi32_si64(alpha); /* alpha (unshifted, e.g. 0000A000) -> mm_alpha */
			mm_alpha = _mm_srli_si64(mm_alpha, ashift); /* mm_alpha >> ashift -> mm_alpha(0000000A) */
			mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
			mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
			mm_alpha = _mm_and_si64(mm_alpha, dmask); /* 000A0A0A -> mm_alpha, preserve dst alpha on add */

			/* blend */
			src1 = _mm_sub_pi16(src1, dst1);/* src1 - dst1 -> src1 */
			src1 = _mm_mullo_pi16(src1, mm_alpha); /* (src1 - dst1) * alpha -> src1 */
			src1 = _mm_srli_pi16(src1, 8); /* src1 >> 8 -> src1(000R0G0B) */
			dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1 -> dst1(0A0R0G0B) */
			dst1 = _mm_packs_pu16(dst1, mm_zero);  /* 0000ARGB -> dst1 */

			*dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
		}
		++srcp;
		++dstp;
	    }, width);
	    srcp += srcskip;
	    dstp += dstskip;
	}
	_mm_empty();
}
653/* End MSVC_ASMBLIT */
654
655#endif /* GCC_ASMBLIT, MSVC_ASMBLIT */
656
657#if SDL_ALTIVEC_BLITTERS
658#if __MWERKS__
659#pragma altivec_model on
660#endif
661#if HAVE_ALTIVEC_H
662#include <altivec.h>
663#endif
664#include <assert.h>
665
/*
 * Vector literal helpers. GCC before version 4 on Mac OS X gets the
 * parenthesized literal form; every other compiler gets the
 * brace-initializer form.
 *   VECUINT8_LITERAL  - 16 x unsigned char
 *   VECUINT16_LITERAL - 8 x unsigned short
 */
#if (defined(__MACOSX__) && (__GNUC__ < 4))
    #define VECUINT8_LITERAL(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p) \
        (vector unsigned char) ( a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p )
    #define VECUINT16_LITERAL(a,b,c,d,e,f,g,h) \
        (vector unsigned short) ( a,b,c,d,e,f,g,h )
#else
    #define VECUINT8_LITERAL(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p) \
        (vector unsigned char) { a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p }
    #define VECUINT16_LITERAL(a,b,c,d,e,f,g,h) \
        (vector unsigned short) { a,b,c,d,e,f,g,h }
#endif
677
/*
 * Non-zero when pointer expression x is not 16-byte aligned; the value is
 * the low four bits of the address. The argument is fully parenthesized so
 * that expressions such as (p + 1) are evaluated before the cast.
 */
#define UNALIGNED_PTR(x) (((size_t) (x)) & 0x0000000F)
/* Debug helper: print vector v as four 32-bit hex words, labelled with msg. */
#define VECPRINT(msg, v) do { \
    vector unsigned int tmpvec = (vector unsigned int)(v); \
    unsigned int *vp = (unsigned int *)&tmpvec; \
    printf("%s = %08X %08X %08X %08X\n", msg, vp[0], vp[1], vp[2], vp[3]); \
} while (0)
684
/* the permutation vector that takes the high bytes out of all the appropriate shorts
    (vector unsigned char)(
        0x00, 0x10, 0x02, 0x12,
        0x04, 0x14, 0x06, 0x16,
        0x08, 0x18, 0x0A, 0x1A,
        0x0C, 0x1C, 0x0E, 0x1E );
   It is built at run time as lvsl(0, NULL) plus the bytes of a 16-bit
   splat of 0x0F, instead of being loaded as a literal.
*/
#define VEC_MERGE_PERMUTE() (vec_add(vec_lvsl(0, (int*)NULL), (vector unsigned char)vec_splat_u16(0x0F)))
/* 32-bit splat of 24, formed as 12 + 12 */
#define VEC_U32_24() (vec_add(vec_splat_u32(12), vec_splat_u32(12)))
/* all-ones bytes shifted left by 24 within each 32-bit element (0xFF000000 per element) */
#define VEC_ALPHA_MASK() ((vector unsigned char)vec_sl((vector unsigned int)vec_splat_s8(-1), VEC_U32_24()))
/*
 * Permute vector for realigning data loaded from src: lvsl(0, src) when src
 * is not 16-byte aligned, otherwise lvsl(8, src) with 8 added to every byte.
 */
#define VEC_ALIGNER(src) ((UNALIGNED_PTR(src)) \
    ? vec_lvsl(0, src) \
    : vec_add(vec_lvsl(8, src), vec_splat_u8(8)))
698
699
/*
 * Blend the bytes of vs over vd using the per-byte weights in valpha and
 * store the result back into vd (vd is both input and output).
 *
 * For every byte: x = vs * valpha + vd * ~valpha (a 16-bit product sum),
 * then x + 1 + ((x + 1) >> 8) is formed and its high byte is selected by
 * mergePermute, which approximates x / 255.
 *
 *   vs, vd       - vector unsigned char operands
 *   valpha       - vector unsigned char weights
 *   mergePermute - permute vector picking the high byte of each product
 *   v1_16, v8_16 - 16-bit vectors used as the "+ 1" addend and the ">> 8"
 *                  shift count
 */
#define VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1_16, v8_16) do { \
    /* vtemp1 contains source AAGGAAGGAAGGAAGG */ \
    vector unsigned short vtemp1 = vec_mule(vs, valpha); \
    /* vtemp2 contains source RRBBRRBBRRBBRRBB */ \
    vector unsigned short vtemp2 = vec_mulo(vs, valpha); \
    /* valpha2 is 255-alpha */ \
    vector unsigned char valpha2 = vec_nor(valpha, valpha); \
    /* vtemp3 contains dest AAGGAAGGAAGGAAGG */ \
    vector unsigned short vtemp3 = vec_mule(vd, valpha2); \
    /* vtemp4 contains dest RRBBRRBBRRBBRRBB */ \
    vector unsigned short vtemp4 = vec_mulo(vd, valpha2); \
    /* add source and dest */ \
    vtemp1 = vec_add(vtemp1, vtemp3); \
    vtemp2 = vec_add(vtemp2, vtemp4); \
    /* vtemp1 = (vtemp1 + 1) + ((vtemp1 + 1) >> 8) */ \
    vtemp1 = vec_add(vtemp1, v1_16); \
    vtemp3 = vec_sr(vtemp1, v8_16); \
    vtemp1 = vec_add(vtemp1, vtemp3); \
    /* vtemp2 = (vtemp2 + 1) + ((vtemp2 + 1) >> 8) */ \
    vtemp2 = vec_add(vtemp2, v1_16); \
    vtemp4 = vec_sr(vtemp2, v8_16); \
    vtemp2 = vec_add(vtemp2, vtemp4); \
    /* (>>8) and get ARGBARGBARGBARGB */ \
    vd = (vector unsigned char)vec_perm(vtemp1, vtemp2, mergePermute); \
} while (0)
725
726/* Calculate the permute vector used for 32->32 swizzling */
727static vector unsigned char calc_swizzle32(const SDL_PixelFormat *srcfmt,
728                                  const SDL_PixelFormat *dstfmt)
729{
730    /*
731     * We have to assume that the bits that aren't used by other
732     *  colors is alpha, and it's one complete byte, since some formats
733     *  leave alpha with a zero mask, but we should still swizzle the bits.
734     */
735    /* ARGB */
736    const static struct SDL_PixelFormat default_pixel_format = {
737        NULL, 0, 0,
738        0, 0, 0, 0,
739        16, 8, 0, 24,
740        0x00FF0000, 0x0000FF00, 0x000000FF, 0xFF000000,
741        0, 0};
742    if (!srcfmt) {
743        srcfmt = &default_pixel_format;
744    }
745    if (!dstfmt) {
746        dstfmt = &default_pixel_format;
747    }
748    const vector unsigned char plus = VECUINT8_LITERAL
749                                            ( 0x00, 0x00, 0x00, 0x00,
750                                              0x04, 0x04, 0x04, 0x04,
751                                              0x08, 0x08, 0x08, 0x08,
752                                              0x0C, 0x0C, 0x0C, 0x0C );
753    vector unsigned char vswiz;
754    vector unsigned int srcvec;
755#define RESHIFT(X) (3 - ((X) >> 3))
756    Uint32 rmask = RESHIFT(srcfmt->Rshift) << (dstfmt->Rshift);
757    Uint32 gmask = RESHIFT(srcfmt->Gshift) << (dstfmt->Gshift);
758    Uint32 bmask = RESHIFT(srcfmt->Bshift) << (dstfmt->Bshift);
759    Uint32 amask;
760    /* Use zero for alpha if either surface doesn't have alpha */
761    if (dstfmt->Amask) {
762        amask = ((srcfmt->Amask) ? RESHIFT(srcfmt->Ashift) : 0x10) << (dstfmt->Ashift);
763    } else {
764        amask = 0x10101010 & ((dstfmt->Rmask | dstfmt->Gmask | dstfmt->Bmask) ^ 0xFFFFFFFF);
765    }
766#undef RESHIFT
767    ((unsigned int *)(char*)&srcvec)[0] = (rmask | gmask | bmask | amask);
768    vswiz = vec_add(plus, (vector unsigned char)vec_splat(srcvec, 0));
769    return(vswiz);
770}
771
772static void Blit32to565PixelAlphaAltivec(SDL_BlitInfo *info)
773{
774    int height = info->d_height;
775    Uint8 *src = (Uint8 *)info->s_pixels;
776    int srcskip = info->s_skip;
777    Uint8 *dst = (Uint8 *)info->d_pixels;
778    int dstskip = info->d_skip;
779    SDL_PixelFormat *srcfmt = info->src;
780
781    vector unsigned char v0 = vec_splat_u8(0);
782    vector unsigned short v8_16 = vec_splat_u16(8);
783    vector unsigned short v1_16 = vec_splat_u16(1);
784    vector unsigned short v2_16 = vec_splat_u16(2);
785    vector unsigned short v3_16 = vec_splat_u16(3);
786    vector unsigned int v8_32 = vec_splat_u32(8);
787    vector unsigned int v16_32 = vec_add(v8_32, v8_32);
788    vector unsigned short v3f = VECUINT16_LITERAL(
789        0x003f, 0x003f, 0x003f, 0x003f,
790        0x003f, 0x003f, 0x003f, 0x003f);
791    vector unsigned short vfc = VECUINT16_LITERAL(
792        0x00fc, 0x00fc, 0x00fc, 0x00fc,
793        0x00fc, 0x00fc, 0x00fc, 0x00fc);
794
795    /*
796        0x10 - 0x1f is the alpha
797        0x00 - 0x0e evens are the red
798        0x01 - 0x0f odds are zero
799    */
800    vector unsigned char vredalpha1 = VECUINT8_LITERAL(
801        0x10, 0x00, 0x01, 0x01,
802        0x10, 0x02, 0x01, 0x01,
803        0x10, 0x04, 0x01, 0x01,
804        0x10, 0x06, 0x01, 0x01
805    );
806    vector unsigned char vredalpha2 = (vector unsigned char)(
807        vec_add((vector unsigned int)vredalpha1, vec_sl(v8_32, v16_32))
808    );
809    /*
810        0x00 - 0x0f is ARxx ARxx ARxx ARxx
811        0x11 - 0x0f odds are blue
812    */
813    vector unsigned char vblue1 = VECUINT8_LITERAL(
814        0x00, 0x01, 0x02, 0x11,
815        0x04, 0x05, 0x06, 0x13,
816        0x08, 0x09, 0x0a, 0x15,
817        0x0c, 0x0d, 0x0e, 0x17
818    );
819    vector unsigned char vblue2 = (vector unsigned char)(
820        vec_add((vector unsigned int)vblue1, v8_32)
821    );
822    /*
823        0x00 - 0x0f is ARxB ARxB ARxB ARxB
824        0x10 - 0x0e evens are green
825    */
826    vector unsigned char vgreen1 = VECUINT8_LITERAL(
827        0x00, 0x01, 0x10, 0x03,
828        0x04, 0x05, 0x12, 0x07,
829        0x08, 0x09, 0x14, 0x0b,
830        0x0c, 0x0d, 0x16, 0x0f
831    );
832    vector unsigned char vgreen2 = (vector unsigned char)(
833        vec_add((vector unsigned int)vgreen1, vec_sl(v8_32, v8_32))
834    );
835    vector unsigned char vgmerge = VECUINT8_LITERAL(
836        0x00, 0x02, 0x00, 0x06,
837        0x00, 0x0a, 0x00, 0x0e,
838        0x00, 0x12, 0x00, 0x16,
839        0x00, 0x1a, 0x00, 0x1e);
840    vector unsigned char mergePermute = VEC_MERGE_PERMUTE();
841    vector unsigned char vpermute = calc_swizzle32(srcfmt, NULL);
842    vector unsigned char valphaPermute = vec_and(vec_lvsl(0, (int *)NULL), vec_splat_u8(0xC));
843
844    vector unsigned short vf800 = (vector unsigned short)vec_splat_u8(-7);
845    vf800 = vec_sl(vf800, vec_splat_u16(8));
846
847    while(height--) {
848        int extrawidth;
849        vector unsigned char valigner;
850        vector unsigned char vsrc;
851        vector unsigned char voverflow;
852        int width = info->d_width;
853
854#define ONE_PIXEL_BLEND(condition, widthvar) \
855        while (condition) { \
856            Uint32 Pixel; \
857            unsigned sR, sG, sB, dR, dG, dB, sA; \
858            DISEMBLE_RGBA(src, 4, srcfmt, Pixel, sR, sG, sB, sA); \
859            if(sA) { \
860                unsigned short dstpixel = *((unsigned short *)dst); \
861                dR = (dstpixel >> 8) & 0xf8; \
862                dG = (dstpixel >> 3) & 0xfc; \
863                dB = (dstpixel << 3) & 0xf8; \
864                ACCURATE_ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
865                *((unsigned short *)dst) = ( \
866                    ((dR & 0xf8) << 8) | ((dG & 0xfc) << 3) | (dB >> 3) \
867                ); \
868            } \
869            src += 4; \
870            dst += 2; \
871            widthvar--; \
872        }
873        ONE_PIXEL_BLEND((UNALIGNED_PTR(dst)) && (width), width);
874        extrawidth = (width % 8);
875        valigner = VEC_ALIGNER(src);
876        vsrc = (vector unsigned char)vec_ld(0, src);
877        width -= extrawidth;
878        while (width) {
879            vector unsigned char valpha;
880            vector unsigned char vsrc1, vsrc2;
881            vector unsigned char vdst1, vdst2;
882            vector unsigned short vR, vG, vB;
883            vector unsigned short vpixel, vrpixel, vgpixel, vbpixel;
884
885            /* Load 8 pixels from src as ARGB */
886            voverflow = (vector unsigned char)vec_ld(15, src);
887            vsrc = vec_perm(vsrc, voverflow, valigner);
888            vsrc1 = vec_perm(vsrc, vsrc, vpermute);
889            src += 16;
890            vsrc = (vector unsigned char)vec_ld(15, src);
891            voverflow = vec_perm(voverflow, vsrc, valigner);
892            vsrc2 = vec_perm(voverflow, voverflow, vpermute);
893            src += 16;
894
895            /* Load 8 pixels from dst as XRGB */
896            voverflow = vec_ld(0, dst);
897            vR = vec_and((vector unsigned short)voverflow, vf800);
898            vB = vec_sl((vector unsigned short)voverflow, v3_16);
899            vG = vec_sl(vB, v2_16);
900            vdst1 = (vector unsigned char)vec_perm((vector unsigned char)vR, (vector unsigned char)vR, vredalpha1);
901            vdst1 = vec_perm(vdst1, (vector unsigned char)vB, vblue1);
902            vdst1 = vec_perm(vdst1, (vector unsigned char)vG, vgreen1);
903            vdst2 = (vector unsigned char)vec_perm((vector unsigned char)vR, (vector unsigned char)vR, vredalpha2);
904            vdst2 = vec_perm(vdst2, (vector unsigned char)vB, vblue2);
905            vdst2 = vec_perm(vdst2, (vector unsigned char)vG, vgreen2);
906
907            /* Alpha blend 8 pixels as ARGB */
908            valpha = vec_perm(vsrc1, v0, valphaPermute);
909            VEC_MULTIPLY_ALPHA(vsrc1, vdst1, valpha, mergePermute, v1_16, v8_16);
910            valpha = vec_perm(vsrc2, v0, valphaPermute);
911            VEC_MULTIPLY_ALPHA(vsrc2, vdst2, valpha, mergePermute, v1_16, v8_16);
912
913            /* Convert 8 pixels to 565 */
914            vpixel = (vector unsigned short)vec_packpx((vector unsigned int)vdst1, (vector unsigned int)vdst2);
915            vgpixel = (vector unsigned short)vec_perm(vdst1, vdst2, vgmerge);
916            vgpixel = vec_and(vgpixel, vfc);
917            vgpixel = vec_sl(vgpixel, v3_16);
918            vrpixel = vec_sl(vpixel, v1_16);
919            vrpixel = vec_and(vrpixel, vf800);
920            vbpixel = vec_and(vpixel, v3f);
921            vdst1 = vec_or((vector unsigned char)vrpixel, (vector unsigned char)vgpixel);
922            vdst1 = vec_or(vdst1, (vector unsigned char)vbpixel);
923
924            /* Store 8 pixels */
925            vec_st(vdst1, 0, dst);
926
927            width -= 8;
928            dst += 16;
929        }
930        ONE_PIXEL_BLEND((extrawidth), extrawidth);
931#undef ONE_PIXEL_BLEND
932        src += srcskip;
933        dst += dstskip;
934    }
935}
936
937static void Blit32to32SurfaceAlphaKeyAltivec(SDL_BlitInfo *info)
938{
939    unsigned alpha = info->src->alpha;
940    int height = info->d_height;
941    Uint32 *srcp = (Uint32 *)info->s_pixels;
942    int srcskip = info->s_skip >> 2;
943    Uint32 *dstp = (Uint32 *)info->d_pixels;
944    int dstskip = info->d_skip >> 2;
945    SDL_PixelFormat *srcfmt = info->src;
946    SDL_PixelFormat *dstfmt = info->dst;
947    unsigned sA = srcfmt->alpha;
948    unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
949    Uint32 rgbmask = srcfmt->Rmask | srcfmt->Gmask | srcfmt->Bmask;
950    Uint32 ckey = info->src->colorkey;
951    vector unsigned char mergePermute;
952    vector unsigned char vsrcPermute;
953    vector unsigned char vdstPermute;
954    vector unsigned char vsdstPermute;
955    vector unsigned char valpha;
956    vector unsigned char valphamask;
957    vector unsigned char vbits;
958    vector unsigned char v0;
959    vector unsigned short v1;
960    vector unsigned short v8;
961    vector unsigned int vckey;
962    vector unsigned int vrgbmask;
963
964    mergePermute = VEC_MERGE_PERMUTE();
965    v0 = vec_splat_u8(0);
966    v1 = vec_splat_u16(1);
967    v8 = vec_splat_u16(8);
968
969    /* set the alpha to 255 on the destination surf */
970    valphamask = VEC_ALPHA_MASK();
971
972    vsrcPermute = calc_swizzle32(srcfmt, NULL);
973    vdstPermute = calc_swizzle32(NULL, dstfmt);
974    vsdstPermute = calc_swizzle32(dstfmt, NULL);
975
976    /* set a vector full of alpha and 255-alpha */
977    ((unsigned char *)&valpha)[0] = alpha;
978    valpha = vec_splat(valpha, 0);
979    vbits = (vector unsigned char)vec_splat_s8(-1);
980
981    ckey &= rgbmask;
982    ((unsigned int *)(char*)&vckey)[0] = ckey;
983    vckey = vec_splat(vckey, 0);
984    ((unsigned int *)(char*)&vrgbmask)[0] = rgbmask;
985    vrgbmask = vec_splat(vrgbmask, 0);
986
987    while(height--) {
988        int width = info->d_width;
989#define ONE_PIXEL_BLEND(condition, widthvar) \
990        while (condition) { \
991            Uint32 Pixel; \
992            unsigned sR, sG, sB, dR, dG, dB; \
993            RETRIEVE_RGB_PIXEL(((Uint8 *)srcp), 4, Pixel); \
994            if(sA && Pixel != ckey) { \
995                RGB_FROM_PIXEL(Pixel, srcfmt, sR, sG, sB); \
996                DISEMBLE_RGB(((Uint8 *)dstp), 4, dstfmt, Pixel, dR, dG, dB); \
997                ACCURATE_ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
998                ASSEMBLE_RGBA(((Uint8 *)dstp), 4, dstfmt, dR, dG, dB, dA); \
999            } \
1000            dstp++; \
1001            srcp++; \
1002            widthvar--; \
1003        }
1004        ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
1005        if (width > 0) {
1006            int extrawidth = (width % 4);
1007            vector unsigned char valigner = VEC_ALIGNER(srcp);
1008            vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
1009            width -= extrawidth;
1010            while (width) {
1011                vector unsigned char vsel;
1012                vector unsigned char voverflow;
1013                vector unsigned char vd;
1014                vector unsigned char vd_orig;
1015
1016                /* s = *srcp */
1017                voverflow = (vector unsigned char)vec_ld(15, srcp);
1018                vs = vec_perm(vs, voverflow, valigner);
1019
1020                /* vsel is set for items that match the key */
1021                vsel = (vector unsigned char)vec_and((vector unsigned int)vs, vrgbmask);
1022                vsel = (vector unsigned char)vec_cmpeq((vector unsigned int)vsel, vckey);
1023
1024                /* permute to source format */
1025                vs = vec_perm(vs, valpha, vsrcPermute);
1026
1027                /* d = *dstp */
1028                vd = (vector unsigned char)vec_ld(0, dstp);
1029                vd_orig = vd = vec_perm(vd, v0, vsdstPermute);
1030
1031                VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
1032
1033                /* set the alpha channel to full on */
1034                vd = vec_or(vd, valphamask);
1035
1036                /* mask out color key */
1037                vd = vec_sel(vd, vd_orig, vsel);
1038
1039                /* permute to dest format */
1040                vd = vec_perm(vd, vbits, vdstPermute);
1041
1042                /* *dstp = res */
1043                vec_st((vector unsigned int)vd, 0, dstp);
1044
1045                srcp += 4;
1046                dstp += 4;
1047                width -= 4;
1048                vs = voverflow;
1049            }
1050            ONE_PIXEL_BLEND((extrawidth), extrawidth);
1051        }
1052#undef ONE_PIXEL_BLEND
1053
1054        srcp += srcskip;
1055        dstp += dstskip;
1056    }
1057}
1058
1059
1060static void Blit32to32PixelAlphaAltivec(SDL_BlitInfo *info)
1061{
1062    int width = info->d_width;
1063    int height = info->d_height;
1064    Uint32 *srcp = (Uint32 *)info->s_pixels;
1065    int srcskip = info->s_skip >> 2;
1066    Uint32 *dstp = (Uint32 *)info->d_pixels;
1067    int dstskip = info->d_skip >> 2;
1068    SDL_PixelFormat *srcfmt = info->src;
1069    SDL_PixelFormat *dstfmt = info->dst;
1070    vector unsigned char mergePermute;
1071    vector unsigned char valphaPermute;
1072    vector unsigned char vsrcPermute;
1073    vector unsigned char vdstPermute;
1074    vector unsigned char vsdstPermute;
1075    vector unsigned char valphamask;
1076    vector unsigned char vpixelmask;
1077    vector unsigned char v0;
1078    vector unsigned short v1;
1079    vector unsigned short v8;
1080
1081    v0 = vec_splat_u8(0);
1082    v1 = vec_splat_u16(1);
1083    v8 = vec_splat_u16(8);
1084    mergePermute = VEC_MERGE_PERMUTE();
1085    valphamask = VEC_ALPHA_MASK();
1086    valphaPermute = vec_and(vec_lvsl(0, (int *)NULL), vec_splat_u8(0xC));
1087    vpixelmask = vec_nor(valphamask, v0);
1088    vsrcPermute = calc_swizzle32(srcfmt, NULL);
1089    vdstPermute = calc_swizzle32(NULL, dstfmt);
1090    vsdstPermute = calc_swizzle32(dstfmt, NULL);
1091
1092	while ( height-- ) {
1093        width = info->d_width;
1094#define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \
1095            Uint32 Pixel; \
1096            unsigned sR, sG, sB, dR, dG, dB, sA, dA; \
1097            DISEMBLE_RGBA((Uint8 *)srcp, 4, srcfmt, Pixel, sR, sG, sB, sA); \
1098            if(sA) { \
1099              DISEMBLE_RGBA((Uint8 *)dstp, 4, dstfmt, Pixel, dR, dG, dB, dA); \
1100              ACCURATE_ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
1101              ASSEMBLE_RGBA((Uint8 *)dstp, 4, dstfmt, dR, dG, dB, dA); \
1102            } \
1103            ++srcp; \
1104            ++dstp; \
1105            widthvar--; \
1106        }
1107        ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
1108        if (width > 0) {
1109            /* vsrcPermute */
1110            /* vdstPermute */
1111            int extrawidth = (width % 4);
1112            vector unsigned char valigner = VEC_ALIGNER(srcp);
1113            vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
1114            width -= extrawidth;
1115            while (width) {
1116                vector unsigned char voverflow;
1117                vector unsigned char vd;
1118                vector unsigned char valpha;
1119                vector unsigned char vdstalpha;
1120                /* s = *srcp */
1121                voverflow = (vector unsigned char)vec_ld(15, srcp);
1122                vs = vec_perm(vs, voverflow, valigner);
1123                vs = vec_perm(vs, v0, vsrcPermute);
1124
1125                valpha = vec_perm(vs, v0, valphaPermute);
1126
1127                /* d = *dstp */
1128                vd = (vector unsigned char)vec_ld(0, dstp);
1129                vd = vec_perm(vd, v0, vsdstPermute);
1130                vdstalpha = vec_and(vd, valphamask);
1131
1132                VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
1133
1134                /* set the alpha to the dest alpha */
1135                vd = vec_and(vd, vpixelmask);
1136                vd = vec_or(vd, vdstalpha);
1137                vd = vec_perm(vd, v0, vdstPermute);
1138
1139                /* *dstp = res */
1140                vec_st((vector unsigned int)vd, 0, dstp);
1141
1142                srcp += 4;
1143                dstp += 4;
1144                width -= 4;
1145                vs = voverflow;
1146
1147            }
1148            ONE_PIXEL_BLEND((extrawidth), extrawidth);
1149        }
1150	    srcp += srcskip;
1151	    dstp += dstskip;
1152#undef ONE_PIXEL_BLEND
1153	}
1154}
1155
1156/* fast ARGB888->(A)RGB888 blending with pixel alpha */
1157static void BlitRGBtoRGBPixelAlphaAltivec(SDL_BlitInfo *info)
1158{
1159	int width = info->d_width;
1160	int height = info->d_height;
1161	Uint32 *srcp = (Uint32 *)info->s_pixels;
1162	int srcskip = info->s_skip >> 2;
1163	Uint32 *dstp = (Uint32 *)info->d_pixels;
1164	int dstskip = info->d_skip >> 2;
1165    vector unsigned char mergePermute;
1166    vector unsigned char valphaPermute;
1167    vector unsigned char valphamask;
1168    vector unsigned char vpixelmask;
1169    vector unsigned char v0;
1170    vector unsigned short v1;
1171    vector unsigned short v8;
1172    v0 = vec_splat_u8(0);
1173    v1 = vec_splat_u16(1);
1174    v8 = vec_splat_u16(8);
1175    mergePermute = VEC_MERGE_PERMUTE();
1176    valphamask = VEC_ALPHA_MASK();
1177    valphaPermute = vec_and(vec_lvsl(0, (int *)NULL), vec_splat_u8(0xC));
1178
1179
1180    vpixelmask = vec_nor(valphamask, v0);
1181	while(height--) {
1182        width = info->d_width;
1183#define ONE_PIXEL_BLEND(condition, widthvar) \
1184        while ((condition)) { \
1185            Uint32 dalpha; \
1186            Uint32 d; \
1187            Uint32 s1; \
1188            Uint32 d1; \
1189            Uint32 s = *srcp; \
1190            Uint32 alpha = s >> 24; \
1191            if(alpha) { \
1192              if(alpha == SDL_ALPHA_OPAQUE) { \
1193                *dstp = (s & 0x00ffffff) | (*dstp & 0xff000000); \
1194              } else { \
1195                d = *dstp; \
1196                dalpha = d & 0xff000000; \
1197                s1 = s & 0xff00ff; \
1198                d1 = d & 0xff00ff; \
1199                d1 = (d1 + ((s1 - d1) * alpha >> 8)) & 0xff00ff; \
1200                s &= 0xff00; \
1201                d &= 0xff00; \
1202                d = (d + ((s - d) * alpha >> 8)) & 0xff00; \
1203                *dstp = d1 | d | dalpha; \
1204              } \
1205            } \
1206            ++srcp; \
1207            ++dstp; \
1208            widthvar--; \
1209	    }
1210        ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
1211        if (width > 0) {
1212            int extrawidth = (width % 4);
1213            vector unsigned char valigner = VEC_ALIGNER(srcp);
1214            vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
1215            width -= extrawidth;
1216            while (width) {
1217                vector unsigned char voverflow;
1218                vector unsigned char vd;
1219                vector unsigned char valpha;
1220                vector unsigned char vdstalpha;
1221                /* s = *srcp */
1222                voverflow = (vector unsigned char)vec_ld(15, srcp);
1223                vs = vec_perm(vs, voverflow, valigner);
1224
1225                valpha = vec_perm(vs, v0, valphaPermute);
1226
1227                /* d = *dstp */
1228                vd = (vector unsigned char)vec_ld(0, dstp);
1229                vdstalpha = vec_and(vd, valphamask);
1230
1231                VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
1232
1233                /* set the alpha to the dest alpha */
1234                vd = vec_and(vd, vpixelmask);
1235                vd = vec_or(vd, vdstalpha);
1236
1237                /* *dstp = res */
1238                vec_st((vector unsigned int)vd, 0, dstp);
1239
1240                srcp += 4;
1241                dstp += 4;
1242                width -= 4;
1243                vs = voverflow;
1244            }
1245            ONE_PIXEL_BLEND((extrawidth), extrawidth);
1246        }
1247	    srcp += srcskip;
1248	    dstp += dstskip;
1249	}
1250#undef ONE_PIXEL_BLEND
1251}
1252
1253static void Blit32to32SurfaceAlphaAltivec(SDL_BlitInfo *info)
1254{
1255    /* XXX : 6 */
1256	unsigned alpha = info->src->alpha;
1257    int height = info->d_height;
1258    Uint32 *srcp = (Uint32 *)info->s_pixels;
1259    int srcskip = info->s_skip >> 2;
1260    Uint32 *dstp = (Uint32 *)info->d_pixels;
1261    int dstskip = info->d_skip >> 2;
1262    SDL_PixelFormat *srcfmt = info->src;
1263    SDL_PixelFormat *dstfmt = info->dst;
1264	unsigned sA = srcfmt->alpha;
1265	unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
1266    vector unsigned char mergePermute;
1267    vector unsigned char vsrcPermute;
1268    vector unsigned char vdstPermute;
1269    vector unsigned char vsdstPermute;
1270    vector unsigned char valpha;
1271    vector unsigned char valphamask;
1272    vector unsigned char vbits;
1273    vector unsigned short v1;
1274    vector unsigned short v8;
1275
1276    mergePermute = VEC_MERGE_PERMUTE();
1277    v1 = vec_splat_u16(1);
1278    v8 = vec_splat_u16(8);
1279
1280    /* set the alpha to 255 on the destination surf */
1281    valphamask = VEC_ALPHA_MASK();
1282
1283    vsrcPermute = calc_swizzle32(srcfmt, NULL);
1284    vdstPermute = calc_swizzle32(NULL, dstfmt);
1285    vsdstPermute = calc_swizzle32(dstfmt, NULL);
1286
1287    /* set a vector full of alpha and 255-alpha */
1288    ((unsigned char *)&valpha)[0] = alpha;
1289    valpha = vec_splat(valpha, 0);
1290    vbits = (vector unsigned char)vec_splat_s8(-1);
1291
1292    while(height--) {
1293        int width = info->d_width;
1294#define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \
1295            Uint32 Pixel; \
1296            unsigned sR, sG, sB, dR, dG, dB; \
1297            DISEMBLE_RGB(((Uint8 *)srcp), 4, srcfmt, Pixel, sR, sG, sB); \
1298            DISEMBLE_RGB(((Uint8 *)dstp), 4, dstfmt, Pixel, dR, dG, dB); \
1299            ACCURATE_ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
1300            ASSEMBLE_RGBA(((Uint8 *)dstp), 4, dstfmt, dR, dG, dB, dA); \
1301            ++srcp; \
1302            ++dstp; \
1303            widthvar--; \
1304        }
1305        ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
1306        if (width > 0) {
1307            int extrawidth = (width % 4);
1308            vector unsigned char valigner = VEC_ALIGNER(srcp);
1309            vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
1310            width -= extrawidth;
1311            while (width) {
1312                vector unsigned char voverflow;
1313                vector unsigned char vd;
1314
1315                /* s = *srcp */
1316                voverflow = (vector unsigned char)vec_ld(15, srcp);
1317                vs = vec_perm(vs, voverflow, valigner);
1318                vs = vec_perm(vs, valpha, vsrcPermute);
1319
1320                /* d = *dstp */
1321                vd = (vector unsigned char)vec_ld(0, dstp);
1322                vd = vec_perm(vd, vd, vsdstPermute);
1323
1324                VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
1325
1326                /* set the alpha channel to full on */
1327                vd = vec_or(vd, valphamask);
1328                vd = vec_perm(vd, vbits, vdstPermute);
1329
1330                /* *dstp = res */
1331                vec_st((vector unsigned int)vd, 0, dstp);
1332
1333                srcp += 4;
1334                dstp += 4;
1335                width -= 4;
1336                vs = voverflow;
1337            }
1338            ONE_PIXEL_BLEND((extrawidth), extrawidth);
1339        }
1340#undef ONE_PIXEL_BLEND
1341
1342        srcp += srcskip;
1343        dstp += dstskip;
1344    }
1345
1346}
1347
1348
1349/* fast RGB888->(A)RGB888 blending */
1350static void BlitRGBtoRGBSurfaceAlphaAltivec(SDL_BlitInfo *info)
1351{
1352	unsigned alpha = info->src->alpha;
1353    int height = info->d_height;
1354    Uint32 *srcp = (Uint32 *)info->s_pixels;
1355    int srcskip = info->s_skip >> 2;
1356    Uint32 *dstp = (Uint32 *)info->d_pixels;
1357    int dstskip = info->d_skip >> 2;
1358    vector unsigned char mergePermute;
1359    vector unsigned char valpha;
1360    vector unsigned char valphamask;
1361    vector unsigned short v1;
1362    vector unsigned short v8;
1363
1364    mergePermute = VEC_MERGE_PERMUTE();
1365    v1 = vec_splat_u16(1);
1366    v8 = vec_splat_u16(8);
1367
1368    /* set the alpha to 255 on the destination surf */
1369    valphamask = VEC_ALPHA_MASK();
1370
1371    /* set a vector full of alpha and 255-alpha */
1372    ((unsigned char *)&valpha)[0] = alpha;
1373    valpha = vec_splat(valpha, 0);
1374
1375    while(height--) {
1376        int width = info->d_width;
1377#define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \
1378            Uint32 s = *srcp; \
1379            Uint32 d = *dstp; \
1380            Uint32 s1 = s & 0xff00ff; \
1381            Uint32 d1 = d & 0xff00ff; \
1382            d1 = (d1 + ((s1 - d1) * alpha >> 8)) \
1383                 & 0xff00ff; \
1384            s &= 0xff00; \
1385            d &= 0xff00; \
1386            d = (d + ((s - d) * alpha >> 8)) & 0xff00; \
1387            *dstp = d1 | d | 0xff000000; \
1388            ++srcp; \
1389            ++dstp; \
1390            widthvar--; \
1391        }
1392        ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
1393        if (width > 0) {
1394            int extrawidth = (width % 4);
1395            vector unsigned char valigner = VEC_ALIGNER(srcp);
1396            vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
1397            width -= extrawidth;
1398            while (width) {
1399                vector unsigned char voverflow;
1400                vector unsigned char vd;
1401
1402                /* s = *srcp */
1403                voverflow = (vector unsigned char)vec_ld(15, srcp);
1404                vs = vec_perm(vs, voverflow, valigner);
1405
1406                /* d = *dstp */
1407                vd = (vector unsigned char)vec_ld(0, dstp);
1408
1409                VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
1410
1411                /* set the alpha channel to full on */
1412                vd = vec_or(vd, valphamask);
1413
1414                /* *dstp = res */
1415                vec_st((vector unsigned int)vd, 0, dstp);
1416
1417                srcp += 4;
1418                dstp += 4;
1419                width -= 4;
1420                vs = voverflow;
1421            }
1422            ONE_PIXEL_BLEND((extrawidth), extrawidth);
1423        }
1424#undef ONE_PIXEL_BLEND
1425
1426        srcp += srcskip;
1427        dstp += dstskip;
1428    }
1429}
1430#if __MWERKS__
1431#pragma altivec_model off
1432#endif
1433#endif /* SDL_ALTIVEC_BLITTERS */
1434
1435/* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
1436static void BlitRGBtoRGBSurfaceAlpha128(SDL_BlitInfo *info)
1437{
1438	int width = info->d_width;
1439	int height = info->d_height;
1440	Uint32 *srcp = (Uint32 *)info->s_pixels;
1441	int srcskip = info->s_skip >> 2;
1442	Uint32 *dstp = (Uint32 *)info->d_pixels;
1443	int dstskip = info->d_skip >> 2;
1444
1445	while(height--) {
1446	    DUFFS_LOOP4({
1447		    Uint32 s = *srcp++;
1448		    Uint32 d = *dstp;
1449		    *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
1450			       + (s & d & 0x00010101)) | 0xff000000;
1451	    }, width);
1452	    srcp += srcskip;
1453	    dstp += dstskip;
1454	}
1455}
1456
1457/* fast RGB888->(A)RGB888 blending with surface alpha */
1458static void BlitRGBtoRGBSurfaceAlpha(SDL_BlitInfo *info)
1459{
1460	unsigned alpha = info->src->alpha;
1461	if(alpha == 128) {
1462		BlitRGBtoRGBSurfaceAlpha128(info);
1463	} else {
1464		int width = info->d_width;
1465		int height = info->d_height;
1466		Uint32 *srcp = (Uint32 *)info->s_pixels;
1467		int srcskip = info->s_skip >> 2;
1468		Uint32 *dstp = (Uint32 *)info->d_pixels;
1469		int dstskip = info->d_skip >> 2;
1470		Uint32 s;
1471		Uint32 d;
1472		Uint32 s1;
1473		Uint32 d1;
1474
1475		while(height--) {
1476			DUFFS_LOOP_DOUBLE2({
1477				/* One Pixel Blend */
1478				s = *srcp;
1479				d = *dstp;
1480				s1 = s & 0xff00ff;
1481				d1 = d & 0xff00ff;
1482				d1 = (d1 + ((s1 - d1) * alpha >> 8))
1483				     & 0xff00ff;
1484				s &= 0xff00;
1485				d &= 0xff00;
1486				d = (d + ((s - d) * alpha >> 8)) & 0xff00;
1487				*dstp = d1 | d | 0xff000000;
1488				++srcp;
1489				++dstp;
1490			},{
1491			        /* Two Pixels Blend */
1492				s = *srcp;
1493				d = *dstp;
1494				s1 = s & 0xff00ff;
1495				d1 = d & 0xff00ff;
1496				d1 += (s1 - d1) * alpha >> 8;
1497				d1 &= 0xff00ff;
1498
1499				s = ((s & 0xff00) >> 8) |
1500					((srcp[1] & 0xff00) << 8);
1501				d = ((d & 0xff00) >> 8) |
1502					((dstp[1] & 0xff00) << 8);
1503				d += (s - d) * alpha >> 8;
1504				d &= 0x00ff00ff;
1505
1506				*dstp++ = d1 | ((d << 8) & 0xff00) | 0xff000000;
1507				++srcp;
1508
1509			        s1 = *srcp;
1510				d1 = *dstp;
1511				s1 &= 0xff00ff;
1512				d1 &= 0xff00ff;
1513				d1 += (s1 - d1) * alpha >> 8;
1514				d1 &= 0xff00ff;
1515
1516				*dstp = d1 | ((d >> 8) & 0xff00) | 0xff000000;
1517				++srcp;
1518				++dstp;
1519			}, width);
1520			srcp += srcskip;
1521			dstp += dstskip;
1522		}
1523	}
1524}
1525
1526/* fast ARGB888->(A)RGB888 blending with pixel alpha */
1527static void BlitRGBtoRGBPixelAlpha(SDL_BlitInfo *info)
1528{
1529	int width = info->d_width;
1530	int height = info->d_height;
1531	Uint32 *srcp = (Uint32 *)info->s_pixels;
1532	int srcskip = info->s_skip >> 2;
1533	Uint32 *dstp = (Uint32 *)info->d_pixels;
1534	int dstskip = info->d_skip >> 2;
1535
1536	while(height--) {
1537	    DUFFS_LOOP4({
1538		Uint32 dalpha;
1539		Uint32 d;
1540		Uint32 s1;
1541		Uint32 d1;
1542		Uint32 s = *srcp;
1543		Uint32 alpha = s >> 24;
1544		/* FIXME: Here we special-case opaque alpha since the
1545		   compositioning used (>>8 instead of /255) doesn't handle
1546		   it correctly. Also special-case alpha=0 for speed?
1547		   Benchmark this! */
1548		if(alpha) {
1549		  if(alpha == SDL_ALPHA_OPAQUE) {
1550		    *dstp = (s & 0x00ffffff) | (*dstp & 0xff000000);
1551		  } else {
1552		    /*
1553		     * take out the middle component (green), and process
1554		     * the other two in parallel. One multiply less.
1555		     */
1556		    d = *dstp;
1557		    dalpha = d & 0xff000000;
1558		    s1 = s & 0xff00ff;
1559		    d1 = d & 0xff00ff;
1560		    d1 = (d1 + ((s1 - d1) * alpha >> 8)) & 0xff00ff;
1561		    s &= 0xff00;
1562		    d &= 0xff00;
1563		    d = (d + ((s - d) * alpha >> 8)) & 0xff00;
1564		    *dstp = d1 | d | dalpha;
1565		  }
1566		}
1567		++srcp;
1568		++dstp;
1569	    }, width);
1570	    srcp += srcskip;
1571	    dstp += dstskip;
1572	}
1573}
1574
1575#if GCC_ASMBLIT
1576/* fast (as in MMX with prefetch) ARGB888->(A)RGB888 blending with pixel alpha */
1577static void BlitRGBtoRGBPixelAlphaMMX3DNOW(SDL_BlitInfo *info)
1578{
1579	int width = info->d_width;
1580	int height = info->d_height;
1581	Uint32 *srcp = (Uint32 *)info->s_pixels;
1582	int srcskip = info->s_skip >> 2;
1583	Uint32 *dstp = (Uint32 *)info->d_pixels;
1584	int dstskip = info->d_skip >> 2;
1585	SDL_PixelFormat* sf = info->src;
1586	Uint32 amask = sf->Amask;
1587
1588	__asm__ (
1589	/* make mm6 all zeros. */
1590	"pxor       %%mm6, %%mm6\n"
1591
1592	/* Make a mask to preserve the alpha. */
1593	"movd      %0, %%mm7\n\t"           /* 0000F000 -> mm7 */
1594	"punpcklbw %%mm7, %%mm7\n\t"        /* FF000000 -> mm7 */
1595	"pcmpeqb   %%mm4, %%mm4\n\t"        /* FFFFFFFF -> mm4 */
1596	"movq      %%mm4, %%mm3\n\t"        /* FFFFFFFF -> mm3 (for later) */
1597	"pxor      %%mm4, %%mm7\n\t"        /* 00FFFFFF -> mm7 (mult mask) */
1598
1599	/* form channel masks */
1600	"movq      %%mm7, %%mm4\n\t"        /* 00FFFFFF -> mm4 */
1601	"packsswb  %%mm6, %%mm4\n\t"        /* 00000FFF -> mm4 (channel mask) */
1602	"packsswb  %%mm6, %%mm3\n\t"        /* 0000FFFF -> mm3 */
1603	"pxor      %%mm4, %%mm3\n\t"        /* 0000F000 -> mm3 (~channel mask) */
1604
1605	/* get alpha channel shift */
1606	"movd      %1, %%mm5\n\t" /* Ashift -> mm5 */
1607
1608	  : /* nothing */ : "rm" (amask), "rm" ((Uint32) sf->Ashift) );
1609
1610	while(height--) {
1611
1612	    DUFFS_LOOP4({
1613		Uint32 alpha;
1614
1615		__asm__ (
1616		"prefetch 64(%0)\n"
1617		"prefetch 64(%1)\n"
1618			: : "r" (srcp), "r" (dstp) );
1619
1620		alpha = *srcp & amask;
1621		/* FIXME: Here we special-case opaque alpha since the
1622		   compositioning used (>>8 instead of /255) doesn't handle
1623		   it correctly. Also special-case alpha=0 for speed?
1624		   Benchmark this! */
1625		if(alpha == 0) {
1626		    /* do nothing */
1627		}
1628		else if(alpha == amask) {
1629			/* opaque alpha -- copy RGB, keep dst alpha */
1630		    /* using MMX here to free up regular registers for other things */
1631			    __asm__ (
1632		    "movd      (%0),  %%mm0\n\t" /* src(ARGB) -> mm0 (0000ARGB)*/
1633		    "movd      (%1),  %%mm1\n\t" /* dst(ARGB) -> mm1 (0000ARGB)*/
1634		    "pand      %%mm4, %%mm0\n\t" /* src & chanmask -> mm0 */
1635		    "pand      %%mm3, %%mm1\n\t" /* dst & ~chanmask -> mm2 */
1636		    "por       %%mm0, %%mm1\n\t" /* src | dst -> mm1 */
1637		    "movd      %%mm1, (%1) \n\t" /* mm1 -> dst */
1638
1639		     : : "r" (srcp), "r" (dstp) );
1640		}
1641
1642		else {
1643			    __asm__ (
1644		    /* load in the source, and dst. */
1645		    "movd      (%0), %%mm0\n"		    /* mm0(s) = 0 0 0 0 | As Rs Gs Bs */
1646		    "movd      (%1), %%mm1\n"		    /* mm1(d) = 0 0 0 0 | Ad Rd Gd Bd */
1647
1648		    /* Move the src alpha into mm2 */
1649
1650		    /* if supporting pshufw */
1651		    /*"pshufw     $0x55, %%mm0, %%mm2\n" */ /* mm2 = 0 As 0 As |  0 As  0  As */
1652		    /*"psrlw     $8, %%mm2\n" */
1653
1654		    /* else: */
1655		    "movd       %2,    %%mm2\n"
1656		    "psrld      %%mm5, %%mm2\n"                /* mm2 = 0 0 0 0 | 0  0  0  As */
1657		    "punpcklwd	%%mm2, %%mm2\n"	            /* mm2 = 0 0 0 0 |  0 As  0  As */
1658		    "punpckldq	%%mm2, %%mm2\n"             /* mm2 = 0 As 0 As |  0 As  0  As */
1659		    "pand       %%mm7, %%mm2\n"              /* to preserve dest alpha */
1660
1661		    /* move the colors into words. */
1662		    "punpcklbw %%mm6, %%mm0\n"		    /* mm0 = 0 As 0 Rs | 0 Gs 0 Bs */
1663		    "punpcklbw %%mm6, %%mm1\n"              /* mm0 = 0 Ad 0 Rd | 0 Gd 0 Bd */
1664
1665		    /* src - dst */
1666		    "psubw    %%mm1, %%mm0\n"		    /* mm0 = As-Ad Rs-Rd | Gs-Gd  Bs-Bd */
1667
1668		    /* A * (src-dst) */
1669		    "pmullw    %%mm2, %%mm0\n"		    /* mm0 = 0*As-d As*Rs-d | As*Gs-d  As*Bs-d */
1670		    "psrlw     $8,    %%mm0\n"		    /* mm0 = 0>>8 Rc>>8 | Gc>>8  Bc>>8 */
1671		    "paddb     %%mm1, %%mm0\n"		    /* mm0 = 0+Ad Rc+Rd | Gc+Gd  Bc+Bd */
1672
1673		    "packuswb  %%mm0, %%mm0\n"              /* mm0 =             | Ac Rc Gc Bc */
1674
1675		    "movd      %%mm0, (%1)\n"               /* result in mm0 */
1676
1677		     : : "r" (srcp), "r" (dstp), "r" (alpha) );
1678
1679		}
1680		++srcp;
1681		++dstp;
1682	    }, width);
1683	    srcp += srcskip;
1684	    dstp += dstskip;
1685	}
1686
1687	__asm__ (
1688	"emms\n"
1689		:   );
1690}
1691/* End GCC_ASMBLIT*/
1692
1693#elif MSVC_ASMBLIT
1694/* fast (as in MMX with prefetch) ARGB888->(A)RGB888 blending with pixel alpha */
1695static void BlitRGBtoRGBPixelAlphaMMX3DNOW(SDL_BlitInfo *info)
1696{
1697	int width = info->d_width;
1698	int height = info->d_height;
1699	Uint32 *srcp = (Uint32 *)info->s_pixels;
1700	int srcskip = info->s_skip >> 2;
1701	Uint32 *dstp = (Uint32 *)info->d_pixels;
1702	int dstskip = info->d_skip >> 2;
1703	SDL_PixelFormat* sf = info->src;
1704	Uint32 chanmask = sf->Rmask | sf->Gmask | sf->Bmask;
1705	Uint32 amask = sf->Amask;
1706	Uint32 ashift = sf->Ashift;
1707	Uint64 multmask;
1708
1709	__m64 src1, dst1, mm_alpha, mm_zero, dmask;
1710
1711	mm_zero = _mm_setzero_si64(); /* 0 -> mm_zero */
1712	multmask = ~(0xFFFFi64 << (ashift * 2));
1713	dmask = *(__m64*) &multmask; /* dst alpha mask -> dmask */
1714
1715	while(height--) {
1716	    DUFFS_LOOP4({
1717		Uint32 alpha;
1718
1719		_m_prefetch(srcp + 16);
1720		_m_prefetch(dstp + 16);
1721
1722		alpha = *srcp & amask;
1723		if (alpha == 0) {
1724			/* do nothing */
1725		} else if (alpha == amask) {
1726			/* copy RGB, keep dst alpha */
1727			*dstp = (*srcp & chanmask) | (*dstp & ~chanmask);
1728		} else {
1729			src1 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src1 (0000ARGB)*/
1730			src1 = _mm_unpacklo_pi8(src1, mm_zero); /* 0A0R0G0B -> src1 */
1731
1732			dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB)*/
1733			dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
1734
1735			mm_alpha = _mm_cvtsi32_si64(alpha); /* alpha -> mm_alpha (0000000A) */
1736			mm_alpha = _mm_srli_si64(mm_alpha, ashift); /* mm_alpha >> ashift -> mm_alpha(0000000A) */
1737			mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
1738			mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
1739			mm_alpha = _mm_and_si64(mm_alpha, dmask); /* 000A0A0A -> mm_alpha, preserve dst alpha on add */
1740
1741			/* blend */
1742			src1 = _mm_sub_pi16(src1, dst1);/* src - dst -> src1 */
1743			src1 = _mm_mullo_pi16(src1, mm_alpha); /* (src - dst) * alpha -> src1 */
1744			src1 = _mm_srli_pi16(src1, 8); /* src1 >> 8 -> src1(000R0G0B) */
1745			dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1(dst) -> dst1(0A0R0G0B) */
1746			dst1 = _mm_packs_pu16(dst1, mm_zero);  /* 0000ARGB -> dst1 */
1747
1748			*dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
1749		}
1750		++srcp;
1751		++dstp;
1752	    }, width);
1753	    srcp += srcskip;
1754	    dstp += dstskip;
1755	}
1756	_mm_empty();
1757}
1758/* End MSVC_ASMBLIT */
1759
1760#endif /* GCC_ASMBLIT, MSVC_ASMBLIT */
1761
1762/* 16bpp special case for per-surface alpha=50%: blend 2 pixels in parallel */
1763
/*
 * Blend a single 16 bit pixel at 50%.
 *
 * d, s:  destination and source pixel values.
 * mask:  16-bit mask of the bits that take part in the halved sum; callers
 *        in this file pass 0xf7de (565) and 0xfbde (555).
 *
 * The masked values are added and halved; the final term adds back the
 * bits outside `mask` wherever both pixels have them set.
 * Every argument is evaluated more than once, so pass side-effect free
 * expressions only.
 */
#define BLEND16_50(d, s, mask)						\
	(((((s) & (mask)) + ((d) & (mask))) >> 1) +			\
	 ((s) & (d) & (~(mask) & 0xffff)))

/*
 * Blend two 16 bit pixels, packed in one 32-bit word, at 50%.
 *
 * Same scheme as BLEND16_50 with `mask` replicated into both halves of the
 * word.  The mask is widened to Uint32 before it is shifted: a Uint16
 * promotes to (signed) int, and shifting a value such as 0xf7de left by 16
 * would overflow a 32-bit int.
 */
#define BLEND2x16_50(d, s, mask)					     \
	((((s) & ((Uint32)(mask) | ((Uint32)(mask) << 16))) >> 1)	     \
	 + (((d) & ((Uint32)(mask) | ((Uint32)(mask) << 16))) >> 1)	     \
	 + ((s) & (d) & ~((Uint32)(mask) | ((Uint32)(mask) << 16))))
1772
/*
 * Blend a 16bpp source onto a 16bpp destination at 50% using the
 * BLEND16_50 / BLEND2x16_50 macros.
 *
 * info: blit description.  d_width/d_height give the area in pixels;
 *       s_skip/d_skip are halved here so they can be added to the Uint16
 *       row pointers at the end of each row.
 * mask: handed unchanged to the blend macros (callers in this file pass
 *       0xf7de for 565 and 0xfbde for 555).
 *
 * Each row is processed two pixels at a time through 32-bit loads and
 * stores on the destination.  Which of the two code paths runs depends on
 * whether the source and destination pointers agree in bit 1 of their
 * address.
 */
static void Blit16to16SurfaceAlpha128(SDL_BlitInfo *info, Uint16 mask)
{
	int width = info->d_width;
	int height = info->d_height;
	Uint16 *srcp = (Uint16 *)info->s_pixels;
	int srcskip = info->s_skip >> 1;
	Uint16 *dstp = (Uint16 *)info->d_pixels;
	int dstskip = info->d_skip >> 1;

	while(height--) {
		/* bit 1 differs: the pointers can never both sit on a 32-bit boundary */
		if(((uintptr_t)srcp ^ (uintptr_t)dstp) & 2) {
			/*
			 * Source and destination not aligned, pipeline it.
			 * This is mostly a win for big blits but no loss for
			 * small ones
			 */
			Uint32 prev_sw;	/* previously loaded source word */
			int w = width;

			/* handle odd destination */
			if((uintptr_t)dstp & 2) {
				Uint16 d = *dstp, s = *srcp;
				*dstp = BLEND16_50(d, s, mask);
				dstp++;
				srcp++;
				w--;
			}
			/*
			 * Step the source one halfword ahead of the
			 * destination; this is undone by the "- 1" at the
			 * end of the row.
			 */
			srcp++;	/* srcp is now 32-bit aligned */

			/* bootstrap pipeline with first halfword */
			/*
			 * NOTE(review): this loads the whole 32-bit word
			 * before srcp.  When the odd-destination step above
			 * did not run, that word also covers the halfword
			 * just before this row's first source pixel; only
			 * the other half is used.  Confirm callers tolerate
			 * that read.
			 */
			prev_sw = ((Uint32 *)srcp)[-1];

			while(w > 1) {
				Uint32 sw, dw, s;
				/*
				 * NOTE(review): sw spans one halfword beyond
				 * the pixel pair being blended; on the last
				 * pass that can lie past the pixels this row
				 * needs - confirm.
				 */
				sw = *(Uint32 *)srcp;
				dw = *(Uint32 *)dstp;
				/*
				 * Build the source pair matching dw from one
				 * half of the previous word and one half of
				 * the new word.
				 */
#if SDL_BYTEORDER == SDL_BIG_ENDIAN
				s = (prev_sw << 16) + (sw >> 16);
#else
				s = (prev_sw >> 16) + (sw << 16);
#endif
				prev_sw = sw;
				*(Uint32 *)dstp = BLEND2x16_50(dw, s, mask);
				dstp += 2;
				srcp += 2;
				w -= 2;
			}

			/* final pixel if any */
			if(w) {
				Uint16 d = *dstp, s;
				/* the pending source pixel is still held in prev_sw */
#if SDL_BYTEORDER == SDL_BIG_ENDIAN
				s = (Uint16)prev_sw;
#else
				s = (Uint16)(prev_sw >> 16);
#endif
				*dstp = BLEND16_50(d, s, mask);
				srcp++;
				dstp++;
			}
			/* "- 1" takes back the extra srcp++ done before the pipeline */
			srcp += srcskip - 1;
			dstp += dstskip;
		} else {
			/* source and destination are aligned */
			int w = width;

			/* first odd pixel? */
			if((uintptr_t)srcp & 2) {
				Uint16 d = *dstp, s = *srcp;
				*dstp = BLEND16_50(d, s, mask);
				srcp++;
				dstp++;
				w--;
			}
			/* srcp and dstp are now 32-bit aligned */

			/* blend two pixels per iteration through 32-bit accesses */
			while(w > 1) {
				Uint32 sw = *(Uint32 *)srcp;
				Uint32 dw = *(Uint32 *)dstp;
				*(Uint32 *)dstp = BLEND2x16_50(dw, sw, mask);
				srcp += 2;
				dstp += 2;
				w -= 2;
			}

			/* last odd pixel? */
			if(w) {
				Uint16 d = *dstp, s = *srcp;
				*dstp = BLEND16_50(d, s, mask);
				srcp++;
				dstp++;
			}
			srcp += srcskip;
			dstp += dstskip;
		}
	}
}
1870
1871#if GCC_ASMBLIT
/*
 * fast RGB565->RGB565 blending with surface alpha
 * (GCC build: MMX through the register macros from mmx.h)
 *
 * info: blit description.  info->src->alpha is the per-surface alpha;
 *       d_width/d_height give the area in pixels; s_skip/d_skip are halved
 *       to step the Uint16 pointers between rows.
 *
 * alpha == 128 is forwarded to Blit16to16SurfaceAlpha128().  Otherwise
 * DUFFS_LOOP_QUATRO2 is given three bodies which blend one, two and four
 * pixels respectively; the four-pixel body is the MMX one.
 *
 * MMX register use in the four-pixel body:
 *   mm0 alpha multiplier   mm4 green mask   mm7 blue mask
 *   mm2 source pixels      mm3 destination pixels
 *   mm5/mm6 scratch        mm1 accumulated result
 */
static void Blit565to565SurfaceAlphaMMX(SDL_BlitInfo *info)
{
	unsigned alpha = info->src->alpha; /* per-surface alpha; reduced to 5 bits further down */
	if(alpha == 128) {
		/* exactly 50%: use the dedicated two-pixels-at-a-time blitter */
		Blit16to16SurfaceAlpha128(info, 0xf7de);
	} else {
		int width = info->d_width;
		int height = info->d_height;
		Uint16 *srcp = (Uint16 *)info->s_pixels;
		int srcskip = info->s_skip >> 1;
		Uint16 *dstp = (Uint16 *)info->d_pixels;
		int dstskip = info->d_skip >> 1;
		Uint32 s, d;	/* scalar scratch, declared here because the loop bodies are macro arguments */
		/*
		 * Staging area for 64-bit constants loaded into MMX registers.
		 * NOTE(review): it is a byte array written through a Uint64
		 * pointer, which relies on the compiler tolerating the type
		 * pun and on the array being suitably aligned - confirm.
		 */
		Uint8 load[8];

		alpha &= ~(1+2+4);		/* cut alpha to get the exact same behaviour */
		*(Uint64 *)load = alpha;
		alpha >>= 3;		/* downscale alpha to 5 bits */

		movq_m2r(*load, mm0); /* alpha(0000000A) -> mm0 */
		punpcklwd_r2r(mm0, mm0); /* 00000A0A -> mm0 */
		punpcklwd_r2r(mm0, mm0); /* 0A0A0A0A -> mm0 */
		/* position alpha to allow for mullo and mulhi on diff channels
		   to reduce the number of operations */
		psllq_i2r(3, mm0);

		/* Setup the 565 color channel masks */
		*(Uint64 *)load = 0x07E007E007E007E0ULL;
		movq_m2r(*load, mm4); /* MASKGREEN -> mm4 */
		*(Uint64 *)load = 0x001F001F001F001FULL;
		movq_m2r(*load, mm7); /* MASKBLUE -> mm7 */
		while(height--) {
			DUFFS_LOOP_QUATRO2(
			{
				/* one pixel, scalar */
				s = *srcp++;
				d = *dstp;
				/*
				 * shift out the middle component (green) to
				 * the high 16 bits, and process all three RGB
				 * components at the same time.
				 */
				s = (s | s << 16) & 0x07e0f81f;
				d = (d | d << 16) & 0x07e0f81f;
				d += (s - d) * alpha >> 5;
				d &= 0x07e0f81f;
				*dstp++ = d | d >> 16;
			},{
				/* two pixels, scalar: the one-pixel code twice */
				s = *srcp++;
				d = *dstp;
				/*
				 * shift out the middle component (green) to
				 * the high 16 bits, and process all three RGB
				 * components at the same time.
				 */
				s = (s | s << 16) & 0x07e0f81f;
				d = (d | d << 16) & 0x07e0f81f;
				d += (s - d) * alpha >> 5;
				d &= 0x07e0f81f;
				*dstp++ = d | d >> 16;
				s = *srcp++;
				d = *dstp;
				/*
				 * shift out the middle component (green) to
				 * the high 16 bits, and process all three RGB
				 * components at the same time.
				 */
				s = (s | s << 16) & 0x07e0f81f;
				d = (d | d << 16) & 0x07e0f81f;
				d += (s - d) * alpha >> 5;
				d &= 0x07e0f81f;
				*dstp++ = d | d >> 16;
			},{
				/* four pixels, MMX */
				movq_m2r((*srcp), mm2);/* 4 src pixels -> mm2 */
				movq_m2r((*dstp), mm3);/* 4 dst pixels -> mm3 */

				/* red -- does not need a mask since the right shift clears
				   the uninteresting bits */
				movq_r2r(mm2, mm5); /* src -> mm5 */
				movq_r2r(mm3, mm6); /* dst -> mm6 */
				psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 [000r 000r 000r 000r] */
				psrlw_i2r(11, mm6); /* mm6 >> 11 -> mm6 [000r 000r 000r 000r] */

				/* blend */
				psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
				pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
				/* alpha used is actually 11 bits
				   11 + 5 = 16 bits, so the sign bits are lost */
				psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 */
				paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
				psllw_i2r(11, mm6); /* mm6 << 11 -> mm6 */

				movq_r2r(mm6, mm1); /* save new reds in dsts */

				/* green -- process the bits in place */
				movq_r2r(mm2, mm5); /* src -> mm5 */
				movq_r2r(mm3, mm6); /* dst -> mm6 */
				pand_r2r(mm4, mm5); /* src & MASKGREEN -> mm5 */
				pand_r2r(mm4, mm6); /* dst & MASKGREEN -> mm6 */

				/* blend */
				psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
				pmulhw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
				/* 11 + 11 - 16 = 6 bits, so all the lower uninteresting
				   bits are gone and the sign bits present */
				psllw_i2r(5, mm5); /* mm5 << 5 -> mm5 */
				paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */

				por_r2r(mm6, mm1); /* save new greens in dsts */

				/* blue */
				movq_r2r(mm2, mm5); /* src -> mm5 */
				movq_r2r(mm3, mm6); /* dst -> mm6 */
				pand_r2r(mm7, mm5); /* src & MASKBLUE -> mm5[000b 000b 000b 000b] */
				pand_r2r(mm7, mm6); /* dst & MASKBLUE -> mm6[000b 000b 000b 000b] */

				/* blend */
				psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
				pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
				/* 11 + 5 = 16 bits, so the sign bits are lost and
				   the interesting bits will need to be MASKed */
				psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 */
				paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
				pand_r2r(mm7, mm6); /* mm6 & MASKBLUE -> mm6[000b 000b 000b 000b] */

				por_r2r(mm6, mm1); /* save new blues in dsts */

				movq_r2m(mm1, *dstp); /* mm1 -> 4 dst pixels */

				srcp += 4;
				dstp += 4;
			}, width);
			/* move both pointers on to the next row */
			srcp += srcskip;
			dstp += dstskip;
		}
		/* done with the MMX registers */
		emms();
	}
}
2010
/*
 * fast RGB555->RGB555 blending with surface alpha
 * (GCC build: MMX through the register macros from mmx.h)
 *
 * info: blit description.  info->src->alpha is the per-surface alpha;
 *       d_width/d_height give the area in pixels; s_skip/d_skip are halved
 *       to step the Uint16 pointers between rows.
 *
 * alpha == 128 is forwarded to Blit16to16SurfaceAlpha128().  Otherwise
 * DUFFS_LOOP_QUATRO2 is given three bodies which blend one, two and four
 * pixels respectively; the four-pixel body is the MMX one.
 *
 * MMX register use in the four-pixel body:
 *   mm0 alpha multiplier   mm4 green mask (temporarily shifted into the
 *   red mask)              mm7 blue mask
 *   mm2 source pixels      mm3 destination pixels
 *   mm5/mm6 scratch        mm1 accumulated result
 */
static void Blit555to555SurfaceAlphaMMX(SDL_BlitInfo *info)
{
	unsigned alpha = info->src->alpha; /* per-surface alpha; reduced to 5 bits further down */
	if(alpha == 128) {
		/* exactly 50%: use the dedicated two-pixels-at-a-time blitter */
		Blit16to16SurfaceAlpha128(info, 0xfbde);
	} else {
		int width = info->d_width;
		int height = info->d_height;
		Uint16 *srcp = (Uint16 *)info->s_pixels;
		int srcskip = info->s_skip >> 1;
		Uint16 *dstp = (Uint16 *)info->d_pixels;
		int dstskip = info->d_skip >> 1;
		Uint32 s, d;	/* scalar scratch, declared here because the loop bodies are macro arguments */
		/*
		 * Staging area for 64-bit constants loaded into MMX registers.
		 * NOTE(review): it is a byte array written through a Uint64
		 * pointer, which relies on the compiler tolerating the type
		 * pun and on the array being suitably aligned - confirm.
		 */
		Uint8 load[8];

		alpha &= ~(1+2+4);		/* cut alpha to get the exact same behaviour */
		*(Uint64 *)load = alpha;
		alpha >>= 3;		/* downscale alpha to 5 bits */

		movq_m2r(*load, mm0); /* alpha(0000000A) -> mm0 */
		punpcklwd_r2r(mm0, mm0); /* 00000A0A -> mm0 */
		punpcklwd_r2r(mm0, mm0); /* 0A0A0A0A -> mm0 */
		/* position alpha to allow for mullo and mulhi on diff channels
		   to reduce the number of operations */
		psllq_i2r(3, mm0);

		/* Setup the 555 color channel masks */
		*(Uint64 *)load = 0x03E003E003E003E0ULL;
		movq_m2r(*load, mm4); /* MASKGREEN -> mm4 */
		*(Uint64 *)load = 0x001F001F001F001FULL;
		movq_m2r(*load, mm7); /* MASKBLUE -> mm7 */
		while(height--) {
			DUFFS_LOOP_QUATRO2(
			{
				/* one pixel, scalar */
				s = *srcp++;
				d = *dstp;
				/*
				 * shift out the middle component (green) to
				 * the high 16 bits, and process all three RGB
				 * components at the same time.
				 */
				s = (s | s << 16) & 0x03e07c1f;
				d = (d | d << 16) & 0x03e07c1f;
				d += (s - d) * alpha >> 5;
				d &= 0x03e07c1f;
				*dstp++ = d | d >> 16;
			},{
				/* two pixels, scalar: the one-pixel code twice */
				s = *srcp++;
				d = *dstp;
				/*
				 * shift out the middle component (green) to
				 * the high 16 bits, and process all three RGB
				 * components at the same time.
				 */
				s = (s | s << 16) & 0x03e07c1f;
				d = (d | d << 16) & 0x03e07c1f;
				d += (s - d) * alpha >> 5;
				d &= 0x03e07c1f;
				*dstp++ = d | d >> 16;
			        s = *srcp++;
				d = *dstp;
				/*
				 * shift out the middle component (green) to
				 * the high 16 bits, and process all three RGB
				 * components at the same time.
				 */
				s = (s | s << 16) & 0x03e07c1f;
				d = (d | d << 16) & 0x03e07c1f;
				d += (s - d) * alpha >> 5;
				d &= 0x03e07c1f;
				*dstp++ = d | d >> 16;
			},{
				/* four pixels, MMX */
				movq_m2r((*srcp), mm2);/* 4 src pixels -> mm2 */
				movq_m2r((*dstp), mm3);/* 4 dst pixels -> mm3 */

				/* red -- process the bits in place */
				psllq_i2r(5, mm4); /* turn MASKGREEN into MASKRED */
					/* by reusing the GREEN mask we free up another mmx
					   register to accumulate the result */

				movq_r2r(mm2, mm5); /* src -> mm5 */
				movq_r2r(mm3, mm6); /* dst -> mm6 */
				pand_r2r(mm4, mm5); /* src & MASKRED -> mm5 */
				pand_r2r(mm4, mm6); /* dst & MASKRED -> mm6 */

				/* blend */
				psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
				pmulhw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
				/* 11 + 15 - 16 = 10 bits, uninteresting bits will be
				   cleared by a MASK below */
				psllw_i2r(5, mm5); /* mm5 << 5 -> mm5 */
				paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
				pand_r2r(mm4, mm6); /* mm6 & MASKRED -> mm6 */

				psrlq_i2r(5, mm4); /* turn MASKRED back into MASKGREEN */

				movq_r2r(mm6, mm1); /* save new reds in dsts */

				/* green -- process the bits in place */
				movq_r2r(mm2, mm5); /* src -> mm5 */
				movq_r2r(mm3, mm6); /* dst -> mm6 */
				pand_r2r(mm4, mm5); /* src & MASKGREEN -> mm5 */
				pand_r2r(mm4, mm6); /* dst & MASKGREEN -> mm6 */

				/* blend */
				psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
				pmulhw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
				/* 11 + 10 - 16 = 5 bits,  so all the lower uninteresting
				   bits are gone and the sign bits present */
				psllw_i2r(5, mm5); /* mm5 << 5 -> mm5 */
				paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */

				por_r2r(mm6, mm1); /* save new greens in dsts */

				/* blue */
				movq_r2r(mm2, mm5); /* src -> mm5 */
				movq_r2r(mm3, mm6); /* dst -> mm6 */
				pand_r2r(mm7, mm5); /* src & MASKBLUE -> mm5[000b 000b 000b 000b] */
				pand_r2r(mm7, mm6); /* dst & MASKBLUE -> mm6[000b 000b 000b 000b] */

				/* blend */
				psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
				pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
				/* 11 + 5 = 16 bits, so the sign bits are lost and
				   the interesting bits will need to be MASKed */
				psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 */
				paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
				pand_r2r(mm7, mm6); /* mm6 & MASKBLUE -> mm6[000b 000b 000b 000b] */

				por_r2r(mm6, mm1); /* save new blues in dsts */

				movq_r2m(mm1, *dstp);/* mm1 -> 4 dst pixels */

				srcp += 4;
				dstp += 4;
			}, width);
			/* move both pointers on to the next row */
			srcp += srcskip;
			dstp += dstskip;
		}
		/* done with the MMX registers */
		emms();
	}
}
2154/* End GCC_ASMBLIT */
2155
2156#elif MSVC_ASMBLIT
2157/* fast RGB565->RGB565 blending with surface alpha */
2158static void Blit565to565SurfaceAlphaMMX(SDL_BlitInfo *info)
2159{
2160	unsigned alpha = info->src->alpha;
2161	if(alpha == 128) {
2162		Blit16to16SurfaceAlpha128(info, 0xf7de);
2163	} else {
2164		int width = info->d_width;
2165		int height = info->d_height;
2166		Uint16 *srcp = (Uint16 *)info->s_pixels;
2167		int srcskip = info->s_skip >> 1;
2168		Uint16 *dstp = (Uint16 *)info->d_pixels;
2169		int dstskip = info->d_skip >> 1;
2170		Uint32 s, d;
2171
2172		__m64 src1, dst1, src2, dst2, gmask, bmask, mm_res, mm_alpha;
2173
2174		alpha &= ~(1+2+4);		/* cut alpha to get the exact same behaviour */
2175		mm_alpha = _mm_set_pi32(0, alpha); /* 0000000A -> mm_alpha */
2176		alpha >>= 3;		/* downscale alpha to 5 bits */
2177
2178		mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
2179		mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
2180		/* position alpha to allow for mullo and mulhi on diff channels
2181		   to reduce the number of operations */
2182		mm_alpha = _mm_slli_si64(mm_alpha, 3);
2183
2184		/* Setup the 565 color channel masks */
2185		gmask = _mm_set_pi32(0x07E007E0, 0x07E007E0); /* MASKGREEN -> gmask */
2186		bmask = _mm_set_pi32(0x001F001F, 0x001F001F); /* MASKBLUE -> bmask */
2187
2188		while(height--) {
2189			DUFFS_LOOP_QUATRO2(
2190			{
2191				s = *srcp++;
2192				d = *dstp;
2193				/*
2194				 * shift out the middle component (green) to
2195				 * the high 16 bits, and process all three RGB
2196				 * components at the same time.
2197				 */
2198				s = (s | s << 16) & 0x07e0f81f;
2199				d = (d | d << 16) & 0x07e0f81f;
2200				d += (s - d) * alpha >> 5;
2201				d &= 0x07e0f81f;
2202				*dstp++ = (Uint16)(d | d >> 16);
2203			},{
2204				s = *srcp++;
2205				d = *dstp;
2206				/*
2207				 * shift out the middle component (green) to
2208				 * the high 16 bits, and process all three RGB
2209				 * components at the same time.
2210				 */
2211				s = (s | s << 16) & 0x07e0f81f;
2212				d = (d | d << 16) & 0x07e0f81f;
2213				d += (s - d) * alpha >> 5;
2214				d &= 0x07e0f81f;
2215				*dstp++ = (Uint16)(d | d >> 16);
2216				s = *srcp++;
2217				d = *dstp;
2218				/*
2219				 * shift out the middle component (green) to
2220				 * the high 16 bits, and process all three RGB
2221				 * components at the same time.
2222				 */
2223				s = (s | s << 16) & 0x07e0f81f;
2224				d = (d | d << 16) & 0x07e0f81f;
2225				d += (s - d) * alpha >> 5;
2226				d &= 0x07e0f81f;
2227				*dstp++ = (Uint16)(d | d >> 16);
2228			},{
2229				src1 = *(__m64*)srcp; /* 4 src pixels -> src1 */
2230				dst1 = *(__m64*)dstp; /* 4 dst pixels -> dst1 */
2231
2232				/* red */
2233				src2 = src1;
2234				src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 [000r 000r 000r 000r] */
2235
2236				dst2 = dst1;
2237				dst2 = _mm_srli_pi16(dst2, 11); /* dst2 >> 11 -> dst2 [000r 000r 000r 000r] */
2238
2239				/* blend */
2240				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
2241				src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
2242				src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
2243				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
2244				dst2 = _mm_slli_pi16(dst2, 11); /* dst2 << 11 -> dst2 */
2245
2246				mm_res = dst2; /* RED -> mm_res */
2247
2248				/* green -- process the bits in place */
2249				src2 = src1;
2250				src2 = _mm_and_si64(src2, gmask); /* src & MASKGREEN -> src2 */
2251
2252				dst2 = dst1;
2253				dst2 = _mm_and_si64(dst2, gmask); /* dst & MASKGREEN -> dst2 */
2254
2255				/* blend */
2256				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
2257				src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
2258				src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
2259				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
2260
2261				mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN -> mm_res */
2262
2263				/* blue */
2264				src2 = src1;
2265				src2 = _mm_and_si64(src2, bmask); /* src & MASKBLUE -> src2[000b 000b 000b 000b] */
2266
2267				dst2 = dst1;
2268				dst2 = _mm_and_si64(dst2, bmask); /* dst & MASKBLUE -> dst2[000b 000b 000b 000b] */
2269
2270				/* blend */
2271				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
2272				src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
2273				src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
2274				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
2275				dst2 = _mm_and_si64(dst2, bmask); /* dst2 & MASKBLUE -> dst2 */
2276
2277				mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN | BLUE -> mm_res */
2278
2279				*(__m64*)dstp = mm_res; /* mm_res -> 4 dst pixels */
2280
2281				srcp += 4;
2282				dstp += 4;
2283			}, width);
2284			srcp += srcskip;
2285			dstp += dstskip;
2286		}
2287		_mm_empty();
2288	}
2289}
2290
2291/* fast RGB555->RGB555 blending with surface alpha */
2292static void Blit555to555SurfaceAlphaMMX(SDL_BlitInfo *info)
2293{
2294	unsigned alpha = info->src->alpha;
2295	if(alpha == 128) {
2296		Blit16to16SurfaceAlpha128(info, 0xfbde);
2297	} else {
2298		int width = info->d_width;
2299		int height = info->d_height;
2300		Uint16 *srcp = (Uint16 *)info->s_pixels;
2301		int srcskip = info->s_skip >> 1;
2302		Uint16 *dstp = (Uint16 *)info->d_pixels;
2303		int dstskip = info->d_skip >> 1;
2304		Uint32 s, d;
2305
2306		__m64 src1, dst1, src2, dst2, rmask, gmask, bmask, mm_res, mm_alpha;
2307
2308		alpha &= ~(1+2+4);		/* cut alpha to get the exact same behaviour */
2309		mm_alpha = _mm_set_pi32(0, alpha); /* 0000000A -> mm_alpha */
2310		alpha >>= 3;		/* downscale alpha to 5 bits */
2311
2312		mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
2313		mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
2314		/* position alpha to allow for mullo and mulhi on diff channels
2315		   to reduce the number of operations */
2316		mm_alpha = _mm_slli_si64(mm_alpha, 3);
2317
2318		/* Setup the 555 color channel masks */
2319		rmask = _mm_set_pi32(0x7C007C00, 0x7C007C00); /* MASKRED -> rmask */
2320		gmask = _mm_set_pi32(0x03E003E0, 0x03E003E0); /* MASKGREEN -> gmask */
2321		bmask = _mm_set_pi32(0x001F001F, 0x001F001F); /* MASKBLUE -> bmask */
2322
2323		while(height--) {
2324			DUFFS_LOOP_QUATRO2(
2325			{
2326				s = *srcp++;
2327				d = *dstp;
2328				/*
2329				 * shift out the middle component (green) to
2330				 * the high 16 bits, and process all three RGB
2331				 * components at the same time.
2332				 */
2333				s = (s | s << 16) & 0x03e07c1f;
2334				d = (d | d << 16) & 0x03e07c1f;
2335				d += (s - d) * alpha >> 5;
2336				d &= 0x03e07c1f;
2337				*dstp++ = (Uint16)(d | d >> 16);
2338			},{
2339				s = *srcp++;
2340				d = *dstp;
2341				/*
2342				 * shift out the middle component (green) to
2343				 * the high 16 bits, and process all three RGB
2344				 * components at the same time.
2345				 */
2346				s = (s | s << 16) & 0x03e07c1f;
2347				d = (d | d << 16) & 0x03e07c1f;
2348				d += (s - d) * alpha >> 5;
2349				d &= 0x03e07c1f;
2350				*dstp++ = (Uint16)(d | d >> 16);
2351			        s = *srcp++;
2352				d = *dstp;
2353				/*
2354				 * shift out the middle component (green) to
2355				 * the high 16 bits, and process all three RGB
2356				 * components at the same time.
2357				 */
2358				s = (s | s << 16) & 0x03e07c1f;
2359				d = (d | d << 16) & 0x03e07c1f;
2360				d += (s - d) * alpha >> 5;
2361				d &= 0x03e07c1f;
2362				*dstp++ = (Uint16)(d | d >> 16);
2363			},{
2364				src1 = *(__m64*)srcp; /* 4 src pixels -> src1 */
2365				dst1 = *(__m64*)dstp; /* 4 dst pixels -> dst1 */
2366
2367				/* red -- process the bits in place */
2368				src2 = src1;
2369				src2 = _mm_and_si64(src2, rmask); /* src & MASKRED -> src2 */
2370
2371				dst2 = dst1;
2372				dst2 = _mm_and_si64(dst2, rmask); /* dst & MASKRED -> dst2 */
2373
2374				/* blend */
2375				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
2376				src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
2377				src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
2378				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
2379				dst2 = _mm_and_si64(dst2, rmask); /* dst2 & MASKRED -> dst2 */
2380
2381				mm_res = dst2; /* RED -> mm_res */
2382
2383				/* green -- process the bits in place */
2384				src2 = src1;
2385				src2 = _mm_and_si64(src2, gmask); /* src & MASKGREEN -> src2 */
2386
2387				dst2 = dst1;
2388				dst2 = _mm_and_si64(dst2, gmask); /* dst & MASKGREEN -> dst2 */
2389
2390				/* blend */
2391				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
2392				src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
2393				src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
2394				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
2395
2396				mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN -> mm_res */
2397
2398				/* blue */
2399				src2 = src1; /* src -> src2 */
2400				src2 = _mm_and_si64(src2, bmask); /* src & MASKBLUE -> src2[000b 000b 000b 000b] */
2401
2402				dst2 = dst1; /* dst -> dst2 */
2403				dst2 = _mm_and_si64(dst2, bmask); /* dst & MASKBLUE -> dst2[000b 000b 000b 000b] */
2404
2405				/* blend */
2406				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
2407				src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
2408				src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
2409				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
2410				dst2 = _mm_and_si64(dst2, bmask); /* dst2 & MASKBLUE -> dst2 */
2411
2412				mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN | BLUE -> mm_res */
2413
2414				*(__m64*)dstp = mm_res; /* mm_res -> 4 dst pixels */
2415
2416				srcp += 4;
2417				dstp += 4;
2418			}, width);
2419			srcp += srcskip;
2420			dstp += dstskip;
2421		}
2422		_mm_empty();
2423	}
2424}
2425#endif /* GCC_ASMBLIT, MSVC_ASMBLIT */
2426
2427/* fast RGB565->RGB565 blending with surface alpha */
2428static void Blit565to565SurfaceAlpha(SDL_BlitInfo *info)
2429{
2430	unsigned alpha = info->src->alpha;
2431	if(alpha == 128) {
2432		Blit16to16SurfaceAlpha128(info, 0xf7de);
2433	} else {
2434		int width = info->d_width;
2435		int height = info->d_height;
2436		Uint16 *srcp = (Uint16 *)info->s_pixels;
2437		int srcskip = info->s_skip >> 1;
2438		Uint16 *dstp = (Uint16 *)info->d_pixels;
2439		int dstskip = info->d_skip >> 1;
2440		alpha >>= 3;	/* downscale alpha to 5 bits */
2441
2442		while(height--) {
2443			DUFFS_LOOP4({
2444				Uint32 s = *srcp++;
2445				Uint32 d = *dstp;
2446				/*
2447				 * shift out the middle component (green) to
2448				 * the high 16 bits, and process all three RGB
2449				 * components at the same time.
2450				 */
2451				s = (s | s << 16) & 0x07e0f81f;
2452				d = (d | d << 16) & 0x07e0f81f;
2453				d += (s - d) * alpha >> 5;
2454				d &= 0x07e0f81f;
2455				*dstp++ = (Uint16)(d | d >> 16);
2456			}, width);
2457			srcp += srcskip;
2458			dstp += dstskip;
2459		}
2460	}
2461}
2462
2463/* fast RGB555->RGB555 blending with surface alpha */
2464static void Blit555to555SurfaceAlpha(SDL_BlitInfo *info)
2465{
2466	unsigned alpha = info->src->alpha; /* downscale alpha to 5 bits */
2467	if(alpha == 128) {
2468		Blit16to16SurfaceAlpha128(info, 0xfbde);
2469	} else {
2470		int width = info->d_width;
2471		int height = info->d_height;
2472		Uint16 *srcp = (Uint16 *)info->s_pixels;
2473		int srcskip = info->s_skip >> 1;
2474		Uint16 *dstp = (Uint16 *)info->d_pixels;
2475		int dstskip = info->d_skip >> 1;
2476		alpha >>= 3;		/* downscale alpha to 5 bits */
2477
2478		while(height--) {
2479			DUFFS_LOOP4({
2480				Uint32 s = *srcp++;
2481				Uint32 d = *dstp;
2482				/*
2483				 * shift out the middle component (green) to
2484				 * the high 16 bits, and process all three RGB
2485				 * components at the same time.
2486				 */
2487				s = (s | s << 16) & 0x03e07c1f;
2488				d = (d | d << 16) & 0x03e07c1f;
2489				d += (s - d) * alpha >> 5;
2490				d &= 0x03e07c1f;
2491				*dstp++ = (Uint16)(d | d >> 16);
2492			}, width);
2493			srcp += srcskip;
2494			dstp += dstskip;
2495		}
2496	}
2497}
2498
2499/* fast ARGB8888->RGB565 blending with pixel alpha */
2500static void BlitARGBto565PixelAlpha(SDL_BlitInfo *info)
2501{
2502	int width = info->d_width;
2503	int height = info->d_height;
2504	Uint32 *srcp = (Uint32 *)info->s_pixels;
2505	int srcskip = info->s_skip >> 2;
2506	Uint16 *dstp = (Uint16 *)info->d_pixels;
2507	int dstskip = info->d_skip >> 1;
2508
2509	while(height--) {
2510	    DUFFS_LOOP4({
2511		Uint32 s = *srcp;
2512		unsigned alpha = s >> 27; /* downscale alpha to 5 bits */
2513		/* FIXME: Here we special-case opaque alpha since the
2514		   compositioning used (>>8 instead of /255) doesn't handle
2515		   it correctly. Also special-case alpha=0 for speed?
2516		   Benchmark this! */
2517		if(alpha) {
2518		  if(alpha == (SDL_ALPHA_OPAQUE >> 3)) {
2519		    *dstp = (Uint16)((s >> 8 & 0xf800) + (s >> 5 & 0x7e0) + (s >> 3  & 0x1f));
2520		  } else {
2521		    Uint32 d = *dstp;
2522		    /*
2523		     * convert source and destination to G0RAB65565
2524		     * and blend all components at the same time
2525		     */
2526		    s = ((s & 0xfc00) << 11) + (s >> 8 & 0xf800)
2527		      + (s >> 3 & 0x1f);
2528		    d = (d | d << 16) & 0x07e0f81f;
2529		    d += (s - d) * alpha >> 5;
2530		    d &= 0x07e0f81f;
2531		    *dstp = (Uint16)(d | d >> 16);
2532		  }
2533		}
2534		srcp++;
2535		dstp++;
2536	    }, width);
2537	    srcp += srcskip;
2538	    dstp += dstskip;
2539	}
2540}
2541
2542/* fast ARGB8888->RGB555 blending with pixel alpha */
2543static void BlitARGBto555PixelAlpha(SDL_BlitInfo *info)
2544{
2545	int width = info->d_width;
2546	int height = info->d_height;
2547	Uint32 *srcp = (Uint32 *)info->s_pixels;
2548	int srcskip = info->s_skip >> 2;
2549	Uint16 *dstp = (Uint16 *)info->d_pixels;
2550	int dstskip = info->d_skip >> 1;
2551
2552	while(height--) {
2553	    DUFFS_LOOP4({
2554		unsigned alpha;
2555		Uint32 s = *srcp;
2556		alpha = s >> 27; /* downscale alpha to 5 bits */
2557		/* FIXME: Here we special-case opaque alpha since the
2558		   compositioning used (>>8 instead of /255) doesn't handle
2559		   it correctly. Also special-case alpha=0 for speed?
2560		   Benchmark this! */
2561		if(alpha) {
2562		  if(alpha == (SDL_ALPHA_OPAQUE >> 3)) {
2563		    *dstp = (Uint16)((s >> 9 & 0x7c00) + (s >> 6 & 0x3e0) + (s >> 3  & 0x1f));
2564		  } else {
2565		    Uint32 d = *dstp;
2566		    /*
2567		     * convert source and destination to G0RAB65565
2568		     * and blend all components at the same time
2569		     */
2570		    s = ((s & 0xf800) << 10) + (s >> 9 & 0x7c00)
2571		      + (s >> 3 & 0x1f);
2572		    d = (d | d << 16) & 0x03e07c1f;
2573		    d += (s - d) * alpha >> 5;
2574		    d &= 0x03e07c1f;
2575		    *dstp = (Uint16)(d | d >> 16);
2576		  }
2577		}
2578		srcp++;
2579		dstp++;
2580	    }, width);
2581	    srcp += srcskip;
2582	    dstp += dstskip;
2583	}
2584}
2585
2586/* General (slow) N->N blending with per-surface alpha */
2587static void BlitNtoNSurfaceAlpha(SDL_BlitInfo *info)
2588{
2589	int width = info->d_width;
2590	int height = info->d_height;
2591	Uint8 *src = info->s_pixels;
2592	int srcskip = info->s_skip;
2593	Uint8 *dst = info->d_pixels;
2594	int dstskip = info->d_skip;
2595	SDL_PixelFormat *srcfmt = info->src;
2596	SDL_PixelFormat *dstfmt = info->dst;
2597	int srcbpp = srcfmt->BytesPerPixel;
2598	int dstbpp = dstfmt->BytesPerPixel;
2599	unsigned sA = srcfmt->alpha;
2600	unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
2601
2602	if(sA) {
2603	  while ( height-- ) {
2604	    DUFFS_LOOP4(
2605	    {
2606		Uint32 Pixel;
2607		unsigned sR;
2608		unsigned sG;
2609		unsigned sB;
2610		unsigned dR;
2611		unsigned dG;
2612		unsigned dB;
2613		DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
2614		DISEMBLE_RGB(dst, dstbpp, dstfmt, Pixel, dR, dG, dB);
2615		ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
2616		ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
2617		src += srcbpp;
2618		dst += dstbpp;
2619	    },
2620	    width);
2621	    src += srcskip;
2622	    dst += dstskip;
2623	  }
2624	}
2625}
2626
2627/* General (slow) colorkeyed N->N blending with per-surface alpha */
2628static void BlitNtoNSurfaceAlphaKey(SDL_BlitInfo *info)
2629{
2630	int width = info->d_width;
2631	int height = info->d_height;
2632	Uint8 *src = info->s_pixels;
2633	int srcskip = info->s_skip;
2634	Uint8 *dst = info->d_pixels;
2635	int dstskip = info->d_skip;
2636	SDL_PixelFormat *srcfmt = info->src;
2637	SDL_PixelFormat *dstfmt = info->dst;
2638	Uint32 ckey = srcfmt->colorkey;
2639	int srcbpp = srcfmt->BytesPerPixel;
2640	int dstbpp = dstfmt->BytesPerPixel;
2641	unsigned sA = srcfmt->alpha;
2642	unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
2643
2644	while ( height-- ) {
2645	    DUFFS_LOOP4(
2646	    {
2647		Uint32 Pixel;
2648		unsigned sR;
2649		unsigned sG;
2650		unsigned sB;
2651		unsigned dR;
2652		unsigned dG;
2653		unsigned dB;
2654		RETRIEVE_RGB_PIXEL(src, srcbpp, Pixel);
2655		if(sA && Pixel != ckey) {
2656		    RGB_FROM_PIXEL(Pixel, srcfmt, sR, sG, sB);
2657		    DISEMBLE_RGB(dst, dstbpp, dstfmt, Pixel, dR, dG, dB);
2658		    ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
2659		    ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
2660		}
2661		src += srcbpp;
2662		dst += dstbpp;
2663	    },
2664	    width);
2665	    src += srcskip;
2666	    dst += dstskip;
2667	}
2668}
2669
2670/* General (slow) N->N blending with pixel alpha */
2671static void BlitNtoNPixelAlpha(SDL_BlitInfo *info)
2672{
2673	int width = info->d_width;
2674	int height = info->d_height;
2675	Uint8 *src = info->s_pixels;
2676	int srcskip = info->s_skip;
2677	Uint8 *dst = info->d_pixels;
2678	int dstskip = info->d_skip;
2679	SDL_PixelFormat *srcfmt = info->src;
2680	SDL_PixelFormat *dstfmt = info->dst;
2681
2682	int  srcbpp;
2683	int  dstbpp;
2684
2685	/* Set up some basic variables */
2686	srcbpp = srcfmt->BytesPerPixel;
2687	dstbpp = dstfmt->BytesPerPixel;
2688
2689	/* FIXME: for 8bpp source alpha, this doesn't get opaque values
2690	   quite right. for <8bpp source alpha, it gets them very wrong
2691	   (check all macros!)
2692	   It is unclear whether there is a good general solution that doesn't
2693	   need a branch (or a divide). */
2694	while ( height-- ) {
2695	    DUFFS_LOOP4(
2696	    {
2697		Uint32 Pixel;
2698		unsigned sR;
2699		unsigned sG;
2700		unsigned sB;
2701		unsigned dR;
2702		unsigned dG;
2703		unsigned dB;
2704		unsigned sA;
2705		unsigned dA;
2706		DISEMBLE_RGBA(src, srcbpp, srcfmt, Pixel, sR, sG, sB, sA);
2707		if(sA) {
2708		  DISEMBLE_RGBA(dst, dstbpp, dstfmt, Pixel, dR, dG, dB, dA);
2709		  ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
2710		  ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
2711		}
2712		src += srcbpp;
2713		dst += dstbpp;
2714	    },
2715	    width);
2716	    src += srcskip;
2717	    dst += dstskip;
2718	}
2719}
2720
2721
2722SDL_loblit SDL_CalculateAlphaBlit(SDL_Surface *surface, int blit_index)
2723{
2724    SDL_PixelFormat *sf = surface->format;
2725    SDL_PixelFormat *df = surface->map->dst->format;
2726
2727    if(sf->Amask == 0) {
2728	if((surface->flags & SDL_SRCCOLORKEY) == SDL_SRCCOLORKEY) {
2729	    if(df->BytesPerPixel == 1)
2730		return BlitNto1SurfaceAlphaKey;
2731	    else
2732#if SDL_ALTIVEC_BLITTERS
2733	if (sf->BytesPerPixel == 4 && df->BytesPerPixel == 4 &&
2734	    !(surface->map->dst->flags & SDL_HWSURFACE) && SDL_HasAltiVec())
2735            return Blit32to32SurfaceAlphaKeyAltivec;
2736        else
2737#endif
2738            return BlitNtoNSurfaceAlphaKey;
2739	} else {
2740	    /* Per-surface alpha blits */
2741	    switch(df->BytesPerPixel) {
2742	    case 1:
2743		return BlitNto1SurfaceAlpha;
2744
2745	    case 2:
2746		if(surface->map->identity) {
2747		    if(df->Gmask == 0x7e0)
2748		    {
2749#if MMX_ASMBLIT
2750		if(SDL_HasMMX())
2751			return Blit565to565SurfaceAlphaMMX;
2752		else
2753#endif
2754			return Blit565to565SurfaceAlpha;
2755		    }
2756		    else if(df->Gmask == 0x3e0)
2757		    {
2758#if MMX_ASMBLIT
2759		if(SDL_HasMMX())
2760			return Blit555to555SurfaceAlphaMMX;
2761		else
2762#endif
2763			return Blit555to555SurfaceAlpha;
2764		    }
2765		}
2766		return BlitNtoNSurfaceAlpha;
2767
2768	    case 4:
2769		if(sf->Rmask == df->Rmask
2770		   && sf->Gmask == df->Gmask
2771		   && sf->Bmask == df->Bmask
2772		   && sf->BytesPerPixel == 4)
2773		{
2774#if MMX_ASMBLIT
2775			if(sf->Rshift % 8 == 0
2776			   && sf->Gshift % 8 == 0
2777			   && sf->Bshift % 8 == 0
2778			   && SDL_HasMMX())
2779			    return BlitRGBtoRGBSurfaceAlphaMMX;
2780#endif
2781			if((sf->Rmask | sf->Gmask | sf->Bmask) == 0xffffff)
2782			{
2783#if SDL_ALTIVEC_BLITTERS
2784				if(!(surface->map->dst->flags & SDL_HWSURFACE)
2785					&& SDL_HasAltiVec())
2786					return BlitRGBtoRGBSurfaceAlphaAltivec;
2787#endif
2788				return BlitRGBtoRGBSurfaceAlpha;
2789			}
2790		}
2791#if SDL_ALTIVEC_BLITTERS
2792		if((sf->BytesPerPixel == 4) &&
2793		   !(surface->map->dst->flags & SDL_HWSURFACE) && SDL_HasAltiVec())
2794			return Blit32to32SurfaceAlphaAltivec;
2795		else
2796#endif
2797			return BlitNtoNSurfaceAlpha;
2798
2799	    case 3:
2800	    default:
2801		return BlitNtoNSurfaceAlpha;
2802	    }
2803	}
2804    } else {
2805	/* Per-pixel alpha blits */
2806	switch(df->BytesPerPixel) {
2807	case 1:
2808	    return BlitNto1PixelAlpha;
2809
2810	case 2:
2811#if SDL_ALTIVEC_BLITTERS
2812	if(sf->BytesPerPixel == 4 && !(surface->map->dst->flags & SDL_HWSURFACE) &&
2813           df->Gmask == 0x7e0 &&
2814	   df->Bmask == 0x1f && SDL_HasAltiVec())
2815            return Blit32to565PixelAlphaAltivec;
2816        else
2817#endif
2818	    if(sf->BytesPerPixel == 4 && sf->Amask == 0xff000000
2819	       && sf->Gmask == 0xff00
2820	       && ((sf->Rmask == 0xff && df->Rmask == 0x1f)
2821		   || (sf->Bmask == 0xff && df->Bmask == 0x1f))) {
2822		if(df->Gmask == 0x7e0)
2823		    return BlitARGBto565PixelAlpha;
2824		else if(df->Gmask == 0x3e0)
2825		    return BlitARGBto555PixelAlpha;
2826	    }
2827	    return BlitNtoNPixelAlpha;
2828
2829	case 4:
2830	    if(sf->Rmask == df->Rmask
2831	       && sf->Gmask == df->Gmask
2832	       && sf->Bmask == df->Bmask
2833	       && sf->BytesPerPixel == 4)
2834	    {
2835#if MMX_ASMBLIT
2836		if(sf->Rshift % 8 == 0
2837		   && sf->Gshift % 8 == 0
2838		   && sf->Bshift % 8 == 0
2839		   && sf->Ashift % 8 == 0
2840		   && sf->Aloss == 0)
2841		{
2842			if(SDL_Has3DNow())
2843				return BlitRGBtoRGBPixelAlphaMMX3DNOW;
2844			if(SDL_HasMMX())
2845				return BlitRGBtoRGBPixelAlphaMMX;
2846		}
2847#endif
2848		if(sf->Amask == 0xff000000)
2849		{
2850#if SDL_ALTIVEC_BLITTERS
2851			if(!(surface->map->dst->flags & SDL_HWSURFACE)
2852				&& SDL_HasAltiVec())
2853				return BlitRGBtoRGBPixelAlphaAltivec;
2854#endif
2855			return BlitRGBtoRGBPixelAlpha;
2856		}
2857	    }
2858#if SDL_ALTIVEC_BLITTERS
2859	    if (sf->Amask && sf->BytesPerPixel == 4 &&
2860	        !(surface->map->dst->flags & SDL_HWSURFACE) && SDL_HasAltiVec())
2861		return Blit32to32PixelAlphaAltivec;
2862	    else
2863#endif
2864		return BlitNtoNPixelAlpha;
2865
2866	case 3:
2867	default:
2868	    return BlitNtoNPixelAlpha;
2869	}
2870    }
2871}
2872
2873