SDL_RLEaccel.c revision 9682c8870b8ff5e4ac2e4c70b759f791c6f38c1f
1/*
2    SDL - Simple DirectMedia Layer
3    Copyright (C) 1997-2012 Sam Lantinga
4
5    This library is free software; you can redistribute it and/or
6    modify it under the terms of the GNU Lesser General Public
7    License as published by the Free Software Foundation; either
8    version 2.1 of the License, or (at your option) any later version.
9
10    This library is distributed in the hope that it will be useful,
11    but WITHOUT ANY WARRANTY; without even the implied warranty of
12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13    Lesser General Public License for more details.
14
15    You should have received a copy of the GNU Lesser General Public
16    License along with this library; if not, write to the Free Software
17    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
18
19    Sam Lantinga
20    slouken@libsdl.org
21*/
22#include "SDL_config.h"
23
24/*
25 * RLE encoding for software colorkey and alpha-channel acceleration
26 *
27 * Original version by Sam Lantinga
28 *
29 * Mattias Engdegård (Yorick): Rewrite. New encoding format, encoder and
30 * decoder. Added per-surface alpha blitter. Added per-pixel alpha
31 * format, encoder and blitter.
32 *
33 * Many thanks to Xark and johns for hints, benchmarks and useful comments
34 * leading to this code.
35 *
36 * Welcome to Macro Mayhem.
37 */
38
39/*
40 * The encoding translates the image data to a stream of segments of the form
41 *
42 * <skip> <run> <data>
43 *
44 * where <skip> is the number of transparent pixels to skip,
45 *       <run>  is the number of opaque pixels to blit,
46 * and   <data> are the pixels themselves.
47 *
48 * This basic structure is used both for colorkeyed surfaces, used for simple
49 * binary transparency and for per-surface alpha blending, and for surfaces
50 * with per-pixel alpha. The details differ, however:
51 *
52 * Encoding of colorkeyed surfaces:
53 *
54 *   Encoded pixels always have the same format as the target surface.
55 *   <skip> and <run> are unsigned 8 bit integers, except for 32 bit depth
56 *   where they are 16 bit. This makes the pixel data aligned at all times.
57 *   Segments never wrap around from one scan line to the next.
58 *
59 *   The end of the sequence is marked by a zero <skip>,<run> pair at the
60 *   beginning of a line.
61 *
62 * Encoding of surfaces with per-pixel alpha:
63 *
64 *   The sequence begins with a struct RLEDestFormat describing the target
65 *   pixel format, to provide reliable un-encoding.
66 *
67 *   Each scan line is encoded twice: First all completely opaque pixels,
68 *   encoded in the target format as described above, and then all
69 *   partially transparent (translucent) pixels (where 1 <= alpha <= 254),
70 *   in the following 32-bit format:
71 *
72 *   For 32-bit targets, each pixel has the target RGB format but with
73 *   the alpha value occupying the highest 8 bits. The <skip> and <run>
74 *   counts are 16 bit.
75 *
76 *   For 16-bit targets, each pixel has the target RGB format, but with
77 *   the middle component (usually green) shifted 16 steps to the left,
78 *   and the hole filled with the 5 most significant bits of the alpha value.
79 *   i.e. if the target has the format         rrrrrggggggbbbbb,
80 *   the encoded pixel will be 00000gggggg00000rrrrr0aaaaabbbbb.
81 *   The <skip> and <run> counts are 8 bit for the opaque lines, 16 bit
82 *   for the translucent lines. Two padding bytes may be inserted
83 *   before each translucent line to keep them 32-bit aligned.
84 *
85 *   The end of the sequence is marked by a zero <skip>,<run> pair at the
86 *   beginning of an opaque line.
87 */
88
89#include "SDL_video.h"
90#include "SDL_sysvideo.h"
91#include "SDL_blit.h"
92#include "SDL_RLEaccel_c.h"
93
94/* Force MMX to 0; this blows up on almost every major compiler now. --ryan. */
95#if 0 && defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__)) && SDL_ASSEMBLY_ROUTINES
96#define MMX_ASMBLIT
97#endif
98
99#ifdef MMX_ASMBLIT
100#include "mmx.h"
101#include "SDL_cpuinfo.h"
102#endif
103
/*
 * Two-argument maximum / minimum, defined only if the platform headers have
 * not already provided them. Each argument appears twice in the expansion,
 * so do not pass expressions with side effects.
 */
104#ifndef MAX
105#define MAX(a, b) ((a) > (b) ? (a) : (b))
106#endif
107#ifndef MIN
108#define MIN(a, b) ((a) < (b) ? (a) : (b))
109#endif
110
/*
 * Copy `len` pixels of `bpp` bytes each from `from` to `to`.
 *
 * For bpp == 4 the pixel count is handed straight to SDL_memcpy4; every
 * other depth goes through SDL_memcpy with a byte count of len * bpp.
 * `bpp` is parenthesized in the comparison so that an expression argument
 * (e.g. `1 + 1`) selects the correct branch.
 */
#define PIXEL_COPY(to, from, len, bpp)			\
do {							\
    if((bpp) == 4) {					\
	SDL_memcpy4(to, from, (size_t)(len));		\
    } else {						\
	SDL_memcpy(to, from, (size_t)(len) * (bpp));	\
    }							\
} while(0)

/*
 * Various colorkey blit methods, for opaque and per-surface alpha
 */

/* Opaque run: a plain pixel copy; the alpha argument is accepted for
   signature compatibility with the ALPHA_BLIT* macros and ignored. */
#define OPAQUE_BLIT(to, from, length, bpp, alpha)	\
    PIXEL_COPY(to, from, length, bpp)
126
127#ifdef MMX_ASMBLIT
128
/*
 * MMX per-surface alpha blend for 32bpp pixels (only compiled when
 * MMX_ASMBLIT is defined).
 *
 * Register setup, as established by the loads below:
 *   mm3 = 0x00FF00FF in both dwords (per-word byte mask)
 *   mm7 = 0xFF000000 in both dwords (top-byte mask)
 *   mm4 = alpha | alpha << 16 in both dwords (alpha in every 16-bit lane)
 *   mm5 = ~mm7 (used to clear the top byte of each result pixel)
 *
 * If `length` is odd, one pixel is blended first with 32-bit loads/stores.
 * The main loop then handles two pixels per iteration with 64-bit
 * loads/stores; note that `i` is decremented twice per pass (once in the
 * for-header, once in the body). emms() is issued at the end.
 */
129#define ALPHA_BLIT32_888MMX(to, from, length, bpp, alpha)	\
130    do {							\
131	Uint32 *srcp = (Uint32 *)(from);			\
132	Uint32 *dstp = (Uint32 *)(to);				\
133        int i = 0x00FF00FF;					\
134        movd_m2r(*(&i), mm3);					\
135        punpckldq_r2r(mm3, mm3);				\
136        i = 0xFF000000;						\
137        movd_m2r(*(&i), mm7);					\
138        punpckldq_r2r(mm7, mm7);				\
139        i = alpha | alpha << 16;				\
140        movd_m2r(*(&i), mm4);					\
141        punpckldq_r2r(mm4, mm4);				\
142	pcmpeqd_r2r(mm5,mm5); /* set mm5 to "1" */		\
143	pxor_r2r(mm7, mm5); /* make clear alpha mask */		\
144        i = length;						\
145	if(i & 1) {						\
146          movd_m2r((*srcp), mm1); /* src -> mm1 */		\
147          punpcklbw_r2r(mm1, mm1);				\
148          pand_r2r(mm3, mm1);					\
149	  movd_m2r((*dstp), mm2); /* dst -> mm2 */		\
150          punpcklbw_r2r(mm2, mm2);				\
151          pand_r2r(mm3, mm2);					\
152	  psubw_r2r(mm2, mm1);					\
153	  pmullw_r2r(mm4, mm1);					\
154	  psrlw_i2r(8, mm1);					\
155	  paddw_r2r(mm1, mm2);					\
156	  pand_r2r(mm3, mm2);					\
157	  packuswb_r2r(mm2, mm2);				\
158	  pand_r2r(mm5, mm2); /* 00000RGB -> mm2 */		\
159	  movd_r2m(mm2, *dstp);					\
160	  ++srcp;						\
161	  ++dstp;						\
162	  i--;							\
163	}							\
164	for(; i > 0; --i) {					\
165          movq_m2r((*srcp), mm0);				\
166	  movq_r2r(mm0, mm1);					\
167          punpcklbw_r2r(mm0, mm0);				\
168	  movq_m2r((*dstp), mm2);				\
169	  punpckhbw_r2r(mm1, mm1);				\
170	  movq_r2r(mm2, mm6);					\
171          pand_r2r(mm3, mm0);					\
172          punpcklbw_r2r(mm2, mm2);				\
173	  pand_r2r(mm3, mm1);					\
174	  punpckhbw_r2r(mm6, mm6);				\
175          pand_r2r(mm3, mm2);					\
176	  psubw_r2r(mm2, mm0);					\
177	  pmullw_r2r(mm4, mm0);					\
178	  pand_r2r(mm3, mm6);					\
179	  psubw_r2r(mm6, mm1);					\
180	  pmullw_r2r(mm4, mm1);					\
181	  psrlw_i2r(8, mm0);					\
182	  paddw_r2r(mm0, mm2);					\
183	  psrlw_i2r(8, mm1);					\
184	  paddw_r2r(mm1, mm6);					\
185	  pand_r2r(mm3, mm2);					\
186	  pand_r2r(mm3, mm6);					\
187	  packuswb_r2r(mm2, mm2);				\
188	  packuswb_r2r(mm6, mm6);				\
189	  psrlq_i2r(32, mm2);					\
190	  psllq_i2r(32, mm6);					\
191	  por_r2r(mm6, mm2);					\
192	  pand_r2r(mm5, mm2); /* 00000RGB -> mm2 */		\
193         movq_r2m(mm2, *dstp);					\
194	  srcp += 2;						\
195	  dstp += 2;						\
196	  i--;							\
197	}							\
198	emms();							\
199    } while(0)
200
/*
 * MMX per-surface alpha blend for 16bpp 5-6-5 pixels (only compiled when
 * MMX_ASMBLIT is defined).
 *
 * Register setup, as established by the loads below:
 *   mm1 = 0xF800, mm4 = 0x07E0, mm7 = 0x001F, each replicated into all four
 *         16-bit lanes (the three component masks)
 *   mm0 = alpha in every 16-bit lane, after its low three bits are cleared
 *
 * NOTE: `alpha &= ~(1+2+4)` writes through the macro argument, so `alpha`
 * must be a modifiable lvalue and the caller's variable is changed.
 *
 * The first (length & 3) pixels are blended one at a time in C using the
 * same packed trick as ALPHA_BLIT16_565 (ALPHA = alpha >> 3). The remaining
 * pixels are processed four per iteration with 64-bit loads/stores; `i` is
 * reduced by four per pass (`--i` in the for-header plus `i -= 3`).
 */
201#define ALPHA_BLIT16_565MMX(to, from, length, bpp, alpha)	\
202    do {						\
203        int i, n = 0;					\
204	Uint16 *srcp = (Uint16 *)(from);		\
205	Uint16 *dstp = (Uint16 *)(to);			\
206        Uint32 ALPHA = 0xF800;				\
207	movd_m2r(*(&ALPHA), mm1);			\
208        punpcklwd_r2r(mm1, mm1);			\
209        punpcklwd_r2r(mm1, mm1);			\
210	ALPHA = 0x07E0;					\
211	movd_m2r(*(&ALPHA), mm4);			\
212        punpcklwd_r2r(mm4, mm4);			\
213        punpcklwd_r2r(mm4, mm4);			\
214	ALPHA = 0x001F;					\
215	movd_m2r(*(&ALPHA), mm7);			\
216        punpcklwd_r2r(mm7, mm7);			\
217        punpcklwd_r2r(mm7, mm7);			\
218	alpha &= ~(1+2+4);				\
219        i = (Uint32)alpha | (Uint32)alpha << 16;	\
220        movd_m2r(*(&i), mm0);				\
221        punpckldq_r2r(mm0, mm0);			\
222        ALPHA = alpha >> 3;				\
223        i = ((int)(length) & 3);			\
224	for(; i > 0; --i) {				\
225	    Uint32 s = *srcp++;				\
226	    Uint32 d = *dstp;				\
227	    s = (s | s << 16) & 0x07e0f81f;		\
228	    d = (d | d << 16) & 0x07e0f81f;		\
229	    d += (s - d) * ALPHA >> 5;			\
230	    d &= 0x07e0f81f;				\
231	    *dstp++ = d | d >> 16;			\
232	    n++;					\
233	}						\
234	i = (int)(length) - n;				\
235	for(; i > 0; --i) {				\
236	  movq_m2r((*dstp), mm3);			\
237	  movq_m2r((*srcp), mm2);			\
238	  movq_r2r(mm2, mm5);				\
239	  pand_r2r(mm1 , mm5);				\
240	  psrlq_i2r(11, mm5);				\
241	  movq_r2r(mm3, mm6);				\
242	  pand_r2r(mm1 , mm6);				\
243	  psrlq_i2r(11, mm6);				\
244	  psubw_r2r(mm6, mm5);				\
245	  pmullw_r2r(mm0, mm5);				\
246	  psrlw_i2r(8, mm5);				\
247	  paddw_r2r(mm5, mm6);				\
248	  psllq_i2r(11, mm6);				\
249	  pand_r2r(mm1, mm6);				\
250	  movq_r2r(mm4, mm5);				\
251	  por_r2r(mm7, mm5);				\
252	  pand_r2r(mm5, mm3);				\
253	  por_r2r(mm6, mm3);				\
254	  movq_r2r(mm2, mm5);				\
255	  pand_r2r(mm4 , mm5);				\
256	  psrlq_i2r(5, mm5);				\
257	  movq_r2r(mm3, mm6);				\
258	  pand_r2r(mm4 , mm6);				\
259	  psrlq_i2r(5, mm6);				\
260	  psubw_r2r(mm6, mm5);				\
261	  pmullw_r2r(mm0, mm5);				\
262	  psrlw_i2r(8, mm5);				\
263	  paddw_r2r(mm5, mm6);				\
264	  psllq_i2r(5, mm6);				\
265	  pand_r2r(mm4, mm6);				\
266	  movq_r2r(mm1, mm5);				\
267	  por_r2r(mm7, mm5);				\
268	  pand_r2r(mm5, mm3);				\
269	  por_r2r(mm6, mm3);				\
270	  movq_r2r(mm2, mm5);				\
271	  pand_r2r(mm7 , mm5);				\
272          movq_r2r(mm3, mm6);				\
273	  pand_r2r(mm7 , mm6);				\
274	  psubw_r2r(mm6, mm5);				\
275	  pmullw_r2r(mm0, mm5);				\
276	  psrlw_i2r(8, mm5);				\
277	  paddw_r2r(mm5, mm6);				\
278	  pand_r2r(mm7, mm6);				\
279	  movq_r2r(mm1, mm5);				\
280	  por_r2r(mm4, mm5);				\
281	  pand_r2r(mm5, mm3);				\
282	  por_r2r(mm6, mm3);				\
283	  movq_r2m(mm3, *dstp);				\
284	  srcp += 4;					\
285	  dstp += 4;					\
286	  i -= 3;					\
287	}						\
288	emms();						\
289    } while(0)
290
/*
 * MMX per-surface alpha blend for 16bpp 5-5-5 pixels (only compiled when
 * MMX_ASMBLIT is defined). Same structure as ALPHA_BLIT16_565MMX with the
 * component masks 0x7C00 / 0x03E0 / 0x001F (mm1 / mm4 / mm7) and a shift of
 * 10 instead of 11 for the top component.
 *
 * NOTE: `alpha &= ~(1+2+4)` writes through the macro argument, so `alpha`
 * must be a modifiable lvalue and the caller's variable is changed.
 *
 * The first (length & 3) pixels are blended one at a time in C; the rest
 * are processed four per iteration (`--i` in the for-header plus `i -= 3`).
 */
291#define ALPHA_BLIT16_555MMX(to, from, length, bpp, alpha)	\
292    do {						\
293        int i, n = 0;					\
294	Uint16 *srcp = (Uint16 *)(from);		\
295	Uint16 *dstp = (Uint16 *)(to);			\
296        Uint32 ALPHA = 0x7C00;				\
297	movd_m2r(*(&ALPHA), mm1);			\
298        punpcklwd_r2r(mm1, mm1);			\
299        punpcklwd_r2r(mm1, mm1);			\
300	ALPHA = 0x03E0;					\
301        movd_m2r(*(&ALPHA), mm4);			\
302        punpcklwd_r2r(mm4, mm4);			\
303        punpcklwd_r2r(mm4, mm4);			\
304	ALPHA = 0x001F;					\
305	movd_m2r(*(&ALPHA), mm7);			\
306        punpcklwd_r2r(mm7, mm7);			\
307        punpcklwd_r2r(mm7, mm7);			\
308	alpha &= ~(1+2+4);				\
309        i = (Uint32)alpha | (Uint32)alpha << 16;	\
310        movd_m2r(*(&i), mm0);				\
311        punpckldq_r2r(mm0, mm0);			\
312        i = ((int)(length) & 3);				\
313        ALPHA = alpha >> 3;				\
314	for(; i > 0; --i) {				\
315	    Uint32 s = *srcp++;				\
316	    Uint32 d = *dstp;				\
317	    s = (s | s << 16) & 0x03e07c1f;		\
318	    d = (d | d << 16) & 0x03e07c1f;		\
319	    d += (s - d) * ALPHA >> 5;			\
320	    d &= 0x03e07c1f;				\
321	    *dstp++ = d | d >> 16;			\
322	    n++;					\
323	}						\
324	i = (int)(length) - n;				\
325	for(; i > 0; --i) {				\
326	  movq_m2r((*dstp), mm3);			\
327	  movq_m2r((*srcp), mm2);			\
328	  movq_r2r(mm2, mm5);				\
329	  pand_r2r(mm1 , mm5);				\
330	  psrlq_i2r(10, mm5);				\
331	  movq_r2r(mm3, mm6);				\
332	  pand_r2r(mm1 , mm6);				\
333	  psrlq_i2r(10, mm6);				\
334	  psubw_r2r(mm6, mm5);				\
335	  pmullw_r2r(mm0, mm5);				\
336	  psrlw_i2r(8, mm5);				\
337	  paddw_r2r(mm5, mm6);				\
338	  psllq_i2r(10, mm6);				\
339	  pand_r2r(mm1, mm6);				\
340	  movq_r2r(mm4, mm5);				\
341	  por_r2r(mm7, mm5);				\
342	  pand_r2r(mm5, mm3);				\
343	  por_r2r(mm6, mm3);				\
344	  movq_r2r(mm2, mm5);				\
345	  pand_r2r(mm4 , mm5);				\
346	  psrlq_i2r(5, mm5);				\
347	  movq_r2r(mm3, mm6);				\
348	  pand_r2r(mm4 , mm6);				\
349	  psrlq_i2r(5, mm6);				\
350	  psubw_r2r(mm6, mm5);				\
351	  pmullw_r2r(mm0, mm5);				\
352	  psrlw_i2r(8, mm5);				\
353	  paddw_r2r(mm5, mm6);				\
354	  psllq_i2r(5, mm6);				\
355	  pand_r2r(mm4, mm6);				\
356	  movq_r2r(mm1, mm5);				\
357	  por_r2r(mm7, mm5);				\
358	  pand_r2r(mm5, mm3);				\
359	  por_r2r(mm6, mm3);				\
360	  movq_r2r(mm2, mm5);				\
361	  pand_r2r(mm7 , mm5);				\
362          movq_r2r(mm3, mm6);				\
363	  pand_r2r(mm7 , mm6);				\
364	  psubw_r2r(mm6, mm5);				\
365	  pmullw_r2r(mm0, mm5);				\
366	  psrlw_i2r(8, mm5);				\
367	  paddw_r2r(mm5, mm6);				\
368	  pand_r2r(mm7, mm6);				\
369	  movq_r2r(mm1, mm5);				\
370	  por_r2r(mm4, mm5);				\
371	  pand_r2r(mm5, mm3);				\
372	  por_r2r(mm6, mm3);				\
373	  movq_r2m(mm3, *dstp);				\
374	  srcp += 4;					\
375	  dstp += 4;					\
376	  i -= 3;					\
377	}						\
378	emms();						\
379    } while(0)
380
381#endif
382
383/*
384 * For 32bpp pixels on the form 0x00rrggbb:
385 * If we treat the middle component separately, we can process the two
386 * remaining in parallel. This is safe to do because of the gap to the left
387 * of each component, so the bits from the multiplication don't collide.
388 * This can be used for any RGB permutation of course.
389 */
/*
 * Blend `length` 32bpp pixels from `from` onto `to` with a constant
 * per-surface `alpha` (0..255 expected by the >> 8 scaling).
 *
 * The two outer 8-bit components (mask 0xff00ff) are blended together in
 * one multiply; the middle component (mask 0xff00) is blended separately.
 * The top byte of every written pixel is cleared. `bpp` is unused; it is
 * accepted so all blit macros share one signature.
 *
 * `alpha` is parenthesized so that expression arguments (e.g. `a + b`)
 * are multiplied as a whole rather than being split by precedence.
 */
#define ALPHA_BLIT32_888(to, from, length, bpp, alpha)		\
    do {							\
        int i;							\
	Uint32 *src = (Uint32 *)(from);				\
	Uint32 *dst = (Uint32 *)(to);				\
	for(i = 0; i < (int)(length); i++) {			\
	    Uint32 s = *src++;					\
	    Uint32 d = *dst;					\
	    Uint32 s1 = s & 0xff00ff;				\
	    Uint32 d1 = d & 0xff00ff;				\
	    /* unsigned wrap-around makes (s - d) behave like a	\
	       signed difference after the final mask */	\
	    d1 = (d1 + ((s1 - d1) * (alpha) >> 8)) & 0xff00ff;	\
	    s &= 0xff00;					\
	    d &= 0xff00;					\
	    d = (d + ((s - d) * (alpha) >> 8)) & 0xff00;	\
	    *dst++ = d1 | d;					\
	}							\
    } while(0)
407
408/*
409 * For 16bpp pixels we can go a step further: put the middle component
410 * in the high 16 bits of a 32 bit word, and process all three RGB
411 * components at the same time. Since the smallest gap is here just
412 * 5 bits, we have to scale alpha down to 5 bits as well.
413 */
/*
 * Blend `length` 16bpp 5-6-5 pixels from `from` onto `to` with a constant
 * per-surface `alpha`.
 *
 * Each pixel is spread into a 32-bit word with the middle component in the
 * upper half (mask 0x07e0f81f), so one multiply blends all three
 * components. Alpha is reduced to 5 bits (alpha >> 3) to fit the 5-bit gap
 * between components. `bpp` is unused.
 *
 * `alpha` is parenthesized so that an expression argument is shifted as a
 * whole (e.g. `cond ? 128 : 255` would otherwise shift only the last arm).
 */
#define ALPHA_BLIT16_565(to, from, length, bpp, alpha)	\
    do {						\
        int i;						\
	Uint16 *src = (Uint16 *)(from);			\
	Uint16 *dst = (Uint16 *)(to);			\
	Uint32 ALPHA = (alpha) >> 3;			\
	for(i = 0; i < (int)(length); i++) {		\
	    Uint32 s = *src++;				\
	    Uint32 d = *dst;				\
	    s = (s | s << 16) & 0x07e0f81f;		\
	    d = (d | d << 16) & 0x07e0f81f;		\
	    d += (s - d) * ALPHA >> 5;			\
	    d &= 0x07e0f81f;				\
	    /* fold the middle component back down */	\
	    *dst++ = (Uint16)(d | d >> 16);		\
	}						\
    } while(0)
430
/*
 * Blend `length` 16bpp 5-5-5 pixels from `from` onto `to` with a constant
 * per-surface `alpha`. Identical to ALPHA_BLIT16_565 except for the spread
 * mask 0x03e07c1f; bit 15 of each written pixel is therefore cleared.
 * `bpp` is unused.
 *
 * `alpha` is parenthesized so that an expression argument is shifted as a
 * whole.
 */
#define ALPHA_BLIT16_555(to, from, length, bpp, alpha)	\
    do {						\
        int i;						\
	Uint16 *src = (Uint16 *)(from);			\
	Uint16 *dst = (Uint16 *)(to);			\
	Uint32 ALPHA = (alpha) >> 3;			\
	for(i = 0; i < (int)(length); i++) {		\
	    Uint32 s = *src++;				\
	    Uint32 d = *dst;				\
	    s = (s | s << 16) & 0x03e07c1f;		\
	    d = (d | d << 16) & 0x03e07c1f;		\
	    d += (s - d) * ALPHA >> 5;			\
	    d &= 0x03e07c1f;				\
	    /* fold the middle component back down */	\
	    *dst++ = (Uint16)(d | d >> 16);		\
	}						\
    } while(0)
447
448/*
449 * The general slow catch-all function, for remaining depths and formats
450 */
/*
 * Generic per-surface alpha blend for 2, 3 and 4 bytes per pixel.
 *
 * Reads one source and one destination pixel, splits both with
 * RGB_FROM_PIXEL, blends each component by `alpha` (>> 8 scaling),
 * reassembles with PIXEL_FROM_RGB and writes the result back.
 * 3-byte pixels are read and written byte-wise according to SDL_BYTEORDER.
 *
 * Requirements on the expansion site:
 *   - an identifier `fmt` must be in scope; it is passed to RGB_FROM_PIXEL
 *     and PIXEL_FROM_RGB (it is not a macro parameter);
 *   - `bpp` should be 2, 3 or 4. For any other value the pixel words are
 *     now zero-initialized instead of being read uninitialized.
 *
 * `to`, `from` and `alpha` are parenthesized so expression arguments are
 * evaluated as a whole.
 */
#define ALPHA_BLIT_ANY(to, from, length, bpp, alpha)			\
    do {								\
        int i;								\
	Uint8 *src = (Uint8 *)(from);					\
	Uint8 *dst = (Uint8 *)(to);					\
	for(i = 0; i < (int)(length); i++) {				\
	    Uint32 s = 0, d = 0;					\
	    unsigned rs, gs, bs, rd, gd, bd;				\
	    switch(bpp) {						\
	    case 2:							\
		s = *(Uint16 *)src;					\
		d = *(Uint16 *)dst;					\
		break;							\
	    case 3:							\
		if(SDL_BYTEORDER == SDL_BIG_ENDIAN) {			\
		    s = (src[0] << 16) | (src[1] << 8) | src[2];	\
		    d = (dst[0] << 16) | (dst[1] << 8) | dst[2];	\
		} else {						\
		    s = (src[2] << 16) | (src[1] << 8) | src[0];	\
		    d = (dst[2] << 16) | (dst[1] << 8) | dst[0];	\
		}							\
		break;							\
	    case 4:							\
		s = *(Uint32 *)src;					\
		d = *(Uint32 *)dst;					\
		break;							\
	    default:							\
		break;							\
	    }								\
	    RGB_FROM_PIXEL(s, fmt, rs, gs, bs);				\
	    RGB_FROM_PIXEL(d, fmt, rd, gd, bd);				\
	    /* unsigned arithmetic: a negative difference wraps, and	\
	       the excess high bits are dropped when repacking */	\
	    rd += (rs - rd) * (alpha) >> 8;				\
	    gd += (gs - gd) * (alpha) >> 8;				\
	    bd += (bs - bd) * (alpha) >> 8;				\
	    PIXEL_FROM_RGB(d, fmt, rd, gd, bd);				\
	    switch(bpp) {						\
	    case 2:							\
		*(Uint16 *)dst = (Uint16)d;				\
		break;							\
	    case 3:							\
		if(SDL_BYTEORDER == SDL_BIG_ENDIAN) {			\
		    dst[0] = (Uint8)(d >> 16);				\
		    dst[1] = (Uint8)(d >> 8);				\
		    dst[2] = (Uint8)(d);				\
		} else {						\
		    dst[0] = (Uint8)d;					\
		    dst[1] = (Uint8)(d >> 8);				\
		    dst[2] = (Uint8)(d >> 16);				\
		}							\
		break;							\
	    case 4:							\
		*(Uint32 *)dst = d;					\
		break;							\
	    default:							\
		break;							\
	    }								\
	    src += bpp;							\
	    dst += bpp;							\
	}								\
    } while(0)
507
508#ifdef MMX_ASMBLIT
509
/*
 * MMX 50% blend for 32bpp pixels (only compiled when MMX_ASMBLIT is
 * defined). mm4 holds 0x00fefefe and mm3 holds 0x00010101 in both dwords.
 * An odd leading pixel is blended in plain C; the loop then blends two
 * pixels per iteration with 64-bit loads/stores (`i` is decremented twice
 * per pass). `bpp` and `alpha` are unused.
 */
510#define ALPHA_BLIT32_888_50MMX(to, from, length, bpp, alpha)		\
511    do {								\
512	Uint32 *srcp = (Uint32 *)(from);				\
513	Uint32 *dstp = (Uint32 *)(to);					\
514        int i = 0x00fefefe;						\
515        movd_m2r(*(&i), mm4);						\
516        punpckldq_r2r(mm4, mm4);					\
517        i = 0x00010101;							\
518        movd_m2r(*(&i), mm3);						\
519        punpckldq_r2r(mm3, mm3);					\
520        i = (int)(length);						\
521        if( i & 1 ) {							\
522	  Uint32 s = *srcp++;						\
523	  Uint32 d = *dstp;						\
524	  *dstp++ = (((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)	\
525		     + (s & d & 0x00010101);				\
526	  i--;								\
527	}								\
528	for(; i > 0; --i) {						\
529	    movq_m2r((*dstp), mm2); /* dst -> mm2 */			\
530	    movq_r2r(mm2, mm6);	/* dst -> mm6 */			\
531	    movq_m2r((*srcp), mm1); /* src -> mm1 */			\
532	    movq_r2r(mm1, mm5);	/* src -> mm5 */			\
533	    pand_r2r(mm4, mm6);	/* dst & 0x00fefefe -> mm6 */		\
534	    pand_r2r(mm4, mm5); /* src & 0x00fefefe -> mm5 */		\
535	    paddd_r2r(mm6, mm5); /* (dst & 0x00fefefe) + (src & 0x00fefefe) -> mm5 */	\
536	    psrld_i2r(1, mm5);						\
537	    pand_r2r(mm1, mm2);	/* s & d -> mm2 */			\
538	    pand_r2r(mm3, mm2);	/* s & d & 0x00010101 -> mm2 */		\
539	    paddd_r2r(mm5, mm2);					\
540	    movq_r2m(mm2, (*dstp));					\
541	    dstp += 2;							\
542	    srcp += 2;							\
543	    i--;							\
544	}								\
545	emms();								\
546    } while(0)
547
548#endif
549
550/*
551 * Special case: 50% alpha (alpha=128)
552 * This is treated specially because it can be optimized very well, and
553 * since it is good for many cases of semi-translucency.
554 * The theory is to do all three components at the same time:
555 * First zero the lowest bit of each component, which gives us room to
556 * add them. Then shift right and add the sum of the lowest bits.
557 */
/*
 * 50% blend of `length` 32bpp pixels from `from` onto `to`.
 * Per pixel: average the upper seven bits of every component in one add,
 * then add back the low bit wherever both inputs had it set.
 * `bpp` and `alpha` are unused.
 */
#define ALPHA_BLIT32_888_50(to, from, length, bpp, alpha)		\
    do {								\
	Uint32 *src = (Uint32 *)(from);					\
	Uint32 *dst = (Uint32 *)(to);					\
	int i;								\
	for(i = 0; i < (int)(length); ++i) {				\
	    const Uint32 s = src[i];					\
	    const Uint32 d = dst[i];					\
	    Uint32 avg = ((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1;	\
	    avg += s & d & 0x00010101;					\
	    dst[i] = avg;						\
	}								\
    } while(0)
570
571/*
572 * For 16bpp, we can actually blend two pixels in parallel, if we take
573 * care to shift before we add, not after.
574 */
575
/*
 * helper: blend a single 16 bit pixel at 50%.
 * `dst` and `src` must be pointer lvalues; both are post-incremented.
 * `mask` selects the bits that take part in the add (every component bit
 * except the lowest of each component); the low bits are added back only
 * where both pixels have them set.
 */
#define BLEND16_50(dst, src, mask)				\
    do {							\
	Uint32 s = *src++;					\
	Uint32 d = *dst;					\
	*dst++ = (Uint16)((((s & (mask)) + (d & (mask))) >> 1) +	\
	                  (s & d & (~(mask) & 0xffff)));	\
    } while(0)

/*
 * basic 16bpp 50% blender. mask is the pixels to keep when adding.
 *
 * When source and destination share the same alignment modulo 4, pixels
 * are blended two at a time through 32-bit accesses, with single-pixel
 * blends for a misaligned first pixel and an odd last pixel. Otherwise
 * every pixel is blended individually.
 *
 * Fixes relative to the straightforward formulation:
 *   - the doubled mask is built in Uint32; `mask << 16` on an int constant
 *     such as 0xf7de overflows signed int, which is undefined behavior;
 *   - the "first odd pixel" step is skipped when length is 0, so `n`
 *     cannot wrap around to UINT_MAX and run off the buffers.
 *
 * `bpp` and `alpha` are unused.
 */
#define ALPHA_BLIT16_50(to, from, length, bpp, alpha, mask)		\
    do {								\
	unsigned n = (length);						\
	Uint16 *src = (Uint16 *)(from);					\
	Uint16 *dst = (Uint16 *)(to);					\
	if(((uintptr_t)src ^ (uintptr_t)dst) & 3) {			\
	    /* source and destination not in phase, blit one by one */	\
	    while(n--)							\
		BLEND16_50(dst, src, mask);				\
	} else {							\
	    if(n && ((uintptr_t)src & 3)) {				\
		/* first odd pixel */					\
		BLEND16_50(dst, src, mask);				\
		n--;							\
	    }								\
	    for(; n > 1; n -= 2) {					\
		Uint32 s = *(Uint32 *)src;				\
		Uint32 d = *(Uint32 *)dst;				\
		*(Uint32 *)dst =					\
		    ((s & ((Uint32)(mask) | (Uint32)(mask) << 16)) >> 1)	\
		  + ((d & ((Uint32)(mask) | (Uint32)(mask) << 16)) >> 1)	\
		  + (s & d & ~((Uint32)(mask) | (Uint32)(mask) << 16));	\
		src += 2;						\
		dst += 2;						\
	    }								\
	    if(n)							\
		BLEND16_50(dst, src, mask); /* last odd pixel */	\
	}								\
    } while(0)

/* 5-6-5: all bits except the lowest bit of each component */
#define ALPHA_BLIT16_565_50(to, from, length, bpp, alpha)	\
    ALPHA_BLIT16_50(to, from, length, bpp, alpha, 0xf7de)

/* 5-5-5: all bits except the lowest bit of each component */
#define ALPHA_BLIT16_555_50(to, from, length, bpp, alpha)	\
    ALPHA_BLIT16_50(to, from, length, bpp, alpha, 0xfbde)
620
621#ifdef MMX_ASMBLIT
622
/*
 * CHOOSE_BLIT (MMX build): pick the per-run blit macro for the given
 * per-surface `alpha` and pixel format `fmt`, then expand
 * `blitter(bpp, Type, do_blit)` with it.
 *
 *   bpp      literal bytes per pixel (1..4)
 *   Type     Uint8 for 1-3 bytes per pixel, Uint16 for 4 (passed through
 *            to `blitter` together with bpp)
 *   do_blit  OPAQUE_BLIT when alpha == 255, otherwise one of the
 *            ALPHA_BLIT* macros
 *
 * Selection when alpha != 255:
 *   1 bpp  nothing is expanded (no 8bpp alpha blitting)
 *   2 bpp  5-6-5 or 5-5-5 layouts (detected from the OR of the RGB masks
 *          and a component sitting at the middle mask) use the dedicated
 *          16-bit blenders: the _50 variant for alpha == 128, otherwise
 *          the MMX variant if SDL_HasMMX() and the C variant if not;
 *          any other 16-bit layout uses ALPHA_BLIT_ANY
 *   3 bpp  ALPHA_BLIT_ANY
 *   4 bpp  RGB masks covering 0x00ffffff with a component at 0xff00 use
 *          the 888 blenders (again _50 / MMX / C); otherwise ALPHA_BLIT_ANY
 *
 * The expansion contains the label `general16`, so this macro can be used
 * at most once per function.
 */
623#define CHOOSE_BLIT(blitter, alpha, fmt)				\
624    do {								\
625        if(alpha == 255) {						\
626	    switch(fmt->BytesPerPixel) {				\
627	    case 1: blitter(1, Uint8, OPAQUE_BLIT); break;		\
628	    case 2: blitter(2, Uint8, OPAQUE_BLIT); break;		\
629	    case 3: blitter(3, Uint8, OPAQUE_BLIT); break;		\
630	    case 4: blitter(4, Uint16, OPAQUE_BLIT); break;		\
631	    }								\
632	} else {							\
633	    switch(fmt->BytesPerPixel) {				\
634	    case 1:							\
635		/* No 8bpp alpha blitting */				\
636		break;							\
637									\
638	    case 2:							\
639		switch(fmt->Rmask | fmt->Gmask | fmt->Bmask) {		\
640		case 0xffff:						\
641		    if(fmt->Gmask == 0x07e0				\
642		       || fmt->Rmask == 0x07e0				\
643		       || fmt->Bmask == 0x07e0) {			\
644			if(alpha == 128)				\
645			    blitter(2, Uint8, ALPHA_BLIT16_565_50);	\
646			else {						\
647			    if(SDL_HasMMX())				\
648				blitter(2, Uint8, ALPHA_BLIT16_565MMX);	\
649			    else					\
650				blitter(2, Uint8, ALPHA_BLIT16_565);	\
651			}						\
652		    } else						\
653			goto general16;					\
654		    break;						\
655									\
656		case 0x7fff:						\
657		    if(fmt->Gmask == 0x03e0				\
658		       || fmt->Rmask == 0x03e0				\
659		       || fmt->Bmask == 0x03e0) {			\
660			if(alpha == 128)				\
661			    blitter(2, Uint8, ALPHA_BLIT16_555_50);	\
662			else {						\
663			    if(SDL_HasMMX())				\
664				blitter(2, Uint8, ALPHA_BLIT16_555MMX);	\
665			    else					\
666				blitter(2, Uint8, ALPHA_BLIT16_555);	\
667			}						\
668			break;						\
669		    }							\
670		    /* fallthrough */					\
671									\
672		default:						\
673		general16:						\
674		    blitter(2, Uint8, ALPHA_BLIT_ANY);			\
675		}							\
676		break;							\
677									\
678	    case 3:							\
679		blitter(3, Uint8, ALPHA_BLIT_ANY);			\
680		break;							\
681									\
682	    case 4:							\
683		if((fmt->Rmask | fmt->Gmask | fmt->Bmask) == 0x00ffffff	\
684		   && (fmt->Gmask == 0xff00 || fmt->Rmask == 0xff00	\
685		       || fmt->Bmask == 0xff00)) {			\
686		    if(alpha == 128)					\
687		    {							\
688			if(SDL_HasMMX())				\
689				blitter(4, Uint16, ALPHA_BLIT32_888_50MMX);\
690			else						\
691				blitter(4, Uint16, ALPHA_BLIT32_888_50);\
692		    }							\
693		    else						\
694		    {							\
695			if(SDL_HasMMX())				\
696				blitter(4, Uint16, ALPHA_BLIT32_888MMX);\
697			else						\
698				blitter(4, Uint16, ALPHA_BLIT32_888);	\
699		    }							\
700		} else							\
701		    blitter(4, Uint16, ALPHA_BLIT_ANY);			\
702		break;							\
703	    }								\
704	}								\
705    } while(0)
706
707#else
708
/*
 * CHOOSE_BLIT (portable build): pick the per-run blit macro for the given
 * per-surface `alpha` and pixel format `fmt`, then expand
 * `blitter(bpp, Type, do_blit)` with it.
 *
 *   bpp      literal bytes per pixel (1..4)
 *   Type     Uint8 for 1-3 bytes per pixel, Uint16 for 4 (passed through
 *            to `blitter` together with bpp)
 *   do_blit  OPAQUE_BLIT when alpha == 255, otherwise one of the
 *            ALPHA_BLIT* macros
 *
 * Selection when alpha != 255:
 *   1 bpp  nothing is expanded (no 8bpp alpha blitting)
 *   2 bpp  5-6-5 or 5-5-5 layouts (detected from the OR of the RGB masks
 *          and a component sitting at the middle mask) use the dedicated
 *          16-bit blenders, with the _50 variant for alpha == 128;
 *          any other 16-bit layout uses ALPHA_BLIT_ANY
 *   3 bpp  ALPHA_BLIT_ANY
 *   4 bpp  RGB masks covering 0x00ffffff with a component at 0xff00 use
 *          ALPHA_BLIT32_888_50 (alpha == 128) or ALPHA_BLIT32_888;
 *          otherwise ALPHA_BLIT_ANY
 *
 * The expansion contains the label `general16`, so this macro can be used
 * at most once per function.
 */
709#define CHOOSE_BLIT(blitter, alpha, fmt)				\
710    do {								\
711        if(alpha == 255) {						\
712	    switch(fmt->BytesPerPixel) {				\
713	    case 1: blitter(1, Uint8, OPAQUE_BLIT); break;		\
714	    case 2: blitter(2, Uint8, OPAQUE_BLIT); break;		\
715	    case 3: blitter(3, Uint8, OPAQUE_BLIT); break;		\
716	    case 4: blitter(4, Uint16, OPAQUE_BLIT); break;		\
717	    }								\
718	} else {							\
719	    switch(fmt->BytesPerPixel) {				\
720	    case 1:							\
721		/* No 8bpp alpha blitting */				\
722		break;							\
723									\
724	    case 2:							\
725		switch(fmt->Rmask | fmt->Gmask | fmt->Bmask) {		\
726		case 0xffff:						\
727		    if(fmt->Gmask == 0x07e0				\
728		       || fmt->Rmask == 0x07e0				\
729		       || fmt->Bmask == 0x07e0) {			\
730			if(alpha == 128)				\
731			    blitter(2, Uint8, ALPHA_BLIT16_565_50);	\
732			else {						\
733			    blitter(2, Uint8, ALPHA_BLIT16_565);	\
734			}						\
735		    } else						\
736			goto general16;					\
737		    break;						\
738									\
739		case 0x7fff:						\
740		    if(fmt->Gmask == 0x03e0				\
741		       || fmt->Rmask == 0x03e0				\
742		       || fmt->Bmask == 0x03e0) {			\
743			if(alpha == 128)				\
744			    blitter(2, Uint8, ALPHA_BLIT16_555_50);	\
745			else {						\
746			    blitter(2, Uint8, ALPHA_BLIT16_555);	\
747			}						\
748			break;						\
749		    }							\
750		    /* fallthrough */					\
751									\
752		default:						\
753		general16:						\
754		    blitter(2, Uint8, ALPHA_BLIT_ANY);			\
755		}							\
756		break;							\
757									\
758	    case 3:							\
759		blitter(3, Uint8, ALPHA_BLIT_ANY);			\
760		break;							\
761									\
762	    case 4:							\
763		if((fmt->Rmask | fmt->Gmask | fmt->Bmask) == 0x00ffffff	\
764		   && (fmt->Gmask == 0xff00 || fmt->Rmask == 0xff00	\
765		       || fmt->Bmask == 0xff00)) {			\
766		    if(alpha == 128)					\
767			blitter(4, Uint16, ALPHA_BLIT32_888_50);	\
768		    else						\
769			blitter(4, Uint16, ALPHA_BLIT32_888);		\
770		} else							\
771		    blitter(4, Uint16, ALPHA_BLIT_ANY);			\
772		break;							\
773	    }								\
774	}								\
775    } while(0)
776
777#endif
778
779/*
780 * This takes care of the case when the surface is clipped on the left and/or
781 * right. Top clipping has already been taken care of.
782 */
/*
 * Blit a colorkey-RLE stream with left/right clipping.
 *
 *   w        width of one encoded scan line in pixels; a line is complete
 *            when the running offset reaches w
 *   srcbuf   RLE stream, positioned at the first line to draw
 *   dst      destination surface (its format selects the blitter, its
 *            pitch advances dstbuf per line)
 *   dstbuf   destination address corresponding to source column
 *            srcrect->x of the first line
 *   srcrect  x and w give the visible column range [x, x+w); h is the
 *            number of lines to draw
 *   alpha    per-surface alpha, 255 for an opaque copy
 *
 * The loop stops after srcrect->h lines, or earlier at the end-of-stream
 * marker (a zero skip / zero run pair at the start of a line).
 */
783static void RLEClipBlit(int w, Uint8 *srcbuf, SDL_Surface *dst,
784			Uint8 *dstbuf, SDL_Rect *srcrect, unsigned alpha)
785{
786    SDL_PixelFormat *fmt = dst->format;
787
/*
 * RLECLIPBLIT(bpp, Type, do_blit): the per-format body expanded by
 * CHOOSE_BLIT. Type is the integer type of the <skip>/<run> counters.
 * dstbuf is first moved back by `left` pixels so that source column
 * numbers can index it directly. Each run is trimmed to [left, right)
 * before do_blit is called; runs entirely outside are only skipped in the
 * source stream. The jump label is made unique per expansion by pasting
 * bpp and do_blit into its name.
 */
788#define RLECLIPBLIT(bpp, Type, do_blit)					   \
789    do {								   \
790	int linecount = srcrect->h;					   \
791	int ofs = 0;							   \
792	int left = srcrect->x;						   \
793	int right = left + srcrect->w;					   \
794	dstbuf -= left * bpp;						   \
795	for(;;) {							   \
796	    int run;							   \
797	    ofs += *(Type *)srcbuf;					   \
798	    run = ((Type *)srcbuf)[1];					   \
799	    srcbuf += 2 * sizeof(Type);					   \
800	    if(run) {							   \
801		/* clip to left and right borders */			   \
802		if(ofs < right) {					   \
803		    int start = 0;					   \
804		    int len = run;					   \
805		    int startcol;					   \
806		    if(left - ofs > 0) {				   \
807			start = left - ofs;				   \
808			len -= start;					   \
809			if(len <= 0)					   \
810			    goto nocopy ## bpp ## do_blit;		   \
811		    }							   \
812		    startcol = ofs + start;				   \
813		    if(len > right - startcol)				   \
814			len = right - startcol;				   \
815		    do_blit(dstbuf + startcol * bpp, srcbuf + start * bpp, \
816			    len, bpp, alpha);				   \
817		}							   \
818	    nocopy ## bpp ## do_blit:					   \
819		srcbuf += run * bpp;					   \
820		ofs += run;						   \
821	    } else if(!ofs)						   \
822		break;							   \
823	    if(ofs == w) {						   \
824		ofs = 0;						   \
825		dstbuf += dst->pitch;					   \
826		if(!--linecount)					   \
827		    break;						   \
828	    }								   \
829	}								   \
830    } while(0)
831
832    CHOOSE_BLIT(RLECLIPBLIT, alpha, fmt);
833
834#undef RLECLIPBLIT
835
836}
837
838
839/* blit a colorkeyed RLE surface */
/*
 * Blit a colorkey-RLE encoded surface.
 *
 *   src      surface whose RLE stream is read from
 *            src->map->sw_data->aux_data; src->w is the encoded line width
 *   srcrect  y = number of encoded lines to skip at the top, h = number of
 *            lines to draw, x / w = visible column range (when they do not
 *            cover the full width, RLEClipBlit is used)
 *   dst      destination surface; locked for the duration of the blit if
 *            SDL_MUSTLOCK(dst)
 *   dstrect  only x and y are read: the destination position
 *
 * Per-surface alpha is taken from src->format->alpha when SDL_SRCALPHA is
 * set in src->flags, otherwise the blit is opaque (alpha 255).
 *
 * Returns -1 if the destination could not be locked, 0 otherwise
 * (including when the stream ends while skipping the top lines).
 */
840int SDL_RLEBlit(SDL_Surface *src, SDL_Rect *srcrect,
841		SDL_Surface *dst, SDL_Rect *dstrect)
842{
843	Uint8 *dstbuf;
844	Uint8 *srcbuf;
845	int x, y;
846	int w = src->w;
847	unsigned alpha;
848
849	/* Lock the destination if necessary */
850	if ( SDL_MUSTLOCK(dst) ) {
851		if ( SDL_LockSurface(dst) < 0 ) {
852			return(-1);
853		}
854	}
855
856	/* Set up the source and destination pointers */
857	x = dstrect->x;
858	y = dstrect->y;
859	dstbuf = (Uint8 *)dst->pixels
860	         + y * dst->pitch + x * src->format->BytesPerPixel;
861	srcbuf = (Uint8 *)src->map->sw_data->aux_data;
862
863	{
864	    /* skip lines at the top if neccessary */
865	    int vskip = srcrect->y;
866	    int ofs = 0;
867	    if(vskip) {
868
/* RLESKIP(bpp, Type): walk the stream without drawing until vskip complete
   lines have been consumed; jumps to `done` if the end marker is hit. */
869#define RLESKIP(bpp, Type)			\
870		for(;;) {			\
871		    int run;			\
872		    ofs += *(Type *)srcbuf;	\
873		    run = ((Type *)srcbuf)[1];	\
874		    srcbuf += sizeof(Type) * 2;	\
875		    if(run) {			\
876			srcbuf += run * bpp;	\
877			ofs += run;		\
878		    } else if(!ofs)		\
879			goto done;		\
880		    if(ofs == w) {		\
881			ofs = 0;		\
882			if(!--vskip)		\
883			    break;		\
884		    }				\
885		}
886
887		switch(src->format->BytesPerPixel) {
888		case 1: RLESKIP(1, Uint8); break;
889		case 2: RLESKIP(2, Uint8); break;
890		case 3: RLESKIP(3, Uint8); break;
891		case 4: RLESKIP(4, Uint16); break;
892		}
893
894#undef RLESKIP
895
896	    }
897	}
898
899	alpha = (src->flags & SDL_SRCALPHA) == SDL_SRCALPHA
900	        ? src->format->alpha : 255;
901	/* if left or right edge clipping needed, call clip blit */
902	if ( srcrect->x || srcrect->w != src->w ) {
903	    RLEClipBlit(w, srcbuf, dst, dstbuf, srcrect, alpha);
904	} else {
905	    SDL_PixelFormat *fmt = src->format;
906
/* RLEBLIT(bpp, Type, do_blit): unclipped body expanded by CHOOSE_BLIT.
   Every run is drawn in full at column `ofs` of the current line. */
907#define RLEBLIT(bpp, Type, do_blit)					      \
908	    do {							      \
909		int linecount = srcrect->h;				      \
910		int ofs = 0;						      \
911		for(;;) {						      \
912		    unsigned run;					      \
913		    ofs += *(Type *)srcbuf;				      \
914		    run = ((Type *)srcbuf)[1];				      \
915		    srcbuf += 2 * sizeof(Type);				      \
916		    if(run) {						      \
917			do_blit(dstbuf + ofs * bpp, srcbuf, run, bpp, alpha); \
918			srcbuf += run * bpp;				      \
919			ofs += run;					      \
920		    } else if(!ofs)					      \
921			break;						      \
922		    if(ofs == w) {					      \
923			ofs = 0;					      \
924			dstbuf += dst->pitch;				      \
925			if(!--linecount)				      \
926			    break;					      \
927		    }							      \
928		}							      \
929	    } while(0)
930
931	    CHOOSE_BLIT(RLEBLIT, alpha, fmt);
932
933#undef RLEBLIT
934	}
935
936done:
937	/* Unlock the destination if necessary */
938	if ( SDL_MUSTLOCK(dst) ) {
939		SDL_UnlockSurface(dst);
940	}
941	return(0);
942}
943
944#undef OPAQUE_BLIT
945
946/*
947 * Per-pixel blitting macros for translucent pixels:
948 * These use the same techniques as the per-surface blitting macros
949 */
950
951/*
952 * For 32bpp pixels, we have made sure the alpha is stored in the top
953 * 8 bits, so proceed as usual
954 */
/*
 * Blend one translucent 32-bit encoded pixel `src` (alpha in the top
 * byte) onto the 32bpp lvalue `dst`. The two outer components are blended
 * together, the middle one separately; the top byte of `dst` ends up zero.
 */
#define BLIT_TRANSL_888(src, dst)				\
    do {							\
	Uint32 s = src;						\
	Uint32 d = dst;						\
	unsigned alpha = s >> 24;				\
	Uint32 s1 = s & 0xff00ff;				\
	Uint32 d1 = d & 0xff00ff;				\
	/* outer pair */					\
	d1 += (s1 - d1) * alpha >> 8;				\
	d1 &= 0xff00ff;						\
	/* middle component */					\
	s &= 0xff00;						\
	d &= 0xff00;						\
	d += (s - d) * alpha >> 8;				\
	d &= 0xff00;						\
	dst = d | d1;						\
    } while(0)
968
969/*
970 * For 16bpp pixels, we have stored the 5 most significant alpha bits in
971 * bits 5-9. As before, we can process all 3 RGB components at the same time.
972 */
/*
 * Blend one translucent encoded pixel `src` (32-bit, components already
 * spread as 0x07e0f81f with a 5-bit alpha under mask 0x3e0) onto the
 * 16bpp 5-6-5 lvalue `dst`.
 */
#define BLIT_TRANSL_565(src, dst)		\
    do {					\
	Uint32 s = src;				\
	Uint32 d = dst;				\
	unsigned alpha = (s >> 5) & 0x1f;	\
	s &= 0x07e0f81f;			\
	d |= d << 16;				\
	d &= 0x07e0f81f;			\
	d = (d + ((s - d) * alpha >> 5))	\
	    & 0x07e0f81f;			\
	dst = (Uint16)((d >> 16) | d);		\
    } while(0)
984
/*
 * Blend one translucent pixel into a 16bpp 555 destination pixel.
 *
 *   src  32-bit source word in the layout written by copy_transl_555():
 *        one 5-bit channel in bits 21-25, the other two in bits 10-14
 *        and 0-4, and the 5 most significant alpha bits in bits 5-9
 *        (mask 0x3e0)
 *   dst  16-bit destination pixel lvalue; read once, then overwritten
 *
 * The destination is spread into the same 0x03e07c1f layout so that all
 * 3 RGB components are interpolated with one multiply
 * (d + ((s - d) * alpha >> 5)), then folded back into 16 bits.
 *
 * The arguments are parenthesized and the temporaries carry a blend_
 * prefix so that caller expressions are never captured by the macro.
 */
#define BLIT_TRANSL_555(src, dst)				\
    do {							\
	Uint32 blend_s = (src);					\
	Uint32 blend_d = (dst);					\
	unsigned blend_alpha = (blend_s & 0x3e0) >> 5;		\
	blend_s &= 0x03e07c1f;					\
	blend_d = (blend_d | blend_d << 16) & 0x03e07c1f;	\
	blend_d += (blend_s - blend_d) * blend_alpha >> 5;	\
	blend_d &= 0x03e07c1f;					\
	(dst) = (Uint16)(blend_d | blend_d >> 16);		\
    } while(0)
996
/*
 * Used to save the destination format in the encoding. Designed to be
 * macro-compatible with SDL_PixelFormat but without the unneeded fields.
 *
 * RLEAlphaSurface() writes one of these at the very start of the encoded
 * buffer, copying every field from the same-named field of the
 * destination surface's SDL_PixelFormat. UnRLEAlpha() reads it back and
 * hands it to the uncopy_* helpers as the format of the encoded pixels.
 */
typedef struct {
	Uint8  BytesPerPixel;	/* 2 selects the 16bpp stream layout when
				   decoding, anything else the 32bpp one */
	Uint8  Rloss;		/* per-channel loss values of the target */
	Uint8  Gloss;
	Uint8  Bloss;
	Uint8  Rshift;		/* per-channel shift values of the target */
	Uint8  Gshift;
	Uint8  Bshift;
	Uint8  Ashift;
	Uint32 Rmask;		/* per-channel bit masks of the target */
	Uint32 Gmask;
	Uint32 Bmask;
	Uint32 Amask;
} RLEDestFormat;
1013
/*
 * Blit a pixel-alpha RLE surface clipped at the right and/or left edges.
 *
 *   w        width of the encoded surface in pixels; each encoded line is
 *            walked until the running pixel offset reaches w
 *   srcbuf   encoded stream, positioned at the first line to draw
 *   dst      destination surface (supplies the pixel format and pitch)
 *   dstbuf   address of the destination pixel that receives source
 *            column srcrect->x of the first line
 *   srcrect  x and w give the visible column range [x, x + w) in source
 *            coordinates, h the number of lines to draw
 *
 * Every line is still walked in full; each run is trimmed to the visible
 * column range before any pixel is written. The function returns early if
 * the end-of-stream marker (a zero skip / zero run pair at the start of a
 * line) is met before srcrect->h lines have been drawn.
 *
 * NOTE(review): the line loop is a do/while, so srcrect->h is presumably
 * expected to be at least 1 -- confirm against the caller.
 */
static void RLEAlphaClipBlit(int w, Uint8 *srcbuf, SDL_Surface *dst,
			     Uint8 *dstbuf, SDL_Rect *srcrect)
{
    SDL_PixelFormat *df = dst->format;
    /*
     * clipped blitter: Ptype is the destination pixel type,
     * Ctype the type of the opaque skip/run counts (the translucent
     * counts are always Uint16), and do_blend the macro
     * to blend one pixel.
     *
     * dstbuf is first moved left by srcrect->x pixels so that source
     * column numbers can be used directly as destination offsets.
     */
#define RLEALPHACLIPBLIT(Ptype, Ctype, do_blend)			  \
    do {								  \
	int linecount = srcrect->h;					  \
	int left = srcrect->x;						  \
	int right = left + srcrect->w;					  \
	dstbuf -= left * sizeof(Ptype);					  \
	do {								  \
	    int ofs = 0;						  \
	    /* blit opaque pixels on one line */			  \
	    do {							  \
		unsigned run;						  \
		ofs += ((Ctype *)srcbuf)[0];				  \
		run = ((Ctype *)srcbuf)[1];				  \
		srcbuf += 2 * sizeof(Ctype);				  \
		if(run) {						  \
		    /* clip to left and right borders */		  \
		    int cofs = ofs;					  \
		    int crun = run;					  \
		    if(left - cofs > 0) {				  \
			crun -= left - cofs;				  \
			cofs = left;					  \
		    }							  \
		    if(crun > right - cofs)				  \
			crun = right - cofs;				  \
		    if(crun > 0)					  \
			PIXEL_COPY(dstbuf + cofs * sizeof(Ptype),	  \
				   srcbuf + (cofs - ofs) * sizeof(Ptype), \
				   (unsigned)crun, sizeof(Ptype));	  \
		    srcbuf += run * sizeof(Ptype);			  \
		    ofs += run;						  \
		} else if(!ofs)						  \
		    return;						  \
	    } while(ofs < w);						  \
	    /* skip padding if necessary */				  \
	    if(sizeof(Ptype) == 2)					  \
		srcbuf += (uintptr_t)srcbuf & 2;			  \
	    /* blit translucent pixels on the same line */		  \
	    ofs = 0;							  \
	    do {							  \
		unsigned run;						  \
		ofs += ((Uint16 *)srcbuf)[0];				  \
		run = ((Uint16 *)srcbuf)[1];				  \
		srcbuf += 4;						  \
		if(run) {						  \
		    /* clip to left and right borders */		  \
		    int cofs = ofs;					  \
		    int crun = run;					  \
		    if(left - cofs > 0) {				  \
			crun -= left - cofs;				  \
			cofs = left;					  \
		    }							  \
		    if(crun > right - cofs)				  \
			crun = right - cofs;				  \
		    if(crun > 0) {					  \
			Ptype *dst = (Ptype *)dstbuf + cofs;		  \
			Uint32 *src = (Uint32 *)srcbuf + (cofs - ofs);	  \
			int i;						  \
			for(i = 0; i < crun; i++)			  \
			    do_blend(src[i], dst[i]);			  \
		    }							  \
		    srcbuf += run * 4;					  \
		    ofs += run;						  \
		}							  \
	    } while(ofs < w);						  \
	    dstbuf += dst->pitch;					  \
	} while(--linecount);						  \
    } while(0)

    /* pick the instantiation matching the destination depth; any depth
       other than 2 or 4 bytes per pixel draws nothing */
    switch(df->BytesPerPixel) {
    case 2:
	/* a 0x07e0 mask on any channel means the 565 layout */
	if(df->Gmask == 0x07e0 || df->Rmask == 0x07e0
	   || df->Bmask == 0x07e0)
	    RLEALPHACLIPBLIT(Uint16, Uint8, BLIT_TRANSL_565);
	else
	    RLEALPHACLIPBLIT(Uint16, Uint8, BLIT_TRANSL_555);
	break;
    case 4:
	RLEALPHACLIPBLIT(Uint32, Uint16, BLIT_TRANSL_888);
	break;
    }
}
1105
/*
 * Blit a pixel-alpha RLE surface.
 *
 *   src      RLE-encoded source; the stream is read from
 *            src->map->sw_data->aux_data, just past the RLEDestFormat
 *            header stored at its start
 *   srcrect  part of the source to draw: y lines are skipped at the top,
 *            h lines are drawn, and x / w select the columns
 *   dst      destination surface; locked for the duration of the blit
 *            when SDL_MUSTLOCK(dst) says so
 *   dstrect  only x and y are used: the destination position of the
 *            top-left pixel
 *
 * Returns 0, or -1 if the destination could not be locked. Reaching the
 * end-of-stream marker before all requested lines are drawn simply ends
 * the blit. Destinations that are neither 2 nor 4 bytes per pixel are
 * left untouched by the blit switch.
 *
 * NOTE(review): no clipping against the destination is done here, so the
 * rectangles are presumably clipped by the caller -- confirm.
 */
int SDL_RLEAlphaBlit(SDL_Surface *src, SDL_Rect *srcrect,
		     SDL_Surface *dst, SDL_Rect *dstrect)
{
    int x, y;
    int w = src->w;
    Uint8 *srcbuf, *dstbuf;
    SDL_PixelFormat *df = dst->format;

    /* Lock the destination if necessary */
    if ( SDL_MUSTLOCK(dst) ) {
	if ( SDL_LockSurface(dst) < 0 ) {
	    return -1;
	}
    }

    x = dstrect->x;
    y = dstrect->y;
    dstbuf = (Uint8 *)dst->pixels
	     + y * dst->pitch + x * df->BytesPerPixel;
    srcbuf = (Uint8 *)src->map->sw_data->aux_data + sizeof(RLEDestFormat);

    {
	/* skip lines at the top if necessary */
	int vskip = srcrect->y;
	if(vskip) {
	    int ofs;
	    if(df->BytesPerPixel == 2) {
		/* the 16/32 interleaved format */
		do {
		    /* skip opaque line */
		    ofs = 0;
		    do {
			int run;
			ofs += srcbuf[0];
			run = srcbuf[1];
			srcbuf += 2;
			if(run) {
			    srcbuf += 2 * run;
			    ofs += run;
			} else if(!ofs)
			    goto done;	/* end-of-stream marker */
		    } while(ofs < w);

		    /* skip padding */
		    srcbuf += (uintptr_t)srcbuf & 2;

		    /* skip translucent line */
		    ofs = 0;
		    do {
			int run;
			ofs += ((Uint16 *)srcbuf)[0];
			run = ((Uint16 *)srcbuf)[1];
			/* 4 bytes of counts plus 4 bytes per pixel */
			srcbuf += 4 * (run + 1);
			ofs += run;
		    } while(ofs < w);
		} while(--vskip);
	    } else {
		/* the 32/32 interleaved format */
		vskip <<= 1;	/* opaque and translucent have same format */
		do {
		    ofs = 0;
		    do {
			int run;
			ofs += ((Uint16 *)srcbuf)[0];
			run = ((Uint16 *)srcbuf)[1];
			srcbuf += 4;
			if(run) {
			    srcbuf += 4 * run;
			    ofs += run;
			} else if(!ofs)
			    goto done;	/* end-of-stream marker */
		    } while(ofs < w);
		} while(--vskip);
	    }
	}
    }

    /* if left or right edge clipping needed, call clip blit */
    if(srcrect->x || srcrect->w != src->w) {
	RLEAlphaClipBlit(w, srcbuf, dst, dstbuf, srcrect);
    } else {

	/*
	 * non-clipped blitter. Ptype is the destination pixel type,
	 * Ctype the type of the opaque skip/run counts (the translucent
	 * counts are always Uint16), and do_blend the
	 * macro to blend one pixel.
	 */
#define RLEALPHABLIT(Ptype, Ctype, do_blend)				 \
	do {								 \
	    int linecount = srcrect->h;					 \
	    do {							 \
		int ofs = 0;						 \
		/* blit opaque pixels on one line */			 \
		do {							 \
		    unsigned run;					 \
		    ofs += ((Ctype *)srcbuf)[0];			 \
		    run = ((Ctype *)srcbuf)[1];				 \
		    srcbuf += 2 * sizeof(Ctype);			 \
		    if(run) {						 \
			PIXEL_COPY(dstbuf + ofs * sizeof(Ptype), srcbuf, \
				   run, sizeof(Ptype));			 \
			srcbuf += run * sizeof(Ptype);			 \
			ofs += run;					 \
		    } else if(!ofs)					 \
			goto done;					 \
		} while(ofs < w);					 \
		/* skip padding if necessary */				 \
		if(sizeof(Ptype) == 2)					 \
		    srcbuf += (uintptr_t)srcbuf & 2;		 	 \
		/* blit translucent pixels on the same line */		 \
		ofs = 0;						 \
		do {							 \
		    unsigned run;					 \
		    ofs += ((Uint16 *)srcbuf)[0];			 \
		    run = ((Uint16 *)srcbuf)[1];			 \
		    srcbuf += 4;					 \
		    if(run) {						 \
			Ptype *dst = (Ptype *)dstbuf + ofs;		 \
			unsigned i;					 \
			for(i = 0; i < run; i++) {			 \
			    Uint32 src = *(Uint32 *)srcbuf;		 \
			    do_blend(src, *dst);			 \
			    srcbuf += 4;				 \
			    dst++;					 \
			}						 \
			ofs += run;					 \
		    }							 \
		} while(ofs < w);					 \
		dstbuf += dst->pitch;					 \
	    } while(--linecount);					 \
	} while(0)

	/* pick the instantiation matching the destination depth */
	switch(df->BytesPerPixel) {
	case 2:
	    /* a 0x07e0 mask on any channel means the 565 layout */
	    if(df->Gmask == 0x07e0 || df->Rmask == 0x07e0
	       || df->Bmask == 0x07e0)
		RLEALPHABLIT(Uint16, Uint8, BLIT_TRANSL_565);
	    else
		RLEALPHABLIT(Uint16, Uint8, BLIT_TRANSL_555);
	    break;
	case 4:
	    RLEALPHABLIT(Uint32, Uint16, BLIT_TRANSL_888);
	    break;
	}
    }

 done:
    /* Unlock the destination if necessary */
    if ( SDL_MUSTLOCK(dst) ) {
	SDL_UnlockSurface(dst);
    }
    return 0;
}
1260
1261/*
1262 * Auxiliary functions:
1263 * The encoding functions take 32bpp rgb + a, and
1264 * return the number of bytes copied to the destination.
1265 * The decoding functions copy to 32bpp rgb + a, and
1266 * return the number of bytes copied from the source.
1267 * These are only used in the encoder and un-RLE code and are therefore not
1268 * highly optimised.
1269 */
1270
1271/* encode 32bpp rgb + a into 16bpp rgb, losing alpha */
1272static int copy_opaque_16(void *dst, Uint32 *src, int n,
1273			  SDL_PixelFormat *sfmt, SDL_PixelFormat *dfmt)
1274{
1275    int i;
1276    Uint16 *d = dst;
1277    for(i = 0; i < n; i++) {
1278	unsigned r, g, b;
1279	RGB_FROM_PIXEL(*src, sfmt, r, g, b);
1280	PIXEL_FROM_RGB(*d, dfmt, r, g, b);
1281	src++;
1282	d++;
1283    }
1284    return n * 2;
1285}
1286
1287/* decode opaque pixels from 16bpp to 32bpp rgb + a */
1288static int uncopy_opaque_16(Uint32 *dst, void *src, int n,
1289			    RLEDestFormat *sfmt, SDL_PixelFormat *dfmt)
1290{
1291    int i;
1292    Uint16 *s = src;
1293    unsigned alpha = dfmt->Amask ? 255 : 0;
1294    for(i = 0; i < n; i++) {
1295	unsigned r, g, b;
1296	RGB_FROM_PIXEL(*s, sfmt, r, g, b);
1297	PIXEL_FROM_RGBA(*dst, dfmt, r, g, b, alpha);
1298	s++;
1299	dst++;
1300    }
1301    return n * 2;
1302}
1303
1304
1305
1306/* encode 32bpp rgb + a into 32bpp G0RAB format for blitting into 565 */
1307static int copy_transl_565(void *dst, Uint32 *src, int n,
1308			   SDL_PixelFormat *sfmt, SDL_PixelFormat *dfmt)
1309{
1310    int i;
1311    Uint32 *d = dst;
1312    for(i = 0; i < n; i++) {
1313	unsigned r, g, b, a;
1314	Uint16 pix;
1315	RGBA_FROM_8888(*src, sfmt, r, g, b, a);
1316	PIXEL_FROM_RGB(pix, dfmt, r, g, b);
1317	*d = ((pix & 0x7e0) << 16) | (pix & 0xf81f) | ((a << 2) & 0x7e0);
1318	src++;
1319	d++;
1320    }
1321    return n * 4;
1322}
1323
1324/* encode 32bpp rgb + a into 32bpp G0RAB format for blitting into 555 */
1325static int copy_transl_555(void *dst, Uint32 *src, int n,
1326			   SDL_PixelFormat *sfmt, SDL_PixelFormat *dfmt)
1327{
1328    int i;
1329    Uint32 *d = dst;
1330    for(i = 0; i < n; i++) {
1331	unsigned r, g, b, a;
1332	Uint16 pix;
1333	RGBA_FROM_8888(*src, sfmt, r, g, b, a);
1334	PIXEL_FROM_RGB(pix, dfmt, r, g, b);
1335	*d = ((pix & 0x3e0) << 16) | (pix & 0xfc1f) | ((a << 2) & 0x3e0);
1336	src++;
1337	d++;
1338    }
1339    return n * 4;
1340}
1341
1342/* decode translucent pixels from 32bpp GORAB to 32bpp rgb + a */
1343static int uncopy_transl_16(Uint32 *dst, void *src, int n,
1344			    RLEDestFormat *sfmt, SDL_PixelFormat *dfmt)
1345{
1346    int i;
1347    Uint32 *s = src;
1348    for(i = 0; i < n; i++) {
1349	unsigned r, g, b, a;
1350	Uint32 pix = *s++;
1351	a = (pix & 0x3e0) >> 2;
1352	pix = (pix & ~0x3e0) | pix >> 16;
1353	RGB_FROM_PIXEL(pix, sfmt, r, g, b);
1354	PIXEL_FROM_RGBA(*dst, dfmt, r, g, b, a);
1355	dst++;
1356    }
1357    return n * 4;
1358}
1359
1360/* encode 32bpp rgba into 32bpp rgba, keeping alpha (dual purpose) */
1361static int copy_32(void *dst, Uint32 *src, int n,
1362		   SDL_PixelFormat *sfmt, SDL_PixelFormat *dfmt)
1363{
1364    int i;
1365    Uint32 *d = dst;
1366    for(i = 0; i < n; i++) {
1367	unsigned r, g, b, a;
1368	Uint32 pixel;
1369	RGBA_FROM_8888(*src, sfmt, r, g, b, a);
1370	PIXEL_FROM_RGB(pixel, dfmt, r, g, b);
1371	*d++ = pixel | a << 24;
1372	src++;
1373    }
1374    return n * 4;
1375}
1376
1377/* decode 32bpp rgba into 32bpp rgba, keeping alpha (dual purpose) */
1378static int uncopy_32(Uint32 *dst, void *src, int n,
1379		     RLEDestFormat *sfmt, SDL_PixelFormat *dfmt)
1380{
1381    int i;
1382    Uint32 *s = src;
1383    for(i = 0; i < n; i++) {
1384	unsigned r, g, b, a;
1385	Uint32 pixel = *s++;
1386	RGB_FROM_PIXEL(pixel, sfmt, r, g, b);
1387	a = pixel >> 24;
1388	PIXEL_FROM_RGBA(*dst, dfmt, r, g, b, a);
1389	dst++;
1390    }
1391    return n * 4;
1392}
1393
/*
 * Alpha classification of a source pixel. fmt is a pointer to a format
 * providing Amask and Ashift; the alpha is (pixel & Amask) >> Ashift.
 *
 * ISOPAQUE is true when the alpha equals 255.
 * ISTRANSL is true when the alpha lies in 1..254: subtracting 1 in
 * unsigned arithmetic wraps an alpha of 0 to a huge value, so a single
 * comparison rejects both 0 and 255.
 *
 * Both parameters are parenthesized so that any pointer expression
 * (for example &format) may be passed as fmt.
 */
#define ISOPAQUE(pixel, fmt) \
    ((((pixel) & (fmt)->Amask) >> (fmt)->Ashift) == 255)

#define ISTRANSL(pixel, fmt)	\
    ((unsigned)((((pixel) & (fmt)->Amask) >> (fmt)->Ashift) - 1U) < 254U)
1398
/*
 * Convert surface to be quickly alpha-blittable onto dest, if possible.
 *
 * The destination is taken from surface->map->dst. The source must be
 * 32 bits per pixel, and the destination either 16bpp in a 565 or 555
 * layout or 32bpp with colour masks that combine to 0x00ffffff.
 *
 * The encoded buffer starts with an RLEDestFormat header describing the
 * destination format. Each scan line is then stored twice: first its
 * opaque pixels (alpha 255) as skip/run segments, then its translucent
 * pixels (alpha 1..254) as skip/run segments. A zero/zero opaque count
 * pair written after the last non-blank line terminates the stream.
 *
 * On success the buffer is stored in surface->map->sw_data->aux_data and
 * the original pixels are freed unless the surface is SDL_PREALLOC or
 * SDL_HWSURFACE.
 *
 * Returns 0 on success, -1 when there is no mapped destination, when the
 * formats are unsupported, or when the buffer cannot be allocated (after
 * calling SDL_OutOfMemory()).
 */
static int RLEAlphaSurface(SDL_Surface *surface)
{
    SDL_Surface *dest;
    SDL_PixelFormat *df;
    int maxsize = 0;
    int max_opaque_run;
    int max_transl_run = 65535;
    unsigned masksum;
    Uint8 *rlebuf, *dst;
    int (*copy_opaque)(void *, Uint32 *, int,
		       SDL_PixelFormat *, SDL_PixelFormat *);
    int (*copy_transl)(void *, Uint32 *, int,
		       SDL_PixelFormat *, SDL_PixelFormat *);

    dest = surface->map->dst;
    if(!dest)
	return -1;
    df = dest->format;
    if(surface->format->BitsPerPixel != 32)
	return -1;		/* only 32bpp source supported */

    /* find out whether the destination is one we support,
       and determine the max size of the encoded result */
    masksum = df->Rmask | df->Gmask | df->Bmask;
    switch(df->BytesPerPixel) {
    case 2:
	/* 16bpp: only support 565 and 555 formats */
	switch(masksum) {
	case 0xffff:
	    if(df->Gmask == 0x07e0
	       || df->Rmask == 0x07e0 || df->Bmask == 0x07e0) {
		copy_opaque = copy_opaque_16;
		copy_transl = copy_transl_565;
	    } else
		return -1;
	    break;
	case 0x7fff:
	    if(df->Gmask == 0x03e0
	       || df->Rmask == 0x03e0 || df->Bmask == 0x03e0) {
		copy_opaque = copy_opaque_16;
		copy_transl = copy_transl_555;
	    } else
		return -1;
	    break;
	default:
	    return -1;
	}
	max_opaque_run = 255;	/* runs stored as bytes */

	/* worst case is alternating opaque and translucent pixels,
	   with room for alignment padding between lines */
	maxsize = surface->h * (2 + (4 + 2) * (surface->w + 1)) + 2;
	break;
    case 4:
	if(masksum != 0x00ffffff)
	    return -1;		/* requires unused high byte */
	copy_opaque = copy_32;
	copy_transl = copy_32;
	/* Opaque counts are written as Uint16 for this depth (see
	   ADD_OPAQUE_COUNTS below), yet runs are still split at 255.
	   NOTE(review): a larger limit looks possible but would have to
	   be checked against the maxsize bound -- confirm before
	   changing. */
	max_opaque_run = 255;

	/* worst case is alternating opaque and translucent pixels */
	maxsize = surface->h * 2 * 4 * (surface->w + 1) + 4;
	break;
    default:
	return -1;		/* anything else unsupported right now */
    }

    maxsize += sizeof(RLEDestFormat);
    rlebuf = (Uint8 *)SDL_malloc(maxsize);
    if(!rlebuf) {
	SDL_OutOfMemory();
	return -1;
    }
    {
	/* save the destination format so we can undo the encoding later */
	RLEDestFormat *r = (RLEDestFormat *)rlebuf;
	r->BytesPerPixel = df->BytesPerPixel;
	r->Rloss = df->Rloss;
	r->Gloss = df->Gloss;
	r->Bloss = df->Bloss;
	r->Rshift = df->Rshift;
	r->Gshift = df->Gshift;
	r->Bshift = df->Bshift;
	r->Ashift = df->Ashift;
	r->Rmask = df->Rmask;
	r->Gmask = df->Gmask;
	r->Bmask = df->Bmask;
	r->Amask = df->Amask;
    }
    /* the encoded stream starts right after the header */
    dst = rlebuf + sizeof(RLEDestFormat);

    /* Do the actual encoding */
    {
	int x, y;
	int h = surface->h, w = surface->w;
	SDL_PixelFormat *sf = surface->format;
	Uint32 *src = (Uint32 *)surface->pixels;
	Uint8 *lastline = dst;	/* end of last non-blank line */

	/* opaque counts are 8 or 16 bits, depending on target depth */
#define ADD_OPAQUE_COUNTS(n, m)			\
	if(df->BytesPerPixel == 4) {		\
	    ((Uint16 *)dst)[0] = n;		\
	    ((Uint16 *)dst)[1] = m;		\
	    dst += 4;				\
	} else {				\
	    dst[0] = n;				\
	    dst[1] = m;				\
	    dst += 2;				\
	}

	/* translucent counts are always 16 bit */
#define ADD_TRANSL_COUNTS(n, m)		\
	(((Uint16 *)dst)[0] = n, ((Uint16 *)dst)[1] = m, dst += 4)

	for(y = 0; y < h; y++) {
	    int runstart, skipstart;
	    /* set when the line has no opaque pixel; cleared again below
	       if it turns out to have translucent ones */
	    int blankline = 0;
	    /* First encode all opaque pixels of a scan line */
	    x = 0;
	    do {
		int run, skip, len;
		skipstart = x;
		while(x < w && !ISOPAQUE(src[x], sf))
		    x++;
		runstart = x;
		while(x < w && ISOPAQUE(src[x], sf))
		    x++;
		skip = runstart - skipstart;
		if(skip == w)
		    blankline = 1;
		run = x - runstart;
		/* a skip too long for one count is split into
		   maximum-length skips carrying an empty run */
		while(skip > max_opaque_run) {
		    ADD_OPAQUE_COUNTS(max_opaque_run, 0);
		    skip -= max_opaque_run;
		}
		len = MIN(run, max_opaque_run);
		ADD_OPAQUE_COUNTS(skip, len);
		dst += copy_opaque(dst, src + runstart, len, sf, df);
		runstart += len;
		run -= len;
		/* the rest of an over-long run follows as zero-skip
		   segments */
		while(run) {
		    len = MIN(run, max_opaque_run);
		    ADD_OPAQUE_COUNTS(0, len);
		    dst += copy_opaque(dst, src + runstart, len, sf, df);
		    runstart += len;
		    run -= len;
		}
	    } while(x < w);

	    /* Make sure the next output address is 32-bit aligned */
	    dst += (uintptr_t)dst & 2;

	    /* Next, encode all translucent pixels of the same scan line */
	    x = 0;
	    do {
		int run, skip, len;
		skipstart = x;
		while(x < w && !ISTRANSL(src[x], sf))
		    x++;
		runstart = x;
		while(x < w && ISTRANSL(src[x], sf))
		    x++;
		skip = runstart - skipstart;
		blankline &= (skip == w);
		run = x - runstart;
		while(skip > max_transl_run) {
		    ADD_TRANSL_COUNTS(max_transl_run, 0);
		    skip -= max_transl_run;
		}
		len = MIN(run, max_transl_run);
		ADD_TRANSL_COUNTS(skip, len);
		dst += copy_transl(dst, src + runstart, len, sf, df);
		runstart += len;
		run -= len;
		while(run) {
		    len = MIN(run, max_transl_run);
		    ADD_TRANSL_COUNTS(0, len);
		    dst += copy_transl(dst, src + runstart, len, sf, df);
		    runstart += len;
		    run -= len;
		}
		if(!blankline)
		    lastline = dst;
	    } while(x < w);

	    /* pitch is in bytes, src steps in 32-bit pixels */
	    src += surface->pitch >> 2;
	}
	dst = lastline;		/* back up past trailing blank lines */
	ADD_OPAQUE_COUNTS(0, 0);	/* end-of-stream marker */
    }

#undef ADD_OPAQUE_COUNTS
#undef ADD_TRANSL_COUNTS

    /* Now that we have it encoded, release the original pixels */
    if((surface->flags & SDL_PREALLOC) != SDL_PREALLOC
       && (surface->flags & SDL_HWSURFACE) != SDL_HWSURFACE) {
	SDL_free( surface->pixels );
	surface->pixels = NULL;
    }

    /* realloc the buffer to release unused memory */
    {
	/* if the shrink fails, keep using the original buffer */
	Uint8 *p = SDL_realloc(rlebuf, dst - rlebuf);
	if(!p)
	    p = rlebuf;
	surface->map->sw_data->aux_data = p;
    }

    return 0;
}
1612
1613static Uint32 getpix_8(Uint8 *srcbuf)
1614{
1615    return *srcbuf;
1616}
1617
1618static Uint32 getpix_16(Uint8 *srcbuf)
1619{
1620    return *(Uint16 *)srcbuf;
1621}
1622
1623static Uint32 getpix_24(Uint8 *srcbuf)
1624{
1625#if SDL_BYTEORDER == SDL_LIL_ENDIAN
1626    return srcbuf[0] + (srcbuf[1] << 8) + (srcbuf[2] << 16);
1627#else
1628    return (srcbuf[0] << 16) + (srcbuf[1] << 8) + srcbuf[2];
1629#endif
1630}
1631
1632static Uint32 getpix_32(Uint8 *srcbuf)
1633{
1634    return *(Uint32 *)srcbuf;
1635}
1636
1637typedef Uint32 (*getpix_func)(Uint8 *);
1638
1639static getpix_func getpixes[4] = {
1640    getpix_8, getpix_16, getpix_24, getpix_32
1641};
1642
/*
 * RLE-encode a colorkeyed surface.
 *
 * Each scan line is turned into skip/run/pixels segments: skip counts
 * pixels equal to the colorkey (compared with the alpha bits masked
 * off), run counts the pixels that follow and are copied verbatim.
 * Counts are Uint16 for 4-byte pixels and single bytes otherwise (see
 * ADD_COUNTS). A zero/zero count pair written after the last non-blank
 * line terminates the stream.
 *
 * On success the buffer is stored in surface->map->sw_data->aux_data and
 * the original pixels are freed unless the surface is SDL_PREALLOC or
 * SDL_HWSURFACE.
 *
 * Returns 0 on success, -1 if the buffer cannot be allocated (after
 * calling SDL_OutOfMemory()).
 *
 * NOTE(review): BytesPerPixel is presumably always 1..4 here; the size
 * switch has no default and getpixes[] is indexed with bpp - 1 -- confirm
 * against the caller.
 */
static int RLEColorkeySurface(SDL_Surface *surface)
{
        Uint8 *rlebuf, *dst;
	int maxn;
	int y;
	Uint8 *srcbuf, *lastline;
	int maxsize = 0;
	int bpp = surface->format->BytesPerPixel;
	getpix_func getpix;
	Uint32 ckey, rgbmask;
	int w, h;

	/* calculate the worst case size for the compressed surface */
	switch(bpp) {
	case 1:
	    /* worst case is alternating opaque and transparent pixels,
	       starting with an opaque pixel */
	    maxsize = surface->h * 3 * (surface->w / 2 + 1) + 2;
	    break;
	case 2:
	case 3:
	    /* worst case is solid runs, at most 255 pixels wide */
	    maxsize = surface->h * (2 * (surface->w / 255 + 1)
				    + surface->w * bpp) + 2;
	    break;
	case 4:
	    /* worst case is solid runs, at most 65535 pixels wide */
	    maxsize = surface->h * (4 * (surface->w / 65535 + 1)
				    + surface->w * 4) + 4;
	    break;
	}

	rlebuf = (Uint8 *)SDL_malloc(maxsize);
	if ( rlebuf == NULL ) {
		SDL_OutOfMemory();
		return(-1);
	}

	/* Set up the conversion */
	srcbuf = (Uint8 *)surface->pixels;
	maxn = bpp == 4 ? 65535 : 255;	/* largest value one count can hold */
	dst = rlebuf;
	/* the colorkey comparison ignores the alpha bits */
	rgbmask = ~surface->format->Amask;
	ckey = surface->format->colorkey & rgbmask;
	lastline = dst;
	getpix = getpixes[bpp - 1];
	w = surface->w;
	h = surface->h;

	/* counts are 16 bits for 4-byte pixels, 8 bits otherwise */
#define ADD_COUNTS(n, m)			\
	if(bpp == 4) {				\
	    ((Uint16 *)dst)[0] = n;		\
	    ((Uint16 *)dst)[1] = m;		\
	    dst += 4;				\
	} else {				\
	    dst[0] = n;				\
	    dst[1] = m;				\
	    dst += 2;				\
	}

	for(y = 0; y < h; y++) {
	    int x = 0;
	    int blankline = 0;	/* set when the whole line is colorkey */
	    do {
		int run, skip, len;
		int runstart;
		int skipstart = x;

		/* find run of transparent, then opaque pixels */
		while(x < w && (getpix(srcbuf + x * bpp) & rgbmask) == ckey)
		    x++;
		runstart = x;
		while(x < w && (getpix(srcbuf + x * bpp) & rgbmask) != ckey)
		    x++;
		skip = runstart - skipstart;
		if(skip == w)
		    blankline = 1;
		run = x - runstart;

		/* encode segment */
		/* a skip too long for one count is split into
		   maximum-length skips carrying an empty run */
		while(skip > maxn) {
		    ADD_COUNTS(maxn, 0);
		    skip -= maxn;
		}
		len = MIN(run, maxn);
		ADD_COUNTS(skip, len);
		SDL_memcpy(dst, srcbuf + runstart * bpp, len * bpp);
		dst += len * bpp;
		run -= len;
		runstart += len;
		/* the rest of an over-long run follows as zero-skip
		   segments */
		while(run) {
		    len = MIN(run, maxn);
		    ADD_COUNTS(0, len);
		    SDL_memcpy(dst, srcbuf + runstart * bpp, len * bpp);
		    dst += len * bpp;
		    runstart += len;
		    run -= len;
		}
		if(!blankline)
		    lastline = dst;
	    } while(x < w);

	    srcbuf += surface->pitch;
	}
	dst = lastline;		/* back up past trailing blank lines */
	ADD_COUNTS(0, 0);	/* end-of-stream marker */

#undef ADD_COUNTS

	/* Now that we have it encoded, release the original pixels */
	if((surface->flags & SDL_PREALLOC) != SDL_PREALLOC
	   && (surface->flags & SDL_HWSURFACE) != SDL_HWSURFACE) {
	    SDL_free( surface->pixels );
	    surface->pixels = NULL;
	}

	/* realloc the buffer to release unused memory */
	{
	    /* If realloc returns NULL, the original block is left intact */
	    Uint8 *p = SDL_realloc(rlebuf, dst - rlebuf);
	    if(!p)
		p = rlebuf;
	    surface->map->sw_data->aux_data = p;
	}

	return(0);
}
1770
1771int SDL_RLESurface(SDL_Surface *surface)
1772{
1773	int retcode;
1774
1775	/* Clear any previous RLE conversion */
1776	if ( (surface->flags & SDL_RLEACCEL) == SDL_RLEACCEL ) {
1777		SDL_UnRLESurface(surface, 1);
1778	}
1779
1780	/* We don't support RLE encoding of bitmaps */
1781	if ( surface->format->BitsPerPixel < 8 ) {
1782		return(-1);
1783	}
1784
1785	/* Lock the surface if it's in hardware */
1786	if ( SDL_MUSTLOCK(surface) ) {
1787		if ( SDL_LockSurface(surface) < 0 ) {
1788			return(-1);
1789		}
1790	}
1791
1792	/* Encode */
1793	if((surface->flags & SDL_SRCCOLORKEY) == SDL_SRCCOLORKEY) {
1794	    retcode = RLEColorkeySurface(surface);
1795	} else {
1796	    if((surface->flags & SDL_SRCALPHA) == SDL_SRCALPHA
1797	       && surface->format->Amask != 0)
1798		retcode = RLEAlphaSurface(surface);
1799	    else
1800		retcode = -1;	/* no RLE for per-surface alpha sans ckey */
1801	}
1802
1803	/* Unlock the surface if it's in hardware */
1804	if ( SDL_MUSTLOCK(surface) ) {
1805		SDL_UnlockSurface(surface);
1806	}
1807
1808	if(retcode < 0)
1809	    return -1;
1810
1811	/* The surface is now accelerated */
1812	surface->flags |= SDL_RLEACCEL;
1813
1814	return(0);
1815}
1816
1817/*
1818 * Un-RLE a surface with pixel alpha
1819 * This may not give back exactly the image before RLE-encoding; all
1820 * completely transparent pixels will be lost, and colour and alpha depth
1821 * may have been reduced (when encoding for 16bpp targets).
1822 */
1823static SDL_bool UnRLEAlpha(SDL_Surface *surface)
1824{
1825    Uint8 *srcbuf;
1826    Uint32 *dst;
1827    SDL_PixelFormat *sf = surface->format;
1828    RLEDestFormat *df = surface->map->sw_data->aux_data;
1829    int (*uncopy_opaque)(Uint32 *, void *, int,
1830			 RLEDestFormat *, SDL_PixelFormat *);
1831    int (*uncopy_transl)(Uint32 *, void *, int,
1832			 RLEDestFormat *, SDL_PixelFormat *);
1833    int w = surface->w;
1834    int bpp = df->BytesPerPixel;
1835
1836    if(bpp == 2) {
1837	uncopy_opaque = uncopy_opaque_16;
1838	uncopy_transl = uncopy_transl_16;
1839    } else {
1840	uncopy_opaque = uncopy_transl = uncopy_32;
1841    }
1842
1843    surface->pixels = SDL_malloc(surface->h * surface->pitch);
1844    if ( !surface->pixels ) {
1845        return(SDL_FALSE);
1846    }
1847    /* fill background with transparent pixels */
1848    SDL_memset(surface->pixels, 0, surface->h * surface->pitch);
1849
1850    dst = surface->pixels;
1851    srcbuf = (Uint8 *)(df + 1);
1852    for(;;) {
1853	/* copy opaque pixels */
1854	int ofs = 0;
1855	do {
1856	    unsigned run;
1857	    if(bpp == 2) {
1858		ofs += srcbuf[0];
1859		run = srcbuf[1];
1860		srcbuf += 2;
1861	    } else {
1862		ofs += ((Uint16 *)srcbuf)[0];
1863		run = ((Uint16 *)srcbuf)[1];
1864		srcbuf += 4;
1865	    }
1866	    if(run) {
1867		srcbuf += uncopy_opaque(dst + ofs, srcbuf, run, df, sf);
1868		ofs += run;
1869	    } else if(!ofs)
1870		return(SDL_TRUE);
1871	} while(ofs < w);
1872
1873	/* skip padding if needed */
1874	if(bpp == 2)
1875	    srcbuf += (uintptr_t)srcbuf & 2;
1876
1877	/* copy translucent pixels */
1878	ofs = 0;
1879	do {
1880	    unsigned run;
1881	    ofs += ((Uint16 *)srcbuf)[0];
1882	    run = ((Uint16 *)srcbuf)[1];
1883	    srcbuf += 4;
1884	    if(run) {
1885		srcbuf += uncopy_transl(dst + ofs, srcbuf, run, df, sf);
1886		ofs += run;
1887	    }
1888	} while(ofs < w);
1889	dst += surface->pitch >> 2;
1890    }
1891    /* Make the compiler happy */
1892    return(SDL_TRUE);
1893}
1894
1895void SDL_UnRLESurface(SDL_Surface *surface, int recode)
1896{
1897    if ( (surface->flags & SDL_RLEACCEL) == SDL_RLEACCEL ) {
1898	surface->flags &= ~SDL_RLEACCEL;
1899
1900	if(recode && (surface->flags & SDL_PREALLOC) != SDL_PREALLOC
1901	   && (surface->flags & SDL_HWSURFACE) != SDL_HWSURFACE) {
1902	    if((surface->flags & SDL_SRCCOLORKEY) == SDL_SRCCOLORKEY) {
1903		SDL_Rect full;
1904		unsigned alpha_flag;
1905
1906		/* re-create the original surface */
1907		surface->pixels = SDL_malloc(surface->h * surface->pitch);
1908		if ( !surface->pixels ) {
1909			/* Oh crap... */
1910			surface->flags |= SDL_RLEACCEL;
1911			return;
1912		}
1913
1914		/* fill it with the background colour */
1915		SDL_FillRect(surface, NULL, surface->format->colorkey);
1916
1917		/* now render the encoded surface */
1918		full.x = full.y = 0;
1919		full.w = surface->w;
1920		full.h = surface->h;
1921		alpha_flag = surface->flags & SDL_SRCALPHA;
1922		surface->flags &= ~SDL_SRCALPHA; /* opaque blit */
1923		SDL_RLEBlit(surface, &full, surface, &full);
1924		surface->flags |= alpha_flag;
1925	    } else {
1926		if ( !UnRLEAlpha(surface) ) {
1927		    /* Oh crap... */
1928		    surface->flags |= SDL_RLEACCEL;
1929		    return;
1930		}
1931	    }
1932	}
1933
1934	if ( surface->map && surface->map->sw_data->aux_data ) {
1935	    SDL_free(surface->map->sw_data->aux_data);
1936	    surface->map->sw_data->aux_data = NULL;
1937	}
1938    }
1939}
1940
1941
1942