Surface.cpp revision 19bac1e08be200c31efd26f0f5fd144c9b3eefd3
1// SwiftShader Software Renderer
2//
3// Copyright(c) 2005-2012 TransGaming Inc.
4//
5// All rights reserved. No part of this software may be copied, distributed, transmitted,
6// transcribed, stored in a retrieval system, translated into any human or computer
7// language by any means, or disclosed to third parties without the explicit written
8// agreement of TransGaming Inc. Without such an agreement, no rights or licenses, express
9// or implied, including but not limited to any patent rights, are granted to you.
10//
11
12#include "Surface.hpp"
13
14#include "Color.hpp"
15#include "Context.hpp"
16#include "Renderer.hpp"
17#include "Common/Half.hpp"
18#include "Common/Memory.hpp"
19#include "Common/CPUID.hpp"
20#include "Common/Resource.hpp"
21#include "Common/Debug.hpp"
22#include "Reactor/Reactor.hpp"
23
24#include <xmmintrin.h>
25#include <emmintrin.h>
26
27#undef min
28#undef max
29
30namespace sw
31{
32	extern bool quadLayoutEnabled;
33	extern bool complementaryDepthBuffer;
34	extern TranscendentalPrecision logPrecision;
35
36	unsigned int *Surface::palette = 0;
37	unsigned int Surface::paletteID = 0;
38
39	void Rect::clip(int minX, int minY, int maxX, int maxY)
40	{
41		x0 = sw::clamp(x0, minX, maxX);
42		y0 = sw::clamp(y0, minY, maxY);
43		x1 = sw::clamp(x1, minX, maxX);
44		y1 = sw::clamp(y1, minY, maxY);
45	}
46
47	void Surface::Buffer::write(int x, int y, int z, const Color<float> &color)
48	{
49		void *element = (unsigned char*)buffer + x * bytes + y * pitchB + z * sliceB;
50
51		write(element, color);
52	}
53
54	void Surface::Buffer::write(int x, int y, const Color<float> &color)
55	{
56		void *element = (unsigned char*)buffer + x * bytes + y * pitchB;
57
58		write(element, color);
59	}
60
61	inline void Surface::Buffer::write(void *element, const Color<float> &color)
62	{
63		switch(format)
64		{
65		case FORMAT_A8:
66			*(unsigned char*)element = unorm<8>(color.a);
67			break;
68		case FORMAT_R8:
69			*(unsigned char*)element = unorm<8>(color.r);
70			break;
71		case FORMAT_R3G3B2:
72			*(unsigned char*)element = (unorm<3>(color.r) << 5) | (unorm<3>(color.g) << 2) | (unorm<2>(color.b) << 0);
73			break;
74		case FORMAT_A8R3G3B2:
75			*(unsigned short*)element = (unorm<8>(color.a) << 8) | (unorm<3>(color.r) << 5) | (unorm<3>(color.g) << 2) | (unorm<2>(color.b) << 0);
76			break;
77		case FORMAT_X4R4G4B4:
78			*(unsigned short*)element = 0xF000 | (unorm<4>(color.r) << 8) | (unorm<4>(color.g) << 4) | (unorm<4>(color.b) << 0);
79			break;
80		case FORMAT_A4R4G4B4:
81			*(unsigned short*)element = (unorm<4>(color.a) << 12) | (unorm<4>(color.r) << 8) | (unorm<4>(color.g) << 4) | (unorm<4>(color.b) << 0);
82			break;
83		case FORMAT_R5G6B5:
84			*(unsigned short*)element = (unorm<5>(color.r) << 11) | (unorm<6>(color.g) << 5) | (unorm<5>(color.b) << 0);
85			break;
86		case FORMAT_A1R5G5B5:
87			*(unsigned short*)element = (unorm<1>(color.a) << 15) | (unorm<5>(color.r) << 10) | (unorm<5>(color.g) << 5) | (unorm<5>(color.b) << 0);
88			break;
89		case FORMAT_X1R5G5B5:
90			*(unsigned short*)element = 0x8000 | (unorm<5>(color.r) << 10) | (unorm<5>(color.g) << 5) | (unorm<5>(color.b) << 0);
91			break;
92		case FORMAT_A8R8G8B8:
93			*(unsigned int*)element = (unorm<8>(color.a) << 24) | (unorm<8>(color.r) << 16) | (unorm<8>(color.g) << 8) | (unorm<8>(color.b) << 0);
94			break;
95		case FORMAT_X8R8G8B8:
96			*(unsigned int*)element = 0xFF000000 | (unorm<8>(color.r) << 16) | (unorm<8>(color.g) << 8) | (unorm<8>(color.b) << 0);
97			break;
98		case FORMAT_A8B8G8R8:
99			*(unsigned int*)element = (unorm<8>(color.a) << 24) | (unorm<8>(color.b) << 16) | (unorm<8>(color.g) << 8) | (unorm<8>(color.r) << 0);
100			break;
101		case FORMAT_X8B8G8R8:
102			*(unsigned int*)element = 0xFF000000 | (unorm<8>(color.b) << 16) | (unorm<8>(color.g) << 8) | (unorm<8>(color.r) << 0);
103			break;
104		case FORMAT_A2R10G10B10:
105			*(unsigned int*)element = (unorm<2>(color.a) << 30) | (unorm<10>(color.r) << 20) | (unorm<10>(color.g) << 10) | (unorm<10>(color.b) << 0);
106			break;
107		case FORMAT_A2B10G10R10:
108			*(unsigned int*)element = (unorm<2>(color.a) << 30) | (unorm<10>(color.b) << 20) | (unorm<10>(color.g) << 10) | (unorm<10>(color.r) << 0);
109			break;
110		case FORMAT_G8R8:
111			*(unsigned int*)element = (unorm<8>(color.g) << 8) | (unorm<8>(color.r) << 0);
112			break;
113		case FORMAT_G16R16:
114			*(unsigned int*)element = (unorm<16>(color.g) << 16) | (unorm<16>(color.r) << 0);
115			break;
116		case FORMAT_A16B16G16R16:
117			((unsigned short*)element)[0] = unorm<16>(color.r);
118			((unsigned short*)element)[1] = unorm<16>(color.g);
119			((unsigned short*)element)[2] = unorm<16>(color.b);
120			((unsigned short*)element)[3] = unorm<16>(color.a);
121			break;
122		case FORMAT_V8U8:
123			*(unsigned short*)element = (snorm<8>(color.g) << 8) | (snorm<8>(color.r) << 0);
124			break;
125		case FORMAT_L6V5U5:
126			*(unsigned short*)element = (unorm<6>(color.b) << 10) | (snorm<5>(color.g) << 5) | (snorm<5>(color.r) << 0);
127			break;
128		case FORMAT_Q8W8V8U8:
129			*(unsigned int*)element = (snorm<8>(color.a) << 24) | (snorm<8>(color.b) << 16) | (snorm<8>(color.g) << 8) | (snorm<8>(color.r) << 0);
130			break;
131		case FORMAT_X8L8V8U8:
132			*(unsigned int*)element = 0xFF000000 | (unorm<8>(color.b) << 16) | (snorm<8>(color.g) << 8) | (snorm<8>(color.r) << 0);
133			break;
134		case FORMAT_V16U16:
135			*(unsigned int*)element = (snorm<16>(color.g) << 16) | (snorm<16>(color.r) << 0);
136			break;
137		case FORMAT_A2W10V10U10:
138			*(unsigned int*)element = (unorm<2>(color.a) << 30) | (snorm<10>(color.b) << 20) | (snorm<10>(color.g) << 10) | (snorm<10>(color.r) << 0);
139			break;
140		case FORMAT_A16W16V16U16:
141			((unsigned short*)element)[0] = snorm<16>(color.r);
142			((unsigned short*)element)[1] = snorm<16>(color.g);
143			((unsigned short*)element)[2] = snorm<16>(color.b);
144			((unsigned short*)element)[3] = unorm<16>(color.a);
145			break;
146		case FORMAT_Q16W16V16U16:
147			((unsigned short*)element)[0] = snorm<16>(color.r);
148			((unsigned short*)element)[1] = snorm<16>(color.g);
149			((unsigned short*)element)[2] = snorm<16>(color.b);
150			((unsigned short*)element)[3] = snorm<16>(color.a);
151			break;
152		case FORMAT_R8G8B8:
153			((unsigned char*)element)[0] = unorm<8>(color.b);
154			((unsigned char*)element)[1] = unorm<8>(color.g);
155			((unsigned char*)element)[2] = unorm<8>(color.r);
156			break;
157		case FORMAT_R16F:
158			*(half*)element = (half)color.r;
159			break;
160		case FORMAT_G16R16F:
161			((half*)element)[0] = (half)color.r;
162			((half*)element)[1] = (half)color.g;
163			break;
164		case FORMAT_A16B16G16R16F:
165			((half*)element)[0] = (half)color.r;
166			((half*)element)[1] = (half)color.g;
167			((half*)element)[2] = (half)color.b;
168			((half*)element)[3] = (half)color.a;
169			break;
170		case FORMAT_R32F:
171			*(float*)element = color.r;
172			break;
173		case FORMAT_G32R32F:
174			((float*)element)[0] = color.r;
175			((float*)element)[1] = color.g;
176			break;
177		case FORMAT_A32B32G32R32F:
178			((float*)element)[0] = color.r;
179			((float*)element)[1] = color.g;
180			((float*)element)[2] = color.b;
181			((float*)element)[3] = color.a;
182			break;
183		case FORMAT_D32F:
184		case FORMAT_D32F_LOCKABLE:
185		case FORMAT_D32F_TEXTURE:
186		case FORMAT_D32F_SHADOW:
187			*((float*)element) = color.r;
188			break;
189		case FORMAT_D32F_COMPLEMENTARY:
190			*((float*)element) = 1 - color.r;
191			break;
192		case FORMAT_S8:
193			*((unsigned char*)element) = unorm<8>(color.r);
194			break;
195		case FORMAT_L8:
196			*(unsigned char*)element = unorm<8>(color.r);
197			break;
198		case FORMAT_A4L4:
199			*(unsigned char*)element = (unorm<4>(color.a) << 4) | (unorm<4>(color.r) << 0);
200			break;
201		case FORMAT_L16:
202			*(unsigned short*)element = unorm<16>(color.r);
203			break;
204		case FORMAT_A8L8:
205			*(unsigned short*)element = (unorm<8>(color.a) << 8) | (unorm<8>(color.r) << 0);
206			break;
207		default:
208			ASSERT(false);
209		}
210	}
211
212	Color<float> Surface::Buffer::read(int x, int y, int z) const
213	{
214		void *element = (unsigned char*)buffer + x * bytes + y * pitchB + z * sliceB;
215
216		return read(element);
217	}
218
219	Color<float> Surface::Buffer::read(int x, int y) const
220	{
221		void *element = (unsigned char*)buffer + x * bytes + y * pitchB;
222
223		return read(element);
224	}
225
226	inline Color<float> Surface::Buffer::read(void *element) const
227	{
228		float r = 1;
229		float g = 1;
230		float b = 1;
231		float a = 1;
232
233		switch(format)
234		{
235		case FORMAT_P8:
236			{
237				ASSERT(palette);
238
239				unsigned int abgr = palette[*(unsigned char*)element];
240
241				r = (abgr & 0x000000FF) * (1.0f / 0x000000FF);
242				g = (abgr & 0x0000FF00) * (1.0f / 0x0000FF00);
243				b = (abgr & 0x00FF0000) * (1.0f / 0x00FF0000);
244				a = (abgr & 0xFF000000) * (1.0f / 0xFF000000);
245			}
246			break;
247		case FORMAT_A8P8:
248			{
249				ASSERT(palette);
250
251				unsigned int bgr = palette[((unsigned char*)element)[0]];
252
253				r = (bgr & 0x000000FF) * (1.0f / 0x000000FF);
254				g = (bgr & 0x0000FF00) * (1.0f / 0x0000FF00);
255				b = (bgr & 0x00FF0000) * (1.0f / 0x00FF0000);
256				a = ((unsigned char*)element)[1] * (1.0f / 0xFF);
257			}
258			break;
259		case FORMAT_A8:
260			r = 0;
261			g = 0;
262			b = 0;
263			a = *(unsigned char*)element * (1.0f / 0xFF);
264			break;
265		case FORMAT_R8:
266			r = *(unsigned char*)element * (1.0f / 0xFF);
267			break;
268		case FORMAT_R3G3B2:
269			{
270				unsigned char rgb = *(unsigned char*)element;
271
272				r = (rgb & 0xE0) * (1.0f / 0xE0);
273				g = (rgb & 0x1C) * (1.0f / 0x1C);
274				b = (rgb & 0x03) * (1.0f / 0x03);
275			}
276			break;
277		case FORMAT_A8R3G3B2:
278			{
279				unsigned short argb = *(unsigned short*)element;
280
281				a = (argb & 0xFF00) * (1.0f / 0xFF00);
282				r = (argb & 0x00E0) * (1.0f / 0x00E0);
283				g = (argb & 0x001C) * (1.0f / 0x001C);
284				b = (argb & 0x0003) * (1.0f / 0x0003);
285			}
286			break;
287		case FORMAT_X4R4G4B4:
288			{
289				unsigned short rgb = *(unsigned short*)element;
290
291				r = (rgb & 0x0F00) * (1.0f / 0x0F00);
292				g = (rgb & 0x00F0) * (1.0f / 0x00F0);
293				b = (rgb & 0x000F) * (1.0f / 0x000F);
294			}
295			break;
296		case FORMAT_A4R4G4B4:
297			{
298				unsigned short argb = *(unsigned short*)element;
299
300				a = (argb & 0xF000) * (1.0f / 0xF000);
301				r = (argb & 0x0F00) * (1.0f / 0x0F00);
302				g = (argb & 0x00F0) * (1.0f / 0x00F0);
303				b = (argb & 0x000F) * (1.0f / 0x000F);
304			}
305			break;
306		case FORMAT_R5G6B5:
307			{
308				unsigned short rgb = *(unsigned short*)element;
309
310				r = (rgb & 0xF800) * (1.0f / 0xF800);
311				g = (rgb & 0x07E0) * (1.0f / 0x07E0);
312				b = (rgb & 0x001F) * (1.0f / 0x001F);
313			}
314			break;
315		case FORMAT_A1R5G5B5:
316			{
317				unsigned short argb = *(unsigned short*)element;
318
319				a = (argb & 0x8000) * (1.0f / 0x8000);
320				r = (argb & 0x7C00) * (1.0f / 0x7C00);
321				g = (argb & 0x03E0) * (1.0f / 0x03E0);
322				b = (argb & 0x001F) * (1.0f / 0x001F);
323			}
324			break;
325		case FORMAT_X1R5G5B5:
326			{
327				unsigned short xrgb = *(unsigned short*)element;
328
329				r = (xrgb & 0x7C00) * (1.0f / 0x7C00);
330				g = (xrgb & 0x03E0) * (1.0f / 0x03E0);
331				b = (xrgb & 0x001F) * (1.0f / 0x001F);
332			}
333			break;
334		case FORMAT_A8R8G8B8:
335			{
336				unsigned int argb = *(unsigned int*)element;
337
338				a = (argb & 0xFF000000) * (1.0f / 0xFF000000);
339				r = (argb & 0x00FF0000) * (1.0f / 0x00FF0000);
340				g = (argb & 0x0000FF00) * (1.0f / 0x0000FF00);
341				b = (argb & 0x000000FF) * (1.0f / 0x000000FF);
342			}
343			break;
344		case FORMAT_X8R8G8B8:
345			{
346				unsigned int xrgb = *(unsigned int*)element;
347
348				r = (xrgb & 0x00FF0000) * (1.0f / 0x00FF0000);
349				g = (xrgb & 0x0000FF00) * (1.0f / 0x0000FF00);
350				b = (xrgb & 0x000000FF) * (1.0f / 0x000000FF);
351			}
352			break;
353		case FORMAT_A8B8G8R8:
354			{
355				unsigned int abgr = *(unsigned int*)element;
356
357				a = (abgr & 0xFF000000) * (1.0f / 0xFF000000);
358				b = (abgr & 0x00FF0000) * (1.0f / 0x00FF0000);
359				g = (abgr & 0x0000FF00) * (1.0f / 0x0000FF00);
360				r = (abgr & 0x000000FF) * (1.0f / 0x000000FF);
361			}
362			break;
363		case FORMAT_X8B8G8R8:
364			{
365				unsigned int xbgr = *(unsigned int*)element;
366
367				b = (xbgr & 0x00FF0000) * (1.0f / 0x00FF0000);
368				g = (xbgr & 0x0000FF00) * (1.0f / 0x0000FF00);
369				r = (xbgr & 0x000000FF) * (1.0f / 0x000000FF);
370			}
371			break;
372		case FORMAT_G8R8:
373			{
374				unsigned short gr = *(unsigned short*)element;
375
376				g = (gr & 0xFF00) * (1.0f / 0xFF00);
377				r = (gr & 0x00FF) * (1.0f / 0x00FF);
378			}
379			break;
380		case FORMAT_G16R16:
381			{
382				unsigned int gr = *(unsigned int*)element;
383
384				g = (gr & 0xFFFF0000) * (1.0f / 0xFFFF0000);
385				r = (gr & 0x0000FFFF) * (1.0f / 0x0000FFFF);
386			}
387			break;
388		case FORMAT_A2R10G10B10:
389			{
390				unsigned int argb = *(unsigned int*)element;
391
392				a = (argb & 0xC0000000) * (1.0f / 0xC0000000);
393				r = (argb & 0x3FF00000) * (1.0f / 0x3FF00000);
394				g = (argb & 0x000FFC00) * (1.0f / 0x000FFC00);
395				b = (argb & 0x000003FF) * (1.0f / 0x000003FF);
396			}
397			break;
398		case FORMAT_A2B10G10R10:
399			{
400				unsigned int abgr = *(unsigned int*)element;
401
402				a = (abgr & 0xC0000000) * (1.0f / 0xC0000000);
403				b = (abgr & 0x3FF00000) * (1.0f / 0x3FF00000);
404				g = (abgr & 0x000FFC00) * (1.0f / 0x000FFC00);
405				r = (abgr & 0x000003FF) * (1.0f / 0x000003FF);
406			}
407			break;
408		case FORMAT_A16B16G16R16:
409			r = ((unsigned short*)element)[0] * (1.0f / 0xFFFF);
410			g = ((unsigned short*)element)[1] * (1.0f / 0xFFFF);
411			b = ((unsigned short*)element)[2] * (1.0f / 0xFFFF);
412			a = ((unsigned short*)element)[3] * (1.0f / 0xFFFF);
413			break;
414		case FORMAT_V8U8:
415			{
416				unsigned short vu = *(unsigned short*)element;
417
418				r = ((int)(vu & 0x00FF) << 24) * (1.0f / 0x7F000000);
419				g = ((int)(vu & 0xFF00) << 16) * (1.0f / 0x7F000000);
420			}
421			break;
422		case FORMAT_L6V5U5:
423			{
424				unsigned short lvu = *(unsigned short*)element;
425
426				r = ((int)(lvu & 0x001F) << 27) * (1.0f / 0x78000000);
427				g = ((int)(lvu & 0x03E0) << 22) * (1.0f / 0x78000000);
428				b = (lvu & 0xFC00) * (1.0f / 0xFC00);
429			}
430			break;
431		case FORMAT_Q8W8V8U8:
432			{
433				unsigned int qwvu = *(unsigned int*)element;
434
435				r = ((int)(qwvu & 0x000000FF) << 24) * (1.0f / 0x7F000000);
436				g = ((int)(qwvu & 0x0000FF00) << 16) * (1.0f / 0x7F000000);
437				b = ((int)(qwvu & 0x00FF0000) << 8)  * (1.0f / 0x7F000000);
438				a = ((int)(qwvu & 0xFF000000) << 0)  * (1.0f / 0x7F000000);
439			}
440			break;
441		case FORMAT_X8L8V8U8:
442			{
443				unsigned int xlvu = *(unsigned int*)element;
444
445				r = ((int)(xlvu & 0x000000FF) << 24) * (1.0f / 0x7F000000);
446				g = ((int)(xlvu & 0x0000FF00) << 16) * (1.0f / 0x7F000000);
447				b = (xlvu & 0x00FF0000) * (1.0f / 0x00FF0000);
448			}
449			break;
450		case FORMAT_R8G8B8:
451			r = ((unsigned char*)element)[2] * (1.0f / 0xFF);
452			g = ((unsigned char*)element)[1] * (1.0f / 0xFF);
453			b = ((unsigned char*)element)[0] * (1.0f / 0xFF);
454			break;
455		case FORMAT_V16U16:
456			{
457				unsigned int vu = *(unsigned int*)element;
458
459				r = ((int)(vu & 0x0000FFFF) << 16) * (1.0f / 0x7FFF0000);
460				g = ((int)(vu & 0xFFFF0000) << 0)  * (1.0f / 0x7FFF0000);
461			}
462			break;
463		case FORMAT_A2W10V10U10:
464			{
465				unsigned int awvu = *(unsigned int*)element;
466
467				r = ((int)(awvu & 0x000003FF) << 22) * (1.0f / 0x7FC00000);
468				g = ((int)(awvu & 0x000FFC00) << 12) * (1.0f / 0x7FC00000);
469				b = ((int)(awvu & 0x3FF00000) << 2)  * (1.0f / 0x7FC00000);
470				a = (awvu & 0xC0000000) * (1.0f / 0xC0000000);
471			}
472			break;
473		case FORMAT_A16W16V16U16:
474			r = ((signed short*)element)[0] * (1.0f / 0x7FFF);
475			g = ((signed short*)element)[1] * (1.0f / 0x7FFF);
476			b = ((signed short*)element)[2] * (1.0f / 0x7FFF);
477			a = ((unsigned short*)element)[3] * (1.0f / 0xFFFF);
478			break;
479		case FORMAT_Q16W16V16U16:
480			r = ((signed short*)element)[0] * (1.0f / 0x7FFF);
481			g = ((signed short*)element)[1] * (1.0f / 0x7FFF);
482			b = ((signed short*)element)[2] * (1.0f / 0x7FFF);
483			a = ((signed short*)element)[3] * (1.0f / 0x7FFF);
484			break;
485		case FORMAT_L8:
486			r =
487			g =
488			b = *(unsigned char*)element * (1.0f / 0xFF);
489			break;
490		case FORMAT_A4L4:
491			{
492				unsigned char al = *(unsigned char*)element;
493
494				r =
495				g =
496				b = (al & 0x0F) * (1.0f / 0x0F);
497				a = (al & 0xF0) * (1.0f / 0xF0);
498			}
499			break;
500		case FORMAT_L16:
501			r =
502			g =
503			b = *(unsigned short*)element * (1.0f / 0xFFFF);
504			break;
505		case FORMAT_A8L8:
506			r =
507			g =
508			b = ((unsigned char*)element)[0] * (1.0f / 0xFF);
509			a = ((unsigned char*)element)[1] * (1.0f / 0xFF);
510			break;
511		case FORMAT_R16F:
512			r = *(half*)element;
513			break;
514		case FORMAT_G16R16F:
515			r = ((half*)element)[0];
516			g = ((half*)element)[1];
517			break;
518		case FORMAT_A16B16G16R16F:
519			r = ((half*)element)[0];
520			g = ((half*)element)[1];
521			b = ((half*)element)[2];
522			a = ((half*)element)[3];
523			break;
524		case FORMAT_R32F:
525			r = *(float*)element;
526			break;
527		case FORMAT_G32R32F:
528			r = ((float*)element)[0];
529			g = ((float*)element)[1];
530			break;
531		case FORMAT_A32B32G32R32F:
532			r = ((float*)element)[0];
533			g = ((float*)element)[1];
534			b = ((float*)element)[2];
535			a = ((float*)element)[3];
536			break;
537		case FORMAT_D32F:
538		case FORMAT_D32F_LOCKABLE:
539		case FORMAT_D32F_TEXTURE:
540		case FORMAT_D32F_SHADOW:
541			r = *(float*)element;
542			g = r;
543			b = r;
544			a = r;
545			break;
546		case FORMAT_D32F_COMPLEMENTARY:
547			r = 1 - *(float*)element;
548			g = r;
549			b = r;
550			a = r;
551			break;
552		case FORMAT_S8:
553			r = *(unsigned char*)element * (1.0f / 0xFF);
554			break;
555		default:
556			ASSERT(false);
557		}
558
559	//	if(sRGB)
560	//	{
561	//		r = sRGBtoLinear(r);
562	//		g = sRGBtoLinear(g);
563	//		b = sRGBtoLinear(b);
564	//	}
565
566		return Color<float>(r, g, b, a);
567	}
568
569	Color<float> Surface::Buffer::sample(float x, float y, float z) const
570	{
571		x -= 0.5f;
572		y -= 0.5f;
573		z -= 0.5f;
574
575		int x0 = clamp((int)x, 0, width - 1);
576		int x1 = (x0 + 1 >= width) ? x0 : x0 + 1;
577
578		int y0 = clamp((int)y, 0, height - 1);
579		int y1 = (y0 + 1 >= height) ? y0 : y0 + 1;
580
581		int z0 = clamp((int)z, 0, depth - 1);
582		int z1 = (z0 + 1 >= depth) ? z0 : z0 + 1;
583
584		Color<float> c000 = read(x0, y0, z0);
585		Color<float> c100 = read(x1, y0, z0);
586		Color<float> c010 = read(x0, y1, z0);
587		Color<float> c110 = read(x1, y1, z0);
588		Color<float> c001 = read(x0, y0, z1);
589		Color<float> c101 = read(x1, y0, z1);
590		Color<float> c011 = read(x0, y1, z1);
591		Color<float> c111 = read(x1, y1, z1);
592
593		float fx = x - x0;
594		float fy = y - y0;
595		float fz = z - z0;
596
597		c000 *= (1 - fx) * (1 - fy) * (1 - fz);
598		c100 *= fx * (1 - fy) * (1 - fz);
599		c010 *= (1 - fx) * fy * (1 - fz);
600		c110 *= fx * fy * (1 - fz);
601		c001 *= (1 - fx) * (1 - fy) * fz;
602		c101 *= fx * (1 - fy) * fz;
603		c011 *= (1 - fx) * fy * fz;
604		c111 *= fx * fy * fz;
605
606		return c000 + c100 + c010 + c110 + c001 + c101 + c011 + c111;
607	}
608
609	Color<float> Surface::Buffer::sample(float x, float y) const
610	{
611		x -= 0.5f;
612		y -= 0.5f;
613
614		int x0 = clamp((int)x, 0, width - 1);
615		int x1 = (x0 + 1 >= width) ? x0 : x0 + 1;
616
617		int y0 = clamp((int)y, 0, height - 1);
618		int y1 = (y0 + 1 >= height) ? y0 : y0 + 1;
619
620		Color<float> c00 = read(x0, y0);
621		Color<float> c10 = read(x1, y0);
622		Color<float> c01 = read(x0, y1);
623		Color<float> c11 = read(x1, y1);
624
625		float fx = x - x0;
626		float fy = y - y0;
627
628		c00 *= (1 - fx) * (1 - fy);
629		c10 *= fx * (1 - fy);
630		c01 *= (1 - fx) * fy;
631		c11 *= fx * fy;
632
633		return c00 + c10 + c01 + c11;
634	}
635
636	void *Surface::Buffer::lockRect(int x, int y, int z, Lock lock)
637	{
638		this->lock = lock;
639
640		switch(lock)
641		{
642		case LOCK_UNLOCKED:
643		case LOCK_READONLY:
644			break;
645		case LOCK_WRITEONLY:
646		case LOCK_READWRITE:
647		case LOCK_DISCARD:
648			dirty = true;
649			break;
650		default:
651			ASSERT(false);
652		}
653
654		switch(format)
655		{
656		#if S3TC_SUPPORT
657		case FORMAT_DXT1:
658		case FORMAT_ATI1:
659			return (unsigned char*)buffer + 8 * (x / 4) + (y / 4) * pitchB + z * sliceB;
660		case FORMAT_DXT3:
661		case FORMAT_DXT5:
662		case FORMAT_ATI2:
663			return (unsigned char*)buffer + 16 * (x / 4) + (y / 4) * pitchB + z * sliceB;
664		#endif
665		default:
666			return (unsigned char*)buffer + x * bytes + y * pitchB + z * sliceB;
667		}
668
669		return 0;
670	}
671
672	void Surface::Buffer::unlockRect()
673	{
674		lock = LOCK_UNLOCKED;
675	}
676
677	Surface::Surface(Resource *texture, int width, int height, int depth, Format format, bool lockable, bool renderTarget) : lockable(lockable), renderTarget(renderTarget)
678	{
679		resource = texture ? texture : new Resource(0);
680		hasParent = texture != 0;
681		depth = max(1, depth);
682
683		external.buffer = 0;
684		external.width = width;
685		external.height = height;
686		external.depth = depth;
687		external.format = format;
688		external.bytes = bytes(external.format);
689		external.pitchB = pitchB(external.width, external.format, renderTarget && !texture);
690		external.pitchP = pitchP(external.width, external.format, renderTarget && !texture);
691		external.sliceB = sliceB(external.width, external.height, external.format, renderTarget && !texture);
692		external.sliceP = sliceP(external.width, external.height, external.format, renderTarget && !texture);
693		external.lock = LOCK_UNLOCKED;
694		external.dirty = false;
695		external.paletteUsed = 0;
696
697		internal.buffer = 0;
698		internal.width = width;
699		internal.height = height;
700		internal.depth = depth;
701		internal.format = selectInternalFormat(format);
702		internal.bytes = bytes(internal.format);
703		internal.pitchB = pitchB(internal.width, internal.format, renderTarget);
704		internal.pitchP = pitchP(internal.width, internal.format, renderTarget);
705		internal.sliceB = sliceB(internal.width, internal.height, internal.format, renderTarget);
706		internal.sliceP = sliceP(internal.width, internal.height, internal.format, renderTarget);
707		internal.lock = LOCK_UNLOCKED;
708		internal.dirty = false;
709		internal.paletteUsed = 0;
710
711		stencil.buffer = 0;
712		stencil.width = width;
713		stencil.height = height;
714		stencil.depth = depth;
715		stencil.format = FORMAT_S8;
716		stencil.bytes = bytes(stencil.format);
717		stencil.pitchB = pitchB(stencil.width, stencil.format, renderTarget);
718		stencil.pitchP = pitchP(stencil.width, stencil.format, renderTarget);
719		stencil.sliceB = sliceB(stencil.width, stencil.height, stencil.format, renderTarget);
720		stencil.sliceP = sliceP(stencil.width, stencil.height, stencil.format, renderTarget);
721		stencil.lock = LOCK_UNLOCKED;
722		stencil.dirty = false;
723		stencil.paletteUsed = 0;
724
725		dirtyMipmaps = true;
726	}
727
728	Surface::~Surface()
729	{
730		if(!hasParent)
731		{
732			// Synchronize so we can deallocate the buffers below
733			resource->lock(DESTRUCT);
734			resource->unlock();
735			resource->destruct();
736		}
737
738		deallocate(external.buffer);
739
740		if(internal.buffer != external.buffer)
741		{
742			deallocate(internal.buffer);
743		}
744
745		deallocate(stencil.buffer);
746
747		external.buffer = 0;
748		internal.buffer = 0;
749		stencil.buffer = 0;
750	}
751
752	void *Surface::lockExternal(int x, int y, int z, Lock lock, Accessor client)
753	{
754		resource->lock(client);
755
756		if(!external.buffer)
757		{
758			if(internal.buffer && identicalFormats())
759			{
760				external.buffer = internal.buffer;
761			}
762			else
763			{
764				external.buffer = allocateBuffer(external.width, external.height, external.depth, external.format);
765			}
766		}
767
768		if(internal.dirty)
769		{
770			if(lock != LOCK_DISCARD)
771			{
772				update(external, internal);
773			}
774		}
775
776		switch(lock)
777		{
778		case LOCK_READONLY:
779			break;
780		case LOCK_WRITEONLY:
781		case LOCK_READWRITE:
782		case LOCK_DISCARD:
783			dirtyMipmaps = true;
784			break;
785		default:
786			ASSERT(false);
787		}
788
789		return external.lockRect(x, y, z, lock);
790	}
791
792	void Surface::unlockExternal()
793	{
794		resource->unlock();
795
796		external.unlockRect();
797	}
798
799	void *Surface::lockInternal(int x, int y, int z, Lock lock, Accessor client)
800	{
801		if(lock != LOCK_UNLOCKED)
802		{
803			resource->lock(client);
804		}
805
806		if(!internal.buffer)
807		{
808			if(external.buffer && identicalFormats())
809			{
810				internal.buffer = external.buffer;
811			}
812			else
813			{
814				internal.buffer = allocateBuffer(internal.width, internal.height, internal.depth, internal.format);
815			}
816		}
817
818		// FIXME: WHQL requires conversion to lower external precision and back
819		if(logPrecision >= WHQL)
820		{
821			if(internal.dirty && renderTarget && internal.format != external.format)
822			{
823				if(lock != LOCK_DISCARD)
824				{
825					switch(external.format)
826					{
827					case FORMAT_R3G3B2:
828					case FORMAT_A8R3G3B2:
829					case FORMAT_A1R5G5B5:
830					case FORMAT_A2R10G10B10:
831					case FORMAT_A2B10G10R10:
832						lockExternal(0, 0, 0, LOCK_READWRITE, client);
833						unlockExternal();
834						break;
835					default:
836						// Difference passes WHQL
837						break;
838					}
839				}
840			}
841		}
842
843		if(external.dirty)
844		{
845			if(lock != LOCK_DISCARD)
846			{
847				update(internal, external);
848			}
849		}
850
851		if(isPalette(external.format) && internal.paletteUsed != Surface::paletteID)
852		{
853			update(internal, external);
854		}
855
856		switch(lock)
857		{
858		case LOCK_UNLOCKED:
859		case LOCK_READONLY:
860			break;
861		case LOCK_WRITEONLY:
862		case LOCK_READWRITE:
863		case LOCK_DISCARD:
864			dirtyMipmaps = true;
865			break;
866		default:
867			ASSERT(false);
868		}
869
870		if(lock == LOCK_READONLY && client == PUBLIC)
871		{
872			resolve();
873		}
874
875		return internal.lockRect(x, y, z, lock);
876	}
877
878	void Surface::unlockInternal()
879	{
880		resource->unlock();
881
882		internal.unlockRect();
883	}
884
885	void *Surface::lockStencil(int front, Accessor client)
886	{
887		resource->lock(client);
888
889		if(!stencil.buffer)
890		{
891			stencil.buffer = allocateBuffer(stencil.width, stencil.height, stencil.depth, stencil.format);
892		}
893
894		if(external.dirty)
895		{
896			update(stencil, external);   // FIXME: Only when not discarding
897		}
898
899		return stencil.lockRect(0, 0, front, LOCK_READWRITE);   // FIXME
900	}
901
902	void Surface::unlockStencil()
903	{
904		resource->unlock();
905
906		stencil.unlockRect();
907	}
908
909	int Surface::bytes(Format format)
910	{
911		switch(format)
912		{
913		case FORMAT_NULL:				return 0;
914		case FORMAT_P8:					return 1;
915		case FORMAT_A8P8:				return 2;
916		case FORMAT_A8:					return 1;
917		case FORMAT_R8:					return 1;
918		case FORMAT_R3G3B2:				return 1;
919		case FORMAT_A8R3G3B2:			return 2;
920		case FORMAT_R5G6B5:				return 2;
921		case FORMAT_A1R5G5B5:			return 2;
922		case FORMAT_X1R5G5B5:			return 2;
923		case FORMAT_X4R4G4B4:			return 2;
924		case FORMAT_A4R4G4B4:			return 2;
925		case FORMAT_R8G8B8:				return 3;
926		case FORMAT_X8R8G8B8:			return 4;
927	//	case FORMAT_X8G8R8B8Q:			return 4;
928		case FORMAT_A8R8G8B8:			return 4;
929	//	case FORMAT_A8G8R8B8Q:			return 4;
930		case FORMAT_X8B8G8R8:			return 4;
931		case FORMAT_A8B8G8R8:			return 4;
932		case FORMAT_A2R10G10B10:		return 4;
933		case FORMAT_A2B10G10R10:		return 4;
934		case FORMAT_G8R8:				return 2;
935		case FORMAT_G16R16:				return 4;
936		case FORMAT_A16B16G16R16:		return 8;
937		// Compressed formats
938		#if S3TC_SUPPORT
939		case FORMAT_DXT1:				return 2;   // Column of four pixels
940		case FORMAT_DXT3:				return 4;   // Column of four pixels
941		case FORMAT_DXT5:				return 4;   // Column of four pixels
942		case FORMAT_ATI1:				return 2;   // Column of four pixels
943		case FORMAT_ATI2:				return 4;   // Column of four pixels
944		#endif
945		// Bumpmap formats
946		case FORMAT_V8U8:				return 2;
947		case FORMAT_L6V5U5:				return 2;
948		case FORMAT_Q8W8V8U8:			return 4;
949		case FORMAT_X8L8V8U8:			return 4;
950		case FORMAT_A2W10V10U10:		return 4;
951		case FORMAT_V16U16:				return 4;
952		case FORMAT_A16W16V16U16:		return 8;
953		case FORMAT_Q16W16V16U16:		return 8;
954		// Luminance formats
955		case FORMAT_L8:					return 1;
956		case FORMAT_A4L4:				return 1;
957		case FORMAT_L16:				return 2;
958		case FORMAT_A8L8:				return 2;
959		// Floating-point formats
960		case FORMAT_R16F:				return 2;
961		case FORMAT_G16R16F:			return 4;
962		case FORMAT_A16B16G16R16F:		return 8;
963		case FORMAT_R32F:				return 4;
964		case FORMAT_G32R32F:			return 8;
965		case FORMAT_A32B32G32R32F:		return 16;
966		// Depth/stencil formats
967		case FORMAT_D16:				return 2;
968		case FORMAT_D32:				return 4;
969		case FORMAT_D24X8:				return 4;
970		case FORMAT_D24S8:				return 4;
971		case FORMAT_D24FS8:				return 4;
972		case FORMAT_D32F:				return 4;
973		case FORMAT_D32F_COMPLEMENTARY:	return 4;
974		case FORMAT_D32F_LOCKABLE:		return 4;
975		case FORMAT_D32F_TEXTURE:		return 4;
976		case FORMAT_D32F_SHADOW:		return 4;
977		case FORMAT_DF24:				return 4;
978		case FORMAT_DF16:				return 2;
979		case FORMAT_INTZ:				return 4;
980		case FORMAT_S8:					return 1;
981		default:
982			ASSERT(false);
983		}
984
985		return 0;
986	}
987
988	int Surface::pitchB(int width, Format format, bool target)
989	{
990		if(target || isDepth(format) || isStencil(format))
991		{
992			width = ((width + 1) & ~1);
993		}
994
995		switch(format)
996		{
997		#if S3TC_SUPPORT
998		case FORMAT_DXT1:
999			return 8 * ((width + 3) / 4);    // 64 bit per 4x4 block, computed per 4 rows
1000		case FORMAT_DXT3:
1001		case FORMAT_DXT5:
1002			return 16 * ((width + 3) / 4);   // 128 bit per 4x4 block, computed per 4 rows
1003		case FORMAT_ATI1:
1004			return 2 * ((width + 3) / 4);    // 64 bit per 4x4 block, computed per row
1005		case FORMAT_ATI2:
1006			return 4 * ((width + 3) / 4);    // 128 bit per 4x4 block, computed per row
1007		#endif
1008		default:
1009			return bytes(format) * width;
1010		}
1011	}
1012
1013	int Surface::pitchP(int width, Format format, bool target)
1014	{
1015		int B = bytes(format);
1016
1017		return B > 0 ? pitchB(width, format, target) / B : 0;
1018	}
1019
1020	int Surface::sliceB(int width, int height, Format format, bool target)
1021	{
1022		if(target || isDepth(format) || isStencil(format))
1023		{
1024			height = ((height + 1) & ~1);
1025		}
1026
1027		switch(format)
1028		{
1029		#if S3TC_SUPPORT
1030		case FORMAT_DXT1:
1031		case FORMAT_DXT3:
1032		case FORMAT_DXT5:
1033			return pitchB(width, format, target) * ((height + 3) / 4);   // Pitch computed per 4 rows
1034		case FORMAT_ATI1:   // Pitch computed per row
1035		case FORMAT_ATI2:   // Pitch computed per row
1036		#endif
1037		default:
1038			return pitchB(width, format, target) * height;
1039		}
1040	}
1041
1042	int Surface::sliceP(int width, int height, Format format, bool target)
1043	{
1044		int B = bytes(format);
1045
1046		return B > 0 ? sliceB(width, height, format, target) / B : 0;
1047	}
1048
1049	void Surface::update(Buffer &destination, Buffer &source)
1050	{
1051	//	ASSERT(source.lock != LOCK_UNLOCKED);
1052	//	ASSERT(destination.lock != LOCK_UNLOCKED);
1053
1054		if(destination.buffer != source.buffer)
1055		{
1056			ASSERT(source.dirty && !destination.dirty);
1057
1058			switch(source.format)
1059			{
1060			case FORMAT_R8G8B8:		decodeR8G8B8(destination, source);		break;   // FIXME: Check destination format
1061			case FORMAT_X8B8G8R8:	decodeX8B8G8R8(destination, source);	break;   // FIXME: Check destination format
1062			case FORMAT_A8B8G8R8:	decodeA8B8G8R8(destination, source);	break;   // FIXME: Check destination format
1063			case FORMAT_R5G6B5:		decodeR5G6B5(destination, source);		break;   // FIXME: Check destination format
1064			case FORMAT_X1R5G5B5:	decodeX1R5G5B5(destination, source);	break;   // FIXME: Check destination format
1065			case FORMAT_A1R5G5B5:	decodeA1R5G5B5(destination, source);	break;   // FIXME: Check destination format
1066			case FORMAT_X4R4G4B4:	decodeX4R4G4B4(destination, source);	break;   // FIXME: Check destination format
1067			case FORMAT_A4R4G4B4:	decodeA4R4G4B4(destination, source);	break;   // FIXME: Check destination format
1068			case FORMAT_P8:			decodeP8(destination, source);			break;   // FIXME: Check destination format
1069			#if S3TC_SUPPORT
1070			case FORMAT_DXT1:		decodeDXT1(destination, source);		break;   // FIXME: Check destination format
1071			case FORMAT_DXT3:		decodeDXT3(destination, source);		break;   // FIXME: Check destination format
1072			case FORMAT_DXT5:		decodeDXT5(destination, source);		break;   // FIXME: Check destination format
1073			case FORMAT_ATI1:		decodeATI1(destination, source);		break;   // FIXME: Check destination format
1074			case FORMAT_ATI2:		decodeATI2(destination, source);		break;   // FIXME: Check destination format
1075			#endif
1076			default:				genericUpdate(destination, source);		break;
1077			}
1078		}
1079
1080		source.dirty = false;
1081		destination.paletteUsed = Surface::paletteID;
1082	}
1083
1084	void Surface::genericUpdate(Buffer &destination, Buffer &source)
1085	{
1086		unsigned char *sourceSlice = (unsigned char*)source.buffer;
1087		unsigned char *destinationSlice = (unsigned char*)destination.buffer;
1088
1089		int depth = min(destination.depth, source.depth);
1090		int height = min(destination.height, source.height);
1091		int width = min(destination.width, source.width);
1092		int rowBytes = width * source.bytes;
1093
1094		for(int z = 0; z < depth; z++)
1095		{
1096			unsigned char *sourceRow = sourceSlice;
1097			unsigned char *destinationRow = destinationSlice;
1098
1099			for(int y = 0; y < height; y++)
1100			{
1101				if(source.format == destination.format)
1102				{
1103					memcpy(destinationRow, sourceRow, rowBytes);
1104				}
1105				else
1106				{
1107					unsigned char *sourceElement = sourceRow;
1108					unsigned char *destinationElement = destinationRow;
1109
1110					for(int x = 0; x < width; x++)
1111					{
1112						Color<float> color = source.read(sourceElement);
1113						destination.write(destinationElement, color);
1114
1115						sourceElement += source.bytes;
1116						destinationElement += destination.bytes;
1117					}
1118				}
1119
1120				sourceRow += source.pitchB;
1121				destinationRow += destination.pitchB;
1122			}
1123
1124			sourceSlice += source.sliceB;
1125			destinationSlice += destination.sliceB;
1126		}
1127	}
1128
1129	void Surface::decodeR8G8B8(Buffer &destination, const Buffer &source)
1130	{
1131		unsigned char *sourceSlice = (unsigned char*)source.buffer;
1132		unsigned char *destinationSlice = (unsigned char*)destination.buffer;
1133
1134		for(int z = 0; z < destination.depth && z < source.depth; z++)
1135		{
1136			unsigned char *sourceRow = sourceSlice;
1137			unsigned char *destinationRow = destinationSlice;
1138
1139			for(int y = 0; y < destination.height && y < source.height; y++)
1140			{
1141				unsigned char *sourceElement = sourceRow;
1142				unsigned char *destinationElement = destinationRow;
1143
1144				for(int x = 0; x < destination.width && x < source.width; x++)
1145				{
1146					unsigned int b = sourceElement[0];
1147					unsigned int g = sourceElement[1];
1148					unsigned int r = sourceElement[2];
1149
1150					*(unsigned int*)destinationElement = 0xFF000000 | (r << 16) | (g << 8) | (b << 0);
1151
1152					sourceElement += source.bytes;
1153					destinationElement += destination.bytes;
1154				}
1155
1156				sourceRow += source.pitchB;
1157				destinationRow += destination.pitchB;
1158			}
1159
1160			sourceSlice += source.sliceB;
1161			destinationSlice += destination.sliceB;
1162		}
1163	}
1164
1165	void Surface::decodeX8B8G8R8(Buffer &destination, const Buffer &source)
1166	{
1167		unsigned char *sourceSlice = (unsigned char*)source.buffer;
1168		unsigned char *destinationSlice = (unsigned char*)destination.buffer;
1169
1170		for(int z = 0; z < destination.depth && z < source.depth; z++)
1171		{
1172			unsigned char *sourceRow = sourceSlice;
1173			unsigned char *destinationRow = destinationSlice;
1174
1175			for(int y = 0; y < destination.height && y < source.height; y++)
1176			{
1177				unsigned char *sourceElement = sourceRow;
1178				unsigned char *destinationElement = destinationRow;
1179
1180				for(int x = 0; x < destination.width && x < source.width; x++)
1181				{
1182					unsigned int r = sourceElement[0];
1183					unsigned int g = sourceElement[1];
1184					unsigned int b = sourceElement[2];
1185
1186					*(unsigned int*)destinationElement = 0xFF000000 | (r << 16) | (g << 8) | (b << 0);
1187
1188					sourceElement += source.bytes;
1189					destinationElement += destination.bytes;
1190				}
1191
1192				sourceRow += source.pitchB;
1193				destinationRow += destination.pitchB;
1194			}
1195
1196			sourceSlice += source.sliceB;
1197			destinationSlice += destination.sliceB;
1198		}
1199	}
1200
1201	void Surface::decodeA8B8G8R8(Buffer &destination, const Buffer &source)
1202	{
1203		unsigned char *sourceSlice = (unsigned char*)source.buffer;
1204		unsigned char *destinationSlice = (unsigned char*)destination.buffer;
1205
1206		for(int z = 0; z < destination.depth && z < source.depth; z++)
1207		{
1208			unsigned char *sourceRow = sourceSlice;
1209			unsigned char *destinationRow = destinationSlice;
1210
1211			for(int y = 0; y < destination.height && y < source.height; y++)
1212			{
1213				unsigned char *sourceElement = sourceRow;
1214				unsigned char *destinationElement = destinationRow;
1215
1216				for(int x = 0; x < destination.width && x < source.width; x++)
1217				{
1218					unsigned int r = sourceElement[0];
1219					unsigned int g = sourceElement[1];
1220					unsigned int b = sourceElement[2];
1221					unsigned int a = sourceElement[3];
1222
1223					*(unsigned int*)destinationElement = (a << 24) | (r << 16) | (g << 8) | (b << 0);
1224
1225					sourceElement += source.bytes;
1226					destinationElement += destination.bytes;
1227				}
1228
1229				sourceRow += source.pitchB;
1230				destinationRow += destination.pitchB;
1231			}
1232
1233			sourceSlice += source.sliceB;
1234			destinationSlice += destination.sliceB;
1235		}
1236	}
1237
1238	void Surface::decodeR5G6B5(Buffer &destination, const Buffer &source)
1239	{
1240		unsigned char *sourceSlice = (unsigned char*)source.buffer;
1241		unsigned char *destinationSlice = (unsigned char*)destination.buffer;
1242
1243		for(int z = 0; z < destination.depth && z < source.depth; z++)
1244		{
1245			unsigned char *sourceRow = sourceSlice;
1246			unsigned char *destinationRow = destinationSlice;
1247
1248			for(int y = 0; y < destination.height && y < source.height; y++)
1249			{
1250				unsigned char *sourceElement = sourceRow;
1251				unsigned char *destinationElement = destinationRow;
1252
1253				for(int x = 0; x < destination.width && x < source.width; x++)
1254				{
1255					unsigned int rgb = *(unsigned short*)sourceElement;
1256
1257					unsigned int r = (((rgb & 0xF800) * 67385 + 0x800000) >> 8) & 0x00FF0000;
1258					unsigned int g = (((rgb & 0x07E0) * 8289  + 0x8000) >> 8) & 0x0000FF00;
1259					unsigned int b = (((rgb & 0x001F) * 2106  + 0x80) >> 8);
1260
1261					*(unsigned int*)destinationElement = 0xFF000000 | r | g | b;
1262
1263					sourceElement += source.bytes;
1264					destinationElement += destination.bytes;
1265				}
1266
1267				sourceRow += source.pitchB;
1268				destinationRow += destination.pitchB;
1269			}
1270
1271			sourceSlice += source.sliceB;
1272			destinationSlice += destination.sliceB;
1273		}
1274	}
1275
1276	void Surface::decodeX1R5G5B5(Buffer &destination, const Buffer &source)
1277	{
1278		unsigned char *sourceSlice = (unsigned char*)source.buffer;
1279		unsigned char *destinationSlice = (unsigned char*)destination.buffer;
1280
1281		for(int z = 0; z < destination.depth && z < source.depth; z++)
1282		{
1283			unsigned char *sourceRow = sourceSlice;
1284			unsigned char *destinationRow = destinationSlice;
1285
1286			for(int y = 0; y < destination.height && y < source.height; y++)
1287			{
1288				unsigned char *sourceElement = sourceRow;
1289				unsigned char *destinationElement = destinationRow;
1290
1291				for(int x = 0; x < destination.width && x < source.width; x++)
1292				{
1293					unsigned int xrgb = *(unsigned short*)sourceElement;
1294
1295					unsigned int r = (((xrgb & 0x7C00) * 134771 + 0x800000) >> 8) & 0x00FF0000;
1296					unsigned int g = (((xrgb & 0x03E0) * 16846 + 0x8000) >> 8) & 0x0000FF00;
1297					unsigned int b = (((xrgb & 0x001F) * 2106  + 0x80) >> 8);
1298
1299					*(unsigned int*)destinationElement = 0xFF000000 | r | g | b;
1300
1301					sourceElement += source.bytes;
1302					destinationElement += destination.bytes;
1303				}
1304
1305				sourceRow += source.pitchB;
1306				destinationRow += destination.pitchB;
1307			}
1308
1309			sourceSlice += source.sliceB;
1310			destinationSlice += destination.sliceB;
1311		}
1312	}
1313
1314	void Surface::decodeA1R5G5B5(Buffer &destination, const Buffer &source)
1315	{
1316		unsigned char *sourceSlice = (unsigned char*)source.buffer;
1317		unsigned char *destinationSlice = (unsigned char*)destination.buffer;
1318
1319		for(int z = 0; z < destination.depth && z < source.depth; z++)
1320		{
1321			unsigned char *sourceRow = sourceSlice;
1322			unsigned char *destinationRow = destinationSlice;
1323
1324			for(int y = 0; y < destination.height && y < source.height; y++)
1325			{
1326				unsigned char *sourceElement = sourceRow;
1327				unsigned char *destinationElement = destinationRow;
1328
1329				for(int x = 0; x < destination.width && x < source.width; x++)
1330				{
1331					unsigned int argb = *(unsigned short*)sourceElement;
1332
1333					unsigned int a =   (argb & 0x8000) * 130560;
1334					unsigned int r = (((argb & 0x7C00) * 134771 + 0x800000) >> 8) & 0x00FF0000;
1335					unsigned int g = (((argb & 0x03E0) * 16846  + 0x8000) >> 8) & 0x0000FF00;
1336					unsigned int b = (((argb & 0x001F) * 2106   + 0x80) >> 8);
1337
1338					*(unsigned int*)destinationElement = a | r | g | b;
1339
1340					sourceElement += source.bytes;
1341					destinationElement += destination.bytes;
1342				}
1343
1344				sourceRow += source.pitchB;
1345				destinationRow += destination.pitchB;
1346			}
1347
1348			sourceSlice += source.sliceB;
1349			destinationSlice += destination.sliceB;
1350		}
1351	}
1352
1353	void Surface::decodeX4R4G4B4(Buffer &destination, const Buffer &source)
1354	{
1355		unsigned char *sourceSlice = (unsigned char*)source.buffer;
1356		unsigned char *destinationSlice = (unsigned char*)destination.buffer;
1357
1358		for(int z = 0; z < destination.depth && z < source.depth; z++)
1359		{
1360			unsigned char *sourceRow = sourceSlice;
1361			unsigned char *destinationRow = destinationSlice;
1362
1363			for(int y = 0; y < destination.height && y < source.height; y++)
1364			{
1365				unsigned char *sourceElement = sourceRow;
1366				unsigned char *destinationElement = destinationRow;
1367
1368				for(int x = 0; x < destination.width && x < source.width; x++)
1369				{
1370					unsigned int xrgb = *(unsigned short*)sourceElement;
1371
1372					unsigned int r = ((xrgb & 0x0F00) * 0x00001100) & 0x00FF0000;
1373					unsigned int g = ((xrgb & 0x00F0) * 0x00000110) & 0x0000FF00;
1374					unsigned int b =  (xrgb & 0x000F) * 0x00000011;
1375
1376					*(unsigned int*)destinationElement = 0xFF000000 | r | g | b;
1377
1378					sourceElement += source.bytes;
1379					destinationElement += destination.bytes;
1380				}
1381
1382				sourceRow += source.pitchB;
1383				destinationRow += destination.pitchB;
1384			}
1385
1386			sourceSlice += source.sliceB;
1387			destinationSlice += destination.sliceB;
1388		}
1389	}
1390
1391	void Surface::decodeA4R4G4B4(Buffer &destination, const Buffer &source)
1392	{
1393		unsigned char *sourceSlice = (unsigned char*)source.buffer;
1394		unsigned char *destinationSlice = (unsigned char*)destination.buffer;
1395
1396		for(int z = 0; z < destination.depth && z < source.depth; z++)
1397		{
1398			unsigned char *sourceRow = sourceSlice;
1399			unsigned char *destinationRow = destinationSlice;
1400
1401			for(int y = 0; y < destination.height && y < source.height; y++)
1402			{
1403				unsigned char *sourceElement = sourceRow;
1404				unsigned char *destinationElement = destinationRow;
1405
1406				for(int x = 0; x < destination.width && x < source.width; x++)
1407				{
1408					unsigned int argb = *(unsigned short*)sourceElement;
1409
1410					unsigned int a = ((argb & 0xF000) * 0x00011000) & 0xFF000000;
1411					unsigned int r = ((argb & 0x0F00) * 0x00001100) & 0x00FF0000;
1412					unsigned int g = ((argb & 0x00F0) * 0x00000110) & 0x0000FF00;
1413					unsigned int b =  (argb & 0x000F) * 0x00000011;
1414
1415					*(unsigned int*)destinationElement = a | r | g | b;
1416
1417					sourceElement += source.bytes;
1418					destinationElement += destination.bytes;
1419				}
1420
1421				sourceRow += source.pitchB;
1422				destinationRow += destination.pitchB;
1423			}
1424
1425			sourceSlice += source.sliceB;
1426			destinationSlice += destination.sliceB;
1427		}
1428	}
1429
1430	void Surface::decodeP8(Buffer &destination, const Buffer &source)
1431	{
1432		unsigned char *sourceSlice = (unsigned char*)source.buffer;
1433		unsigned char *destinationSlice = (unsigned char*)destination.buffer;
1434
1435		for(int z = 0; z < destination.depth && z < source.depth; z++)
1436		{
1437			unsigned char *sourceRow = sourceSlice;
1438			unsigned char *destinationRow = destinationSlice;
1439
1440			for(int y = 0; y < destination.height && y < source.height; y++)
1441			{
1442				unsigned char *sourceElement = sourceRow;
1443				unsigned char *destinationElement = destinationRow;
1444
1445				for(int x = 0; x < destination.width && x < source.width; x++)
1446				{
1447					unsigned int abgr = palette[*(unsigned char*)sourceElement];
1448
1449					unsigned int r = (abgr & 0x000000FF) << 16;
1450					unsigned int g = (abgr & 0x0000FF00) << 0;
1451					unsigned int b = (abgr & 0x00FF0000) >> 16;
1452					unsigned int a = (abgr & 0xFF000000) >> 0;
1453
1454					*(unsigned int*)destinationElement = a | r | g | b;
1455
1456					sourceElement += source.bytes;
1457					destinationElement += destination.bytes;
1458				}
1459
1460				sourceRow += source.pitchB;
1461				destinationRow += destination.pitchB;
1462			}
1463
1464			sourceSlice += source.sliceB;
1465			destinationSlice += destination.sliceB;
1466		}
1467	}
1468
1469#if S3TC_SUPPORT
1470	void Surface::decodeDXT1(Buffer &internal, const Buffer &external)
1471	{
1472		unsigned int *destSlice = (unsigned int*)internal.buffer;
1473		DXT1 *source = (DXT1*)external.buffer;
1474
1475		for(int z = 0; z < external.depth; z++)
1476		{
1477			unsigned int *dest = destSlice;
1478
1479			for(int y = 0; y < external.height; y += 4)
1480			{
1481				for(int x = 0; x < external.width; x += 4)
1482				{
1483					Color<byte> c[4];
1484
1485					c[0] = source->c0;
1486					c[1] = source->c1;
1487
1488					if(source->c0 > source->c1)   // No transparency
1489					{
1490						// c2 = 2 / 3 * c0 + 1 / 3 * c1
1491						c[2].r = (byte)((2 * (word)c[0].r + (word)c[1].r + 1) / 3);
1492						c[2].g = (byte)((2 * (word)c[0].g + (word)c[1].g + 1) / 3);
1493						c[2].b = (byte)((2 * (word)c[0].b + (word)c[1].b + 1) / 3);
1494						c[2].a = 0xFF;
1495
1496						// c3 = 1 / 3 * c0 + 2 / 3 * c1
1497						c[3].r = (byte)(((word)c[0].r + 2 * (word)c[1].r + 1) / 3);
1498						c[3].g = (byte)(((word)c[0].g + 2 * (word)c[1].g + 1) / 3);
1499						c[3].b = (byte)(((word)c[0].b + 2 * (word)c[1].b + 1) / 3);
1500						c[3].a = 0xFF;
1501					}
1502					else   // c3 transparent
1503					{
1504						// c2 = 1 / 2 * c0 + 1 / 2 * c1
1505						c[2].r = (byte)(((word)c[0].r + (word)c[1].r) / 2);
1506						c[2].g = (byte)(((word)c[0].g + (word)c[1].g) / 2);
1507						c[2].b = (byte)(((word)c[0].b + (word)c[1].b) / 2);
1508						c[2].a = 0xFF;
1509
1510						c[3].r = 0;
1511						c[3].g = 0;
1512						c[3].b = 0;
1513						c[3].a = 0;
1514					}
1515
1516					for(int j = 0; j < 4 && (y + j) < internal.height; j++)
1517					{
1518						for(int i = 0; i < 4 && (x + i) < internal.width; i++)
1519						{
1520							dest[(x + i) + (y + j) * internal.width] = c[(unsigned int)(source->lut >> 2 * (i + j * 4)) % 4];
1521						}
1522					}
1523
1524					source++;
1525				}
1526			}
1527
1528			(byte*&)destSlice += internal.sliceB;
1529		}
1530	}
1531
1532	void Surface::decodeDXT3(Buffer &internal, const Buffer &external)
1533	{
1534		unsigned int *destSlice = (unsigned int*)internal.buffer;
1535		DXT3 *source = (DXT3*)external.buffer;
1536
1537		for(int z = 0; z < external.depth; z++)
1538		{
1539			unsigned int *dest = destSlice;
1540
1541			for(int y = 0; y < external.height; y += 4)
1542			{
1543				for(int x = 0; x < external.width; x += 4)
1544				{
1545					Color<byte> c[4];
1546
1547					c[0] = source->c0;
1548					c[1] = source->c1;
1549
1550					// c2 = 2 / 3 * c0 + 1 / 3 * c1
1551					c[2].r = (byte)((2 * (word)c[0].r + (word)c[1].r + 1) / 3);
1552					c[2].g = (byte)((2 * (word)c[0].g + (word)c[1].g + 1) / 3);
1553					c[2].b = (byte)((2 * (word)c[0].b + (word)c[1].b + 1) / 3);
1554
1555					// c3 = 1 / 3 * c0 + 2 / 3 * c1
1556					c[3].r = (byte)(((word)c[0].r + 2 * (word)c[1].r + 1) / 3);
1557					c[3].g = (byte)(((word)c[0].g + 2 * (word)c[1].g + 1) / 3);
1558					c[3].b = (byte)(((word)c[0].b + 2 * (word)c[1].b + 1) / 3);
1559
1560					for(int j = 0; j < 4 && (y + j) < internal.height; j++)
1561					{
1562						for(int i = 0; i < 4 && (x + i) < internal.width; i++)
1563						{
1564							unsigned int a = (unsigned int)(source->a >> 4 * (i + j * 4)) & 0x0F;
1565							unsigned int color = (c[(unsigned int)(source->lut >> 2 * (i + j * 4)) % 4] & 0x00FFFFFF) | ((a << 28) + (a << 24));
1566
1567							dest[(x + i) + (y + j) * internal.width] = color;
1568						}
1569					}
1570
1571					source++;
1572				}
1573			}
1574
1575			(byte*&)destSlice += internal.sliceB;
1576		}
1577	}
1578
1579	void Surface::decodeDXT5(Buffer &internal, const Buffer &external)
1580	{
1581		unsigned int *destSlice = (unsigned int*)internal.buffer;
1582		DXT5 *source = (DXT5*)external.buffer;
1583
1584		for(int z = 0; z < external.depth; z++)
1585		{
1586			unsigned int *dest = destSlice;
1587
1588			for(int y = 0; y < external.height; y += 4)
1589			{
1590				for(int x = 0; x < external.width; x += 4)
1591				{
1592					Color<byte> c[4];
1593
1594					c[0] = source->c0;
1595					c[1] = source->c1;
1596
1597					// c2 = 2 / 3 * c0 + 1 / 3 * c1
1598					c[2].r = (byte)((2 * (word)c[0].r + (word)c[1].r + 1) / 3);
1599					c[2].g = (byte)((2 * (word)c[0].g + (word)c[1].g + 1) / 3);
1600					c[2].b = (byte)((2 * (word)c[0].b + (word)c[1].b + 1) / 3);
1601
1602					// c3 = 1 / 3 * c0 + 2 / 3 * c1
1603					c[3].r = (byte)(((word)c[0].r + 2 * (word)c[1].r + 1) / 3);
1604					c[3].g = (byte)(((word)c[0].g + 2 * (word)c[1].g + 1) / 3);
1605					c[3].b = (byte)(((word)c[0].b + 2 * (word)c[1].b + 1) / 3);
1606
1607					byte a[8];
1608
1609					a[0] = source->a0;
1610					a[1] = source->a1;
1611
1612					if(a[0] > a[1])
1613					{
1614						a[2] = (byte)((6 * (word)a[0] + 1 * (word)a[1] + 3) / 7);
1615						a[3] = (byte)((5 * (word)a[0] + 2 * (word)a[1] + 3) / 7);
1616						a[4] = (byte)((4 * (word)a[0] + 3 * (word)a[1] + 3) / 7);
1617						a[5] = (byte)((3 * (word)a[0] + 4 * (word)a[1] + 3) / 7);
1618						a[6] = (byte)((2 * (word)a[0] + 5 * (word)a[1] + 3) / 7);
1619						a[7] = (byte)((1 * (word)a[0] + 6 * (word)a[1] + 3) / 7);
1620					}
1621					else
1622					{
1623						a[2] = (byte)((4 * (word)a[0] + 1 * (word)a[1] + 2) / 5);
1624						a[3] = (byte)((3 * (word)a[0] + 2 * (word)a[1] + 2) / 5);
1625						a[4] = (byte)((2 * (word)a[0] + 3 * (word)a[1] + 2) / 5);
1626						a[5] = (byte)((1 * (word)a[0] + 4 * (word)a[1] + 2) / 5);
1627						a[6] = 0;
1628						a[7] = 0xFF;
1629					}
1630
1631					for(int j = 0; j < 4 && (y + j) < internal.height; j++)
1632					{
1633						for(int i = 0; i < 4 && (x + i) < internal.width; i++)
1634						{
1635							unsigned int alpha = (unsigned int)a[(unsigned int)(source->alut >> (16 + 3 * (i + j * 4))) % 8] << 24;
1636							unsigned int color = (c[(source->clut >> 2 * (i + j * 4)) % 4] & 0x00FFFFFF) | alpha;
1637
1638							dest[(x + i) + (y + j) * internal.width] = color;
1639						}
1640					}
1641
1642					source++;
1643				}
1644			}
1645
1646			(byte*&)destSlice += internal.sliceB;
1647		}
1648	}
1649
1650	void Surface::decodeATI1(Buffer &internal, const Buffer &external)
1651	{
1652		byte *destSlice = (byte*)internal.buffer;
1653		ATI1 *source = (ATI1*)external.buffer;
1654
1655		for(int z = 0; z < external.depth; z++)
1656		{
1657			byte *dest = destSlice;
1658
1659			for(int y = 0; y < external.height; y += 4)
1660			{
1661				for(int x = 0; x < external.width; x += 4)
1662				{
1663					byte r[8];
1664
1665					r[0] = source->r0;
1666					r[1] = source->r1;
1667
1668					if(r[0] > r[1])
1669					{
1670						r[2] = (byte)((6 * (word)r[0] + 1 * (word)r[1] + 3) / 7);
1671						r[3] = (byte)((5 * (word)r[0] + 2 * (word)r[1] + 3) / 7);
1672						r[4] = (byte)((4 * (word)r[0] + 3 * (word)r[1] + 3) / 7);
1673						r[5] = (byte)((3 * (word)r[0] + 4 * (word)r[1] + 3) / 7);
1674						r[6] = (byte)((2 * (word)r[0] + 5 * (word)r[1] + 3) / 7);
1675						r[7] = (byte)((1 * (word)r[0] + 6 * (word)r[1] + 3) / 7);
1676					}
1677					else
1678					{
1679						r[2] = (byte)((4 * (word)r[0] + 1 * (word)r[1] + 2) / 5);
1680						r[3] = (byte)((3 * (word)r[0] + 2 * (word)r[1] + 2) / 5);
1681						r[4] = (byte)((2 * (word)r[0] + 3 * (word)r[1] + 2) / 5);
1682						r[5] = (byte)((1 * (word)r[0] + 4 * (word)r[1] + 2) / 5);
1683						r[6] = 0;
1684						r[7] = 0xFF;
1685					}
1686
1687					for(int j = 0; j < 4 && (y + j) < internal.height; j++)
1688					{
1689						for(int i = 0; i < 4 && (x + i) < internal.width; i++)
1690						{
1691							dest[(x + i) + (y + j) * internal.width] = r[(unsigned int)(source->rlut >> (16 + 3 * (i + j * 4))) % 8];
1692						}
1693					}
1694
1695					source++;
1696				}
1697			}
1698
1699			destSlice += internal.sliceB;
1700		}
1701	}
1702
1703	void Surface::decodeATI2(Buffer &internal, const Buffer &external)
1704	{
1705		word *destSlice = (word*)internal.buffer;
1706		ATI2 *source = (ATI2*)external.buffer;
1707
1708		for(int z = 0; z < external.depth; z++)
1709		{
1710			word *dest = destSlice;
1711
1712			for(int y = 0; y < external.height; y += 4)
1713			{
1714				for(int x = 0; x < external.width; x += 4)
1715				{
1716					byte X[8];
1717
1718					X[0] = source->x0;
1719					X[1] = source->x1;
1720
1721					if(X[0] > X[1])
1722					{
1723						X[2] = (byte)((6 * (word)X[0] + 1 * (word)X[1] + 3) / 7);
1724						X[3] = (byte)((5 * (word)X[0] + 2 * (word)X[1] + 3) / 7);
1725						X[4] = (byte)((4 * (word)X[0] + 3 * (word)X[1] + 3) / 7);
1726						X[5] = (byte)((3 * (word)X[0] + 4 * (word)X[1] + 3) / 7);
1727						X[6] = (byte)((2 * (word)X[0] + 5 * (word)X[1] + 3) / 7);
1728						X[7] = (byte)((1 * (word)X[0] + 6 * (word)X[1] + 3) / 7);
1729					}
1730					else
1731					{
1732						X[2] = (byte)((4 * (word)X[0] + 1 * (word)X[1] + 2) / 5);
1733						X[3] = (byte)((3 * (word)X[0] + 2 * (word)X[1] + 2) / 5);
1734						X[4] = (byte)((2 * (word)X[0] + 3 * (word)X[1] + 2) / 5);
1735						X[5] = (byte)((1 * (word)X[0] + 4 * (word)X[1] + 2) / 5);
1736						X[6] = 0;
1737						X[7] = 0xFF;
1738					}
1739
1740					byte Y[8];
1741
1742					Y[0] = source->y0;
1743					Y[1] = source->y1;
1744
1745					if(Y[0] > Y[1])
1746					{
1747						Y[2] = (byte)((6 * (word)Y[0] + 1 * (word)Y[1] + 3) / 7);
1748						Y[3] = (byte)((5 * (word)Y[0] + 2 * (word)Y[1] + 3) / 7);
1749						Y[4] = (byte)((4 * (word)Y[0] + 3 * (word)Y[1] + 3) / 7);
1750						Y[5] = (byte)((3 * (word)Y[0] + 4 * (word)Y[1] + 3) / 7);
1751						Y[6] = (byte)((2 * (word)Y[0] + 5 * (word)Y[1] + 3) / 7);
1752						Y[7] = (byte)((1 * (word)Y[0] + 6 * (word)Y[1] + 3) / 7);
1753					}
1754					else
1755					{
1756						Y[2] = (byte)((4 * (word)Y[0] + 1 * (word)Y[1] + 2) / 5);
1757						Y[3] = (byte)((3 * (word)Y[0] + 2 * (word)Y[1] + 2) / 5);
1758						Y[4] = (byte)((2 * (word)Y[0] + 3 * (word)Y[1] + 2) / 5);
1759						Y[5] = (byte)((1 * (word)Y[0] + 4 * (word)Y[1] + 2) / 5);
1760						Y[6] = 0;
1761						Y[7] = 0xFF;
1762					}
1763
1764					for(int j = 0; j < 4 && (y + j) < internal.height; j++)
1765					{
1766						for(int i = 0; i < 4 && (x + i) < internal.width; i++)
1767						{
1768							word r = X[(unsigned int)(source->xlut >> (16 + 3 * (i + j * 4))) % 8];
1769							word g = Y[(unsigned int)(source->ylut >> (16 + 3 * (i + j * 4))) % 8];
1770
1771							dest[(x + i) + (y + j) * internal.width] = (g << 8) + r;
1772						}
1773					}
1774
1775					source++;
1776				}
1777			}
1778
1779			(byte*&)destSlice += internal.sliceB;
1780		}
1781	}
1782#endif
1783
1784	unsigned int Surface::size(int width, int height, int depth, Format format)
1785	{
1786		// Dimensions rounded up to multiples of 4, used for DXTC formats
1787		int width4 = (width + 3) & ~3;
1788		int height4 = (height + 3) & ~3;
1789
1790		switch(format)
1791		{
1792		#if S3TC_SUPPORT
1793		case FORMAT_DXT1:
1794		case FORMAT_ATI1:
1795			return width4 * height4 * depth / 2;
1796		case FORMAT_DXT3:
1797		case FORMAT_DXT5:
1798		case FORMAT_ATI2:
1799			return width4 * height4 * depth;
1800		#endif
1801		default:
1802			return bytes(format) * width * height * depth;
1803		}
1804
1805		return 0;
1806	}
1807
1808	bool Surface::isStencil(Format format)
1809	{
1810		switch(format)
1811		{
1812		case FORMAT_D32:
1813		case FORMAT_D16:
1814		case FORMAT_D24X8:
1815		case FORMAT_D32F:
1816		case FORMAT_D32F_COMPLEMENTARY:
1817		case FORMAT_D32F_LOCKABLE:
1818			return false;
1819		case FORMAT_D24S8:
1820		case FORMAT_D24FS8:
1821		case FORMAT_S8:
1822		case FORMAT_DF24:
1823		case FORMAT_DF16:
1824		case FORMAT_D32F_TEXTURE:
1825		case FORMAT_D32F_SHADOW:
1826		case FORMAT_INTZ:
1827			return true;
1828		default:
1829			return false;
1830		}
1831	}
1832
1833	bool Surface::isDepth(Format format)
1834	{
1835		switch(format)
1836		{
1837		case FORMAT_D32:
1838		case FORMAT_D16:
1839		case FORMAT_D24X8:
1840		case FORMAT_D24S8:
1841		case FORMAT_D24FS8:
1842		case FORMAT_D32F:
1843		case FORMAT_D32F_COMPLEMENTARY:
1844		case FORMAT_D32F_LOCKABLE:
1845		case FORMAT_DF24:
1846		case FORMAT_DF16:
1847		case FORMAT_D32F_TEXTURE:
1848		case FORMAT_D32F_SHADOW:
1849		case FORMAT_INTZ:
1850			return true;
1851		case FORMAT_S8:
1852			return false;
1853		default:
1854			return false;
1855		}
1856	}
1857
1858	bool Surface::isPalette(Format format)
1859	{
1860		switch(format)
1861		{
1862		case FORMAT_P8:
1863		case FORMAT_A8P8:
1864			return true;
1865		default:
1866			return false;
1867		}
1868	}
1869
1870	bool Surface::isFloatFormat(Format format)
1871	{
1872		switch(format)
1873		{
1874		case FORMAT_X8R8G8B8:
1875		case FORMAT_A8R8G8B8:
1876		case FORMAT_G8R8:
1877		case FORMAT_G16R16:
1878		case FORMAT_A16B16G16R16:
1879		case FORMAT_V8U8:
1880		case FORMAT_Q8W8V8U8:
1881		case FORMAT_X8L8V8U8:
1882		case FORMAT_V16U16:
1883		case FORMAT_A16W16V16U16:
1884		case FORMAT_Q16W16V16U16:
1885		case FORMAT_A8:
1886		case FORMAT_R8:
1887		case FORMAT_L8:
1888		case FORMAT_L16:
1889		case FORMAT_A8L8:
1890			return false;
1891		case FORMAT_R32F:
1892		case FORMAT_G32R32F:
1893		case FORMAT_A32B32G32R32F:
1894		case FORMAT_D32F:
1895		case FORMAT_D32F_COMPLEMENTARY:
1896		case FORMAT_D32F_LOCKABLE:
1897		case FORMAT_D32F_TEXTURE:
1898		case FORMAT_D32F_SHADOW:
1899			return true;
1900		default:
1901			ASSERT(false);
1902		}
1903
1904		return false;
1905	}
1906
1907	bool Surface::isUnsignedComponent(Format format, int component)
1908	{
1909		switch(format)
1910		{
1911		case FORMAT_NULL:
1912		case FORMAT_X8R8G8B8:
1913		case FORMAT_A8R8G8B8:
1914		case FORMAT_G8R8:
1915		case FORMAT_G16R16:
1916		case FORMAT_A16B16G16R16:
1917		case FORMAT_D32F:
1918		case FORMAT_D32F_COMPLEMENTARY:
1919		case FORMAT_D32F_LOCKABLE:
1920		case FORMAT_D32F_TEXTURE:
1921		case FORMAT_D32F_SHADOW:
1922		case FORMAT_A8:
1923		case FORMAT_R8:
1924		case FORMAT_L8:
1925		case FORMAT_L16:
1926		case FORMAT_A8L8:
1927			return true;
1928		case FORMAT_V8U8:
1929		case FORMAT_X8L8V8U8:
1930		case FORMAT_V16U16:
1931			if(component < 2)
1932			{
1933				return false;
1934			}
1935			else
1936			{
1937				return true;
1938			}
1939		case FORMAT_A16W16V16U16:
1940			if(component < 3)
1941			{
1942				return false;
1943			}
1944			else
1945			{
1946				return true;
1947			}
1948		case FORMAT_Q8W8V8U8:
1949		case FORMAT_Q16W16V16U16:
1950			return false;
1951		case FORMAT_R32F:
1952			if(component < 1)
1953			{
1954				return false;
1955			}
1956			else
1957			{
1958				return true;
1959			}
1960		case FORMAT_G32R32F:
1961			if(component < 2)
1962			{
1963				return false;
1964			}
1965			else
1966			{
1967				return true;
1968			}
1969		case FORMAT_A32B32G32R32F:
1970			return false;
1971		default:
1972			ASSERT(false);
1973		}
1974
1975		return false;
1976	}
1977
1978	bool Surface::isSRGBreadable(Format format)
1979	{
1980		// Keep in sync with Capabilities::isSRGBreadable
1981		switch(format)
1982		{
1983		case FORMAT_L8:
1984		case FORMAT_A8L8:
1985		case FORMAT_R8G8B8:
1986		case FORMAT_A8R8G8B8:
1987		case FORMAT_X8R8G8B8:
1988		case FORMAT_A8B8G8R8:
1989		case FORMAT_X8B8G8R8:
1990		case FORMAT_R5G6B5:
1991		case FORMAT_X1R5G5B5:
1992		case FORMAT_A1R5G5B5:
1993		case FORMAT_A4R4G4B4:
1994		#if S3TC_SUPPORT
1995		case FORMAT_DXT1:
1996		case FORMAT_DXT3:
1997		case FORMAT_DXT5:
1998		case FORMAT_ATI1:
1999		case FORMAT_ATI2:
2000		#endif
2001			return true;
2002		default:
2003			return false;
2004		}
2005
2006		return false;
2007	}
2008
2009	bool Surface::isSRGBwritable(Format format)
2010	{
2011		// Keep in sync with Capabilities::isSRGBwritable
2012		switch(format)
2013		{
2014		case FORMAT_NULL:
2015		case FORMAT_A8R8G8B8:
2016		case FORMAT_X8R8G8B8:
2017		case FORMAT_A8B8G8R8:
2018		case FORMAT_X8B8G8R8:
2019		case FORMAT_R5G6B5:
2020			return true;
2021		default:
2022			return false;
2023		}
2024	}
2025
2026	bool Surface::isCompressed(Format format)
2027	{
2028		switch(format)
2029		{
2030		#if S3TC_SUPPORT
2031		case FORMAT_DXT1:
2032		case FORMAT_DXT3:
2033		case FORMAT_DXT5:
2034		case FORMAT_ATI1:
2035		case FORMAT_ATI2:
2036			return true;
2037		#endif
2038		default:
2039			return false;
2040		}
2041	}
2042
2043	int Surface::componentCount(Format format)
2044	{
2045		switch(format)
2046		{
2047		case FORMAT_X8R8G8B8:		return 3;
2048		case FORMAT_A8R8G8B8:		return 4;
2049		case FORMAT_G8R8:			return 2;
2050		case FORMAT_G16R16:			return 2;
2051		case FORMAT_A16B16G16R16:	return 4;
2052		case FORMAT_V8U8:			return 2;
2053		case FORMAT_Q8W8V8U8:		return 4;
2054		case FORMAT_X8L8V8U8:		return 3;
2055		case FORMAT_V16U16:			return 2;
2056		case FORMAT_A16W16V16U16:	return 4;
2057		case FORMAT_Q16W16V16U16:	return 4;
2058		case FORMAT_R32F:			return 1;
2059		case FORMAT_G32R32F:		return 2;
2060		case FORMAT_A32B32G32R32F:	return 4;
2061		case FORMAT_D32F_LOCKABLE:	return 1;
2062		case FORMAT_D32F_TEXTURE:	return 1;
2063		case FORMAT_D32F_SHADOW:	return 1;
2064		case FORMAT_A8:				return 1;
2065		case FORMAT_R8:				return 1;
2066		case FORMAT_L8:				return 1;
2067		case FORMAT_L16:			return 1;
2068		case FORMAT_A8L8:			return 2;
2069		default:
2070			ASSERT(false);
2071		}
2072
2073		return 1;
2074	}
2075
2076	void *Surface::allocateBuffer(int width, int height, int depth, Format format)
2077	{
2078		int width4 = (width + 3) & ~3;
2079		int height4 = (height + 3) & ~3;
2080
2081		return allocate(size(width4, height4, depth, format));
2082	}
2083
2084	void Surface::memfill(void *buffer, int pattern, int bytes)
2085	{
2086		while((size_t)buffer & 0x1 && bytes >= 1)
2087		{
2088			*(char*)buffer = (char)pattern;
2089			(char*&)buffer += 1;
2090			bytes -= 1;
2091		}
2092
2093		while((size_t)buffer & 0x3 && bytes >= 2)
2094		{
2095			*(short*)buffer = (short)pattern;
2096			(short*&)buffer += 1;
2097			bytes -= 2;
2098		}
2099
2100		if(CPUID::supportsSSE())
2101		{
2102			while((size_t)buffer & 0xF && bytes >= 4)
2103			{
2104				*(int*)buffer = pattern;
2105				(int*&)buffer += 1;
2106				bytes -= 4;
2107			}
2108
2109			__m128 quad = _mm_set_ps1((float&)pattern);
2110
2111			float *pointer = (float*)buffer;
2112			int qxwords = bytes / 64;
2113			bytes -= qxwords * 64;
2114
2115			while(qxwords--)
2116			{
2117				_mm_stream_ps(pointer + 0, quad);
2118				_mm_stream_ps(pointer + 4, quad);
2119				_mm_stream_ps(pointer + 8, quad);
2120				_mm_stream_ps(pointer + 12, quad);
2121
2122				pointer += 16;
2123			}
2124
2125			buffer = pointer;
2126		}
2127
2128		while(bytes >= 4)
2129		{
2130			*(int*)buffer = (int)pattern;
2131			(int*&)buffer += 1;
2132			bytes -= 4;
2133		}
2134
2135		while(bytes >= 2)
2136		{
2137			*(short*)buffer = (short)pattern;
2138			(short*&)buffer += 1;
2139			bytes -= 2;
2140		}
2141
2142		while(bytes >= 1)
2143		{
2144			*(char*)buffer = (char)pattern;
2145			(char*&)buffer += 1;
2146			bytes -= 1;
2147		}
2148	}
2149
2150	void Surface::clearColorBuffer(unsigned int color, unsigned int rgbaMask, int x0, int y0, int width, int height)
2151	{
2152		// FIXME: Also clear buffers in other formats?
2153
2154		// Not overlapping
2155		if(x0 > internal.width) return;
2156		if(y0 > internal.height) return;
2157		if(x0 + width < 0) return;
2158		if(y0 + height < 0) return;
2159
2160		// Clip against dimensions
2161		if(x0 < 0) {width += x0; x0 = 0;}
2162		if(x0 + width > internal.width) width = internal.width - x0;
2163		if(y0 < 0) {height += y0; y0 = 0;}
2164		if(y0 + height > internal.height) height = internal.height - y0;
2165
2166		const bool entire = x0 == 0 && y0 == 0 && width == internal.width && height == internal.height;
2167		const Lock lock = entire ? LOCK_DISCARD : LOCK_WRITEONLY;
2168
2169		int width2 = (internal.width + 1) & ~1;
2170
2171		int x1 = x0 + width;
2172		int y1 = y0 + height;
2173
2174		int bytes = 4 * (x1 - x0);
2175
2176	//	if(lockable || !quadLayoutEnabled)
2177		{
2178			unsigned char *buffer = (unsigned char*)lockInternal(x0, y0, 0, lock, PUBLIC);
2179
2180			unsigned char r8 = (color & 0x00FF0000) >> 16;
2181			unsigned char g8 = (color & 0x0000FF00) >> 8;
2182			unsigned char b8 = (color & 0x000000FF) >> 0;
2183			unsigned char a8 = (color & 0xFF000000) >> 24;
2184
2185			unsigned short r16 = (r8 << 8) + r8;
2186			unsigned short g16 = (g8 << 8) + g8;
2187			unsigned short b16 = (b8 << 8) + b8;
2188			unsigned short a16 = (a8 << 8) + a8;
2189
2190			float r32f = r8 / 255.0f;
2191			float g32f = g8 / 255.0f;
2192			float b32f = b8 / 255.0f;
2193			float a32f = a8 / 255.0f;
2194
2195			unsigned char g8r8[4] = {r8, g8, r8, g8};
2196			unsigned short g16r16[2] = {r16, g16};
2197
2198			for(int z = 0; z < internal.depth; z++)
2199			{
2200				unsigned char *target = buffer;
2201
2202				for(int y = y0; y < y1; y++)
2203				{
2204					switch(internal.format)
2205					{
2206					case FORMAT_NULL:
2207						break;
2208					case FORMAT_X8R8G8B8:
2209					case FORMAT_A8R8G8B8:
2210				//	case FORMAT_X8G8R8B8Q:   // FIXME
2211				//	case FORMAT_A8G8R8B8Q:   // FIXME
2212						if(rgbaMask == 0xF || (internal.format == FORMAT_X8R8G8B8 && rgbaMask == 0x7))
2213						{
2214							memfill(target, color, 4 * (x1 - x0));
2215						}
2216						else
2217						{
2218							unsigned int bgraMask = (rgbaMask & 0x1 ? 0x00FF0000 : 0) | (rgbaMask & 0x2 ? 0x0000FF00 : 0) | (rgbaMask & 0x4 ? 0x000000FF : 0) | (rgbaMask & 0x8 ? 0xFF000000 : 0);
2219							unsigned int invMask = ~bgraMask;
2220							unsigned int maskedColor = color & bgraMask;
2221							unsigned int *target32 = (unsigned int*)target;
2222
2223							for(int x = 0; x < width; x++)
2224							{
2225								target32[x] = maskedColor | (target32[x] & invMask);
2226							}
2227						}
2228						break;
2229					case FORMAT_G8R8:
2230						if((rgbaMask & 0x3) == 0x3)
2231						{
2232							memfill(target, (int&)g8r8, 2 * (x1 - x0));
2233						}
2234						else
2235						{
2236							unsigned short rgMask = (rgbaMask & 0x1 ? 0x000000FF : 0) | (rgbaMask & 0x2 ? 0x0000FF00 : 0);
2237							unsigned short invMask = ~rgMask;
2238							unsigned short maskedColor = (unsigned short&)g8r8 & rgMask;
2239							unsigned short *target16 = (unsigned short*)target;
2240
2241							for(int x = 0; x < width; x++)
2242							{
2243								target16[x] = maskedColor | (target16[x] & invMask);
2244							}
2245						}
2246						break;
2247					case FORMAT_G16R16:
2248						if((rgbaMask & 0x3) == 0x3)
2249						{
2250							memfill(target, (int&)g16r16, 4 * (x1 - x0));
2251						}
2252						else
2253						{
2254							unsigned int rgMask = (rgbaMask & 0x1 ? 0x0000FFFF : 0) | (rgbaMask & 0x2 ? 0xFFFF0000 : 0);
2255							unsigned int invMask = ~rgMask;
2256							unsigned int maskedColor = (unsigned int&)g16r16 & rgMask;
2257							unsigned int *target32 = (unsigned int*)target;
2258
2259							for(int x = 0; x < width; x++)
2260							{
2261								target32[x] = maskedColor | (target32[x] & invMask);
2262							}
2263						}
2264						break;
2265					case FORMAT_A16B16G16R16:
2266						if(rgbaMask == 0xF)
2267						{
2268							for(int x = 0; x < width; x++)
2269							{
2270								((unsigned short*)target)[4 * x + 0] = r16;
2271								((unsigned short*)target)[4 * x + 1] = g16;
2272								((unsigned short*)target)[4 * x + 2] = b16;
2273								((unsigned short*)target)[4 * x + 3] = a16;
2274							}
2275						}
2276						else
2277						{
2278							if(rgbaMask & 0x1) for(int x = 0; x < width; x++) ((unsigned short*)target)[4 * x + 0] = r16;
2279							if(rgbaMask & 0x2) for(int x = 0; x < width; x++) ((unsigned short*)target)[4 * x + 1] = g16;
2280							if(rgbaMask & 0x4) for(int x = 0; x < width; x++) ((unsigned short*)target)[4 * x + 2] = b16;
2281							if(rgbaMask & 0x8) for(int x = 0; x < width; x++) ((unsigned short*)target)[4 * x + 3] = a16;
2282						}
2283						break;
2284					case FORMAT_R32F:
2285						if(rgbaMask & 0x1)
2286						{
2287							for(int x = 0; x < width; x++)
2288							{
2289								((float*)target)[x] = r32f;
2290							}
2291						}
2292						break;
2293					case FORMAT_G32R32F:
2294						if((rgbaMask & 0x3) == 0x3)
2295						{
2296							for(int x = 0; x < width; x++)
2297							{
2298								((float*)target)[2 * x + 0] = r32f;
2299								((float*)target)[2 * x + 1] = g32f;
2300							}
2301						}
2302						else
2303						{
2304							if(rgbaMask & 0x1) for(int x = 0; x < width; x++) ((float*)target)[2 * x + 0] = r32f;
2305							if(rgbaMask & 0x2) for(int x = 0; x < width; x++) ((float*)target)[2 * x + 1] = g32f;
2306						}
2307						break;
2308					case FORMAT_A32B32G32R32F:
2309						if(rgbaMask == 0xF)
2310						{
2311							for(int x = 0; x < width; x++)
2312							{
2313								((float*)target)[4 * x + 0] = r32f;
2314								((float*)target)[4 * x + 1] = g32f;
2315								((float*)target)[4 * x + 2] = b32f;
2316								((float*)target)[4 * x + 3] = a32f;
2317							}
2318						}
2319						else
2320						{
2321							if(rgbaMask & 0x1) for(int x = 0; x < width; x++) ((float*)target)[4 * x + 0] = r32f;
2322							if(rgbaMask & 0x2) for(int x = 0; x < width; x++) ((float*)target)[4 * x + 1] = g32f;
2323							if(rgbaMask & 0x4) for(int x = 0; x < width; x++) ((float*)target)[4 * x + 2] = b32f;
2324							if(rgbaMask & 0x8) for(int x = 0; x < width; x++) ((float*)target)[4 * x + 3] = a32f;
2325						}
2326						break;
2327					default:
2328						ASSERT(false);
2329					}
2330
2331					target += internal.pitchB;
2332				}
2333
2334				buffer += internal.sliceB;
2335			}
2336
2337			unlockInternal();
2338		}
2339	/*	else
2340		{
2341		//	unsigned char *target = (unsigned char*&)buffer;
2342		//
2343		//	for(int y = y0; y < y1; y++)
2344		//	{
2345		//		for(int x = x0; x < x1; x++)
2346		//		{
2347		//			target[width2 * 4 * (y & ~1) + 2 * (y & 1) + 8 * (x & ~1) + (x & 1) + 0] =  (color & 0x000000FF) >> 0;
2348		//			target[width2 * 4 * (y & ~1) + 2 * (y & 1) + 8 * (x & ~1) + (x & 1) + 4] =  (color & 0x00FF0000) >> 16;
2349		//			target[width2 * 4 * (y & ~1) + 2 * (y & 1) + 8 * (x & ~1) + (x & 1) + 8] =  (color & 0x0000FF00) >> 8;
2350		//			target[width2 * 4 * (y & ~1) + 2 * (y & 1) + 8 * (x & ~1) + (x & 1) + 12] = (color & 0xFF000000) >> 24;
2351		//		}
2352		//	}
2353
2354			unsigned char colorQ[16];
2355
2356			colorQ[0] =  (color & 0x000000FF) >> 0;
2357			colorQ[1] =  (color & 0x000000FF) >> 0;
2358			colorQ[2] =  (color & 0x000000FF) >> 0;
2359			colorQ[3] =  (color & 0x000000FF) >> 0;
2360			colorQ[4] =  (color & 0x00FF0000) >> 16;
2361			colorQ[5] =  (color & 0x00FF0000) >> 16;
2362			colorQ[6] =  (color & 0x00FF0000) >> 16;
2363			colorQ[7] =  (color & 0x00FF0000) >> 16;
2364			colorQ[8] =  (color & 0x0000FF00) >> 8;
2365			colorQ[9] =  (color & 0x0000FF00) >> 8;
2366			colorQ[10] = (color & 0x0000FF00) >> 8;
2367			colorQ[11] = (color & 0x0000FF00) >> 8;
2368			colorQ[12] = (color & 0xFF000000) >> 24;
2369			colorQ[13] = (color & 0xFF000000) >> 24;
2370			colorQ[14] = (color & 0xFF000000) >> 24;
2371			colorQ[15] = (color & 0xFF000000) >> 24;
2372
2373			for(int y = y0; y < y1; y++)
2374			{
2375				unsigned char *target = (unsigned char*)lockInternal(0, 0, 0, lock) + width2 * 4 * (y & ~1) + 2 * (y & 1);   // FIXME: Unlock
2376
2377				if((y & 1) == 0 && y + 1 < y1)   // Fill quad line at once
2378				{
2379					if((x0 & 1) != 0)
2380					{
2381						target[8 * (x0 & ~1) + 1 + 0] =  (color & 0x000000FF) >> 0;
2382						target[8 * (x0 & ~1) + 1 + 4] =  (color & 0x00FF0000) >> 16;
2383						target[8 * (x0 & ~1) + 1 + 8] =  (color & 0x0000FF00) >> 8;
2384						target[8 * (x0 & ~1) + 1 + 12] = (color & 0xFF000000) >> 24;
2385
2386						target[8 * (x0 & ~1) + 3 + 0] =  (color & 0x000000FF) >> 0;
2387						target[8 * (x0 & ~1) + 3 + 4] =  (color & 0x00FF0000) >> 16;
2388						target[8 * (x0 & ~1) + 3 + 8] =  (color & 0x0000FF00) >> 8;
2389						target[8 * (x0 & ~1) + 3 + 12] = (color & 0xFF000000) >> 24;
2390					}
2391
2392					__asm
2393					{
2394						movq mm0, colorQ+0
2395						movq mm1, colorQ+8
2396
2397						mov eax, x0
2398						add eax, 1
2399						and eax, 0xFFFFFFFE
2400						cmp eax, x1
2401						jge qEnd
2402
2403						mov edi, target
2404
2405					qLoop:
2406						movntq [edi+8*eax+0], mm0
2407						movntq [edi+8*eax+8], mm1
2408
2409						add eax, 2
2410						cmp eax, x1
2411						jl qLoop
2412					qEnd:
2413						emms
2414					}
2415
2416					if((x1 & 1) != 0)
2417					{
2418						target[8 * (x1 & ~1) + 0 + 0] =  (color & 0x000000FF) >> 0;
2419						target[8 * (x1 & ~1) + 0 + 4] =  (color & 0x00FF0000) >> 16;
2420						target[8 * (x1 & ~1) + 0 + 8] =  (color & 0x0000FF00) >> 8;
2421						target[8 * (x1 & ~1) + 0 + 12] = (color & 0xFF000000) >> 24;
2422
2423						target[8 * (x1 & ~1) + 2 + 0] =  (color & 0x000000FF) >> 0;
2424						target[8 * (x1 & ~1) + 2 + 4] =  (color & 0x00FF0000) >> 16;
2425						target[8 * (x1 & ~1) + 2 + 8] =  (color & 0x0000FF00) >> 8;
2426						target[8 * (x1 & ~1) + 2 + 12] = (color & 0xFF000000) >> 24;
2427					}
2428
2429					y++;
2430				}
2431				else
2432				{
2433					for(int x = x0; x < x1; x++)
2434					{
2435						target[8 * (x & ~1) + (x & 1) + 0] =  (color & 0x000000FF) >> 0;
2436						target[8 * (x & ~1) + (x & 1) + 4] =  (color & 0x00FF0000) >> 16;
2437						target[8 * (x & ~1) + (x & 1) + 8] =  (color & 0x0000FF00) >> 8;
2438						target[8 * (x & ~1) + (x & 1) + 12] = (color & 0xFF000000) >> 24;
2439					}
2440				}
2441			}
2442		}*/
2443	}
2444
2445	void Surface::clearDepthBuffer(float depth, int x0, int y0, int width, int height)
2446	{
2447		// Not overlapping
2448		if(x0 > internal.width) return;
2449		if(y0 > internal.height) return;
2450		if(x0 + width < 0) return;
2451		if(y0 + height < 0) return;
2452
2453		// Clip against dimensions
2454		if(x0 < 0) {width += x0; x0 = 0;}
2455		if(x0 + width > internal.width) width = internal.width - x0;
2456		if(y0 < 0) {height += y0; y0 = 0;}
2457		if(y0 + height > internal.height) height = internal.height - y0;
2458
2459		const bool entire = x0 == 0 && y0 == 0 && width == internal.width && height == internal.height;
2460		const Lock lock = entire ? LOCK_DISCARD : LOCK_WRITEONLY;
2461
2462		int width2 = (internal.width + 1) & ~1;
2463
2464		int x1 = x0 + width;
2465		int y1 = y0 + height;
2466
2467		if(internal.format == FORMAT_D32F_LOCKABLE ||
2468		   internal.format == FORMAT_D32F_TEXTURE ||
2469		   internal.format == FORMAT_D32F_SHADOW)
2470		{
2471			float *target = (float*)lockInternal(0, 0, 0, lock, PUBLIC) + x0 + width2 * y0;
2472
2473			for(int z = 0; z < internal.depth; z++)
2474			{
2475				for(int y = y0; y < y1; y++)
2476				{
2477					memfill(target, (int&)depth, 4 * width);
2478					target += width2;
2479				}
2480			}
2481
2482			unlockInternal();
2483		}
2484		else   // Quad layout
2485		{
2486			if(complementaryDepthBuffer)
2487			{
2488				depth = 1 - depth;
2489			}
2490
2491			float *buffer = (float*)lockInternal(0, 0, 0, lock, PUBLIC);
2492
2493			for(int z = 0; z < internal.depth; z++)
2494			{
2495				for(int y = y0; y < y1; y++)
2496				{
2497					float *target = buffer + (y & ~1) * width2 + (y & 1) * 2;
2498
2499					if((y & 1) == 0 && y + 1 < y1)   // Fill quad line at once
2500					{
2501						if((x0 & 1) != 0)
2502						{
2503							target[(x0 & ~1) * 2 + 1] = depth;
2504							target[(x0 & ~1) * 2 + 3] = depth;
2505						}
2506
2507					//	for(int x2 = ((x0 + 1) & ~1) * 2; x2 < x1 * 2; x2 += 4)
2508					//	{
2509					//		target[x2 + 0] = depth;
2510					//		target[x2 + 1] = depth;
2511					//		target[x2 + 2] = depth;
2512					//		target[x2 + 3] = depth;
2513					//	}
2514
2515					//	__asm
2516					//	{
2517					//		movss xmm0, depth
2518					//		shufps xmm0, xmm0, 0x00
2519					//
2520					//		mov eax, x0
2521					//		add eax, 1
2522					//		and eax, 0xFFFFFFFE
2523					//		cmp eax, x1
2524					//		jge qEnd
2525					//
2526					//		mov edi, target
2527					//
2528					//	qLoop:
2529					//		movntps [edi+8*eax], xmm0
2530					//
2531					//		add eax, 2
2532					//		cmp eax, x1
2533					//		jl qLoop
2534					//	qEnd:
2535					//	}
2536
2537						memfill(&target[((x0 + 1) & ~1) * 2], (int&)depth, 8 * ((x1 & ~1) - ((x0 + 1) & ~1)));
2538
2539						if((x1 & 1) != 0)
2540						{
2541							target[(x1 & ~1) * 2 + 0] = depth;
2542							target[(x1 & ~1) * 2 + 2] = depth;
2543						}
2544
2545						y++;
2546					}
2547					else
2548					{
2549						for(int x = x0; x < x1; x++)
2550						{
2551							target[(x & ~1) * 2 + (x & 1)] = depth;
2552						}
2553					}
2554				}
2555
2556				buffer += internal.sliceP;
2557			}
2558
2559			unlockInternal();
2560		}
2561	}
2562
2563	void Surface::clearStencilBuffer(unsigned char s, unsigned char mask, int x0, int y0, int width, int height)
2564	{
2565		// Not overlapping
2566		if(x0 > internal.width) return;
2567		if(y0 > internal.height) return;
2568		if(x0 + width < 0) return;
2569		if(y0 + height < 0) return;
2570
2571		// Clip against dimensions
2572		if(x0 < 0) {width += x0; x0 = 0;}
2573		if(x0 + width > internal.width) width = internal.width - x0;
2574		if(y0 < 0) {height += y0; y0 = 0;}
2575		if(y0 + height > internal.height) height = internal.height - y0;
2576
2577		int width2 = (internal.width + 1) & ~1;
2578
2579		int x1 = x0 + width;
2580		int y1 = y0 + height;
2581
2582		unsigned char maskedS = s & mask;
2583		unsigned char invMask = ~mask;
2584		unsigned int fill = maskedS;
2585		fill = fill | (fill << 8) | (fill << 16) + (fill << 24);
2586
2587		if(false)
2588		{
2589			char *target = (char*)lockStencil(0, PUBLIC) + x0 + width2 * y0;
2590
2591			for(int z = 0; z < stencil.depth; z++)
2592			{
2593				for(int y = y0; y < y0 + height; y++)
2594				{
2595					if(mask == 0xFF)
2596					{
2597						memfill(target, fill, width);
2598					}
2599					else
2600					{
2601						for(int x = 0; x < width; x++)
2602						{
2603							target[x] = maskedS | (target[x] & invMask);
2604						}
2605					}
2606
2607					target += width2;
2608				}
2609			}
2610
2611			unlockStencil();
2612		}
2613		else   // Quad layout
2614		{
2615			char *buffer = (char*)lockStencil(0, PUBLIC);
2616
2617			if(mask == 0xFF)
2618			{
2619				for(int z = 0; z < stencil.depth; z++)
2620				{
2621					for(int y = y0; y < y1; y++)
2622					{
2623						char *target = buffer + (y & ~1) * width2 + (y & 1) * 2;
2624
2625						if((y & 1) == 0 && y + 1 < y1 && mask == 0xFF)   // Fill quad line at once
2626						{
2627							if((x0 & 1) != 0)
2628							{
2629								target[(x0 & ~1) * 2 + 1] = fill;
2630								target[(x0 & ~1) * 2 + 3] = fill;
2631							}
2632
2633							memfill(&target[((x0 + 1) & ~1) * 2], fill, ((x1 + 1) & ~1) * 2 - ((x0 + 1) & ~1) * 2);
2634
2635							if((x1 & 1) != 0)
2636							{
2637								target[(x1 & ~1) * 2 + 0] = fill;
2638								target[(x1 & ~1) * 2 + 2] = fill;
2639							}
2640
2641							y++;
2642						}
2643						else
2644						{
2645							for(int x = x0; x < x1; x++)
2646							{
2647								target[(x & ~1) * 2 + (x & 1)] = maskedS | (target[x] & invMask);
2648							}
2649						}
2650					}
2651
2652					buffer += stencil.sliceP;
2653				}
2654			}
2655
2656			unlockStencil();
2657		}
2658	}
2659
2660	void Surface::fill(const Color<float> &color, int x0, int y0, int width, int height)
2661	{
2662		unsigned char *row;
2663		Buffer *buffer;
2664
2665		if(internal.dirty)
2666		{
2667			row = (unsigned char*)lockInternal(x0, y0, 0, LOCK_WRITEONLY, PUBLIC);
2668			buffer = &internal;
2669		}
2670		else
2671		{
2672			row = (unsigned char*)lockExternal(x0, y0, 0, LOCK_WRITEONLY, PUBLIC);
2673			buffer = &external;
2674		}
2675
2676		if(buffer->bytes <= 4)
2677		{
2678			int c;
2679			buffer->write(&c, color);
2680
2681			if(buffer->bytes <= 1) c = (c << 8)  | c;
2682			if(buffer->bytes <= 2) c = (c << 16) | c;
2683
2684			for(int y = 0; y < height; y++)
2685			{
2686				memfill(row, c, width * buffer->bytes);
2687
2688				row += buffer->pitchB;
2689			}
2690		}
2691		else   // Generic
2692		{
2693			for(int y = 0; y < height; y++)
2694			{
2695				unsigned char *element = row;
2696
2697				for(int x = 0; x < width; x++)
2698				{
2699					buffer->write(element, color);
2700
2701					element += buffer->bytes;
2702				}
2703
2704				row += buffer->pitchB;
2705			}
2706		}
2707
2708		if(buffer == &internal)
2709		{
2710			unlockInternal();
2711		}
2712		else
2713		{
2714			unlockExternal();
2715		}
2716	}
2717
2718	Color<float> Surface::readExternal(int x, int y, int z) const
2719	{
2720		ASSERT(external.lock != LOCK_UNLOCKED);
2721
2722		return external.read(x, y, z);
2723	}
2724
2725	Color<float> Surface::readExternal(int x, int y) const
2726	{
2727		ASSERT(external.lock != LOCK_UNLOCKED);
2728
2729		return external.read(x, y);
2730	}
2731
2732	Color<float> Surface::sampleExternal(float x, float y, float z) const
2733	{
2734		ASSERT(external.lock != LOCK_UNLOCKED);
2735
2736		return external.sample(x, y, z);
2737	}
2738
2739	Color<float> Surface::sampleExternal(float x, float y) const
2740	{
2741		ASSERT(external.lock != LOCK_UNLOCKED);
2742
2743		return external.sample(x, y);
2744	}
2745
2746	void Surface::writeExternal(int x, int y, int z, const Color<float> &color)
2747	{
2748		ASSERT(external.lock != LOCK_UNLOCKED);
2749
2750		external.write(x, y, z, color);
2751	}
2752
2753	void Surface::writeExternal(int x, int y, const Color<float> &color)
2754	{
2755		ASSERT(external.lock != LOCK_UNLOCKED);
2756
2757		external.write(x, y, color);
2758	}
2759
2760	Color<float> Surface::readInternal(int x, int y, int z) const
2761	{
2762		ASSERT(internal.lock != LOCK_UNLOCKED);
2763
2764		return internal.read(x, y, z);
2765	}
2766
2767	Color<float> Surface::readInternal(int x, int y) const
2768	{
2769		ASSERT(internal.lock != LOCK_UNLOCKED);
2770
2771		return internal.read(x, y);
2772	}
2773
2774	Color<float> Surface::sampleInternal(float x, float y, float z) const
2775	{
2776		ASSERT(internal.lock != LOCK_UNLOCKED);
2777
2778		return internal.sample(x, y, z);
2779	}
2780
2781	Color<float> Surface::sampleInternal(float x, float y) const
2782	{
2783		ASSERT(internal.lock != LOCK_UNLOCKED);
2784
2785		return internal.sample(x, y);
2786	}
2787
2788	void Surface::writeInternal(int x, int y, int z, const Color<float> &color)
2789	{
2790		ASSERT(internal.lock != LOCK_UNLOCKED);
2791
2792		internal.write(x, y, z, color);
2793	}
2794
2795	void Surface::writeInternal(int x, int y, const Color<float> &color)
2796	{
2797		ASSERT(internal.lock != LOCK_UNLOCKED);
2798
2799		internal.write(x, y, color);
2800	}
2801
2802	bool Surface::hasStencil() const
2803	{
2804		return isStencil(external.format);
2805	}
2806
2807	bool Surface::hasDepth() const
2808	{
2809		return isDepth(external.format);
2810	}
2811
2812	bool Surface::hasPalette() const
2813	{
2814		return isPalette(external.format);
2815	}
2816
2817	bool Surface::isRenderTarget() const
2818	{
2819		return renderTarget;
2820	}
2821
2822	bool Surface::hasDirtyMipmaps() const
2823	{
2824		return dirtyMipmaps;
2825	}
2826
2827	void Surface::cleanMipmaps()
2828	{
2829		dirtyMipmaps = false;
2830	}
2831
2832	Resource *Surface::getResource()
2833	{
2834		return resource;
2835	}
2836
2837	bool Surface::identicalFormats() const
2838	{
2839		return external.format  == internal.format &&
2840		       external.width   == internal.width &&
2841			   external.height  == internal.height &&
2842			   external.depth   == internal.depth &&
2843			   external.pitchB  == internal.pitchB &&
2844			   external.sliceB  == internal.sliceB;
2845	}
2846
2847	Format Surface::selectInternalFormat(Format format) const
2848	{
2849		switch(format)
2850		{
2851		case FORMAT_NULL:
2852			return FORMAT_NULL;
2853		case FORMAT_P8:
2854		case FORMAT_A8P8:
2855		case FORMAT_A4R4G4B4:
2856		case FORMAT_A1R5G5B5:
2857		case FORMAT_A8R3G3B2:
2858			return FORMAT_A8R8G8B8;
2859		case FORMAT_A8:
2860			return FORMAT_A8;
2861		case FORMAT_R8:
2862			return FORMAT_R8;
2863		case FORMAT_A2R10G10B10:
2864		case FORMAT_A2B10G10R10:
2865		case FORMAT_A16B16G16R16:
2866			return FORMAT_A16B16G16R16;
2867		case FORMAT_G8R8:
2868			return FORMAT_G8R8;
2869		case FORMAT_G16R16:
2870			return FORMAT_G16R16;
2871		case FORMAT_A8R8G8B8:
2872		case FORMAT_A8B8G8R8:
2873			if(lockable || !quadLayoutEnabled)
2874			{
2875				return FORMAT_A8R8G8B8;
2876			}
2877			else
2878			{
2879				return FORMAT_A8G8R8B8Q;
2880			}
2881		case FORMAT_R3G3B2:
2882		case FORMAT_R5G6B5:
2883		case FORMAT_R8G8B8:
2884		case FORMAT_X4R4G4B4:
2885		case FORMAT_X1R5G5B5:
2886		case FORMAT_X8R8G8B8:
2887		case FORMAT_X8B8G8R8:
2888			if(lockable || !quadLayoutEnabled)
2889			{
2890				return FORMAT_X8R8G8B8;
2891			}
2892			else
2893			{
2894				return FORMAT_X8G8R8B8Q;
2895			}
2896		// Compressed formats
2897		#if S3TC_SUPPORT
2898		case FORMAT_DXT1:
2899		case FORMAT_DXT3:
2900		case FORMAT_DXT5:
2901			return FORMAT_A8R8G8B8;
2902		case FORMAT_ATI1:
2903			return FORMAT_R8;
2904		case FORMAT_ATI2:
2905			return FORMAT_G8R8;
2906		#endif
2907		// Bumpmap formats
2908		case FORMAT_V8U8:			return FORMAT_V8U8;
2909		case FORMAT_L6V5U5:			return FORMAT_X8L8V8U8;
2910		case FORMAT_Q8W8V8U8:		return FORMAT_Q8W8V8U8;
2911		case FORMAT_X8L8V8U8:		return FORMAT_X8L8V8U8;
2912		case FORMAT_V16U16:			return FORMAT_V16U16;
2913		case FORMAT_A2W10V10U10:	return FORMAT_A16W16V16U16;
2914		case FORMAT_Q16W16V16U16:	return FORMAT_Q16W16V16U16;
2915		// Floating-point formats
2916		case FORMAT_R16F:			return FORMAT_R32F;
2917		case FORMAT_G16R16F:		return FORMAT_G32R32F;
2918		case FORMAT_A16B16G16R16F:	return FORMAT_A32B32G32R32F;
2919		case FORMAT_R32F:			return FORMAT_R32F;
2920		case FORMAT_G32R32F:		return FORMAT_G32R32F;
2921		case FORMAT_A32B32G32R32F:	return FORMAT_A32B32G32R32F;
2922		// Luminance formats
2923		case FORMAT_L8:				return FORMAT_L8;
2924		case FORMAT_A4L4:			return FORMAT_A8L8;
2925		case FORMAT_L16:			return FORMAT_L16;
2926		case FORMAT_A8L8:			return FORMAT_A8L8;
2927		// Depth/stencil formats
2928		case FORMAT_D16:
2929		case FORMAT_D32:
2930		case FORMAT_D24X8:
2931		case FORMAT_D24S8:
2932		case FORMAT_D24FS8:
2933			if(hasParent)   // Texture
2934			{
2935				return FORMAT_D32F_SHADOW;
2936			}
2937			else if(complementaryDepthBuffer)
2938			{
2939				return FORMAT_D32F_COMPLEMENTARY;
2940			}
2941			else
2942			{
2943				return FORMAT_D32F;
2944			}
2945		case FORMAT_D32F_LOCKABLE:	return FORMAT_D32F_LOCKABLE;
2946		case FORMAT_INTZ:			return FORMAT_D32F_TEXTURE;
2947		case FORMAT_DF24:			return FORMAT_D32F_SHADOW;
2948		case FORMAT_DF16:			return FORMAT_D32F_SHADOW;
2949		default:
2950			ASSERT(false);
2951		}
2952
2953		return FORMAT_NULL;
2954	}
2955
2956	void Surface::setTexturePalette(unsigned int *palette)
2957	{
2958		Surface::palette = palette;
2959		Surface::paletteID++;
2960	}
2961
2962	void Surface::resolve()
2963	{
2964		if(internal.depth <= 1 || !internal.dirty || !renderTarget || internal.format == FORMAT_NULL)
2965		{
2966			return;
2967		}
2968
2969		void *source = internal.lockRect(0, 0, 0, LOCK_READWRITE);
2970
2971		int quality = internal.depth;
2972		int width = internal.width;
2973		int height = internal.height;
2974		int pitch = internal.pitchB;
2975		int slice = internal.sliceB;
2976
2977		unsigned char *source0 = (unsigned char*)source;
2978		unsigned char *source1 = source0 + slice;
2979		unsigned char *source2 = source1 + slice;
2980		unsigned char *source3 = source2 + slice;
2981		unsigned char *source4 = source3 + slice;
2982		unsigned char *source5 = source4 + slice;
2983		unsigned char *source6 = source5 + slice;
2984		unsigned char *source7 = source6 + slice;
2985		unsigned char *source8 = source7 + slice;
2986		unsigned char *source9 = source8 + slice;
2987		unsigned char *sourceA = source9 + slice;
2988		unsigned char *sourceB = sourceA + slice;
2989		unsigned char *sourceC = sourceB + slice;
2990		unsigned char *sourceD = sourceC + slice;
2991		unsigned char *sourceE = sourceD + slice;
2992		unsigned char *sourceF = sourceE + slice;
2993
2994		if(internal.format == FORMAT_X8R8G8B8 || internal.format == FORMAT_A8R8G8B8)
2995		{
2996			if(CPUID::supportsSSE2() && (width % 4) == 0)
2997			{
2998				if(internal.depth == 2)
2999				{
3000					for(int y = 0; y < height; y++)
3001					{
3002						for(int x = 0; x < width; x += 4)
3003						{
3004							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
3005							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
3006
3007							c0 = _mm_avg_epu8(c0, c1);
3008
3009							_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
3010						}
3011
3012						source0 += pitch;
3013						source1 += pitch;
3014					}
3015				}
3016				else if(internal.depth == 4)
3017				{
3018					for(int y = 0; y < height; y++)
3019					{
3020						for(int x = 0; x < width; x += 4)
3021						{
3022							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
3023							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
3024							__m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x));
3025							__m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x));
3026
3027							c0 = _mm_avg_epu8(c0, c1);
3028							c2 = _mm_avg_epu8(c2, c3);
3029							c0 = _mm_avg_epu8(c0, c2);
3030
3031							_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
3032						}
3033
3034						source0 += pitch;
3035						source1 += pitch;
3036						source2 += pitch;
3037						source3 += pitch;
3038					}
3039				}
3040				else if(internal.depth == 8)
3041				{
3042					for(int y = 0; y < height; y++)
3043					{
3044						for(int x = 0; x < width; x += 4)
3045						{
3046							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
3047							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
3048							__m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x));
3049							__m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x));
3050							__m128i c4 = _mm_load_si128((__m128i*)(source4 + 4 * x));
3051							__m128i c5 = _mm_load_si128((__m128i*)(source5 + 4 * x));
3052							__m128i c6 = _mm_load_si128((__m128i*)(source6 + 4 * x));
3053							__m128i c7 = _mm_load_si128((__m128i*)(source7 + 4 * x));
3054
3055							c0 = _mm_avg_epu8(c0, c1);
3056							c2 = _mm_avg_epu8(c2, c3);
3057							c4 = _mm_avg_epu8(c4, c5);
3058							c6 = _mm_avg_epu8(c6, c7);
3059							c0 = _mm_avg_epu8(c0, c2);
3060							c4 = _mm_avg_epu8(c4, c6);
3061							c0 = _mm_avg_epu8(c0, c4);
3062
3063							_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
3064						}
3065
3066						source0 += pitch;
3067						source1 += pitch;
3068						source2 += pitch;
3069						source3 += pitch;
3070						source4 += pitch;
3071						source5 += pitch;
3072						source6 += pitch;
3073						source7 += pitch;
3074					}
3075				}
3076				else if(internal.depth == 16)
3077				{
3078					for(int y = 0; y < height; y++)
3079					{
3080						for(int x = 0; x < width; x += 4)
3081						{
3082							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
3083							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
3084							__m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x));
3085							__m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x));
3086							__m128i c4 = _mm_load_si128((__m128i*)(source4 + 4 * x));
3087							__m128i c5 = _mm_load_si128((__m128i*)(source5 + 4 * x));
3088							__m128i c6 = _mm_load_si128((__m128i*)(source6 + 4 * x));
3089							__m128i c7 = _mm_load_si128((__m128i*)(source7 + 4 * x));
3090							__m128i c8 = _mm_load_si128((__m128i*)(source8 + 4 * x));
3091							__m128i c9 = _mm_load_si128((__m128i*)(source9 + 4 * x));
3092							__m128i cA = _mm_load_si128((__m128i*)(sourceA + 4 * x));
3093							__m128i cB = _mm_load_si128((__m128i*)(sourceB + 4 * x));
3094							__m128i cC = _mm_load_si128((__m128i*)(sourceC + 4 * x));
3095							__m128i cD = _mm_load_si128((__m128i*)(sourceD + 4 * x));
3096							__m128i cE = _mm_load_si128((__m128i*)(sourceE + 4 * x));
3097							__m128i cF = _mm_load_si128((__m128i*)(sourceF + 4 * x));
3098
3099							c0 = _mm_avg_epu8(c0, c1);
3100							c2 = _mm_avg_epu8(c2, c3);
3101							c4 = _mm_avg_epu8(c4, c5);
3102							c6 = _mm_avg_epu8(c6, c7);
3103							c8 = _mm_avg_epu8(c8, c9);
3104							cA = _mm_avg_epu8(cA, cB);
3105							cC = _mm_avg_epu8(cC, cD);
3106							cE = _mm_avg_epu8(cE, cF);
3107							c0 = _mm_avg_epu8(c0, c2);
3108							c4 = _mm_avg_epu8(c4, c6);
3109							c8 = _mm_avg_epu8(c8, cA);
3110							cC = _mm_avg_epu8(cC, cE);
3111							c0 = _mm_avg_epu8(c0, c4);
3112							c8 = _mm_avg_epu8(c8, cC);
3113							c0 = _mm_avg_epu8(c0, c8);
3114
3115							_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
3116						}
3117
3118						source0 += pitch;
3119						source1 += pitch;
3120						source2 += pitch;
3121						source3 += pitch;
3122						source4 += pitch;
3123						source5 += pitch;
3124						source6 += pitch;
3125						source7 += pitch;
3126						source8 += pitch;
3127						source9 += pitch;
3128						sourceA += pitch;
3129						sourceB += pitch;
3130						sourceC += pitch;
3131						sourceD += pitch;
3132						sourceE += pitch;
3133						sourceF += pitch;
3134					}
3135				}
3136				else ASSERT(false);
3137			}
3138			else
3139			{
3140				#define AVERAGE(x, y) (((x) & (y)) + ((((x) ^ (y)) >> 1) & 0x7F7F7F7F) + (((x) ^ (y)) & 0x01010101))
3141
3142				if(internal.depth == 2)
3143				{
3144					for(int y = 0; y < height; y++)
3145					{
3146						for(int x = 0; x < width; x++)
3147						{
3148							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
3149							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
3150
3151							c0 = AVERAGE(c0, c1);
3152
3153							*(unsigned int*)(source0 + 4 * x) = c0;
3154						}
3155
3156						source0 += pitch;
3157						source1 += pitch;
3158					}
3159				}
3160				else if(internal.depth == 4)
3161				{
3162					for(int y = 0; y < height; y++)
3163					{
3164						for(int x = 0; x < width; x++)
3165						{
3166							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
3167							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
3168							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
3169							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
3170
3171							c0 = AVERAGE(c0, c1);
3172							c2 = AVERAGE(c2, c3);
3173							c0 = AVERAGE(c0, c2);
3174
3175							*(unsigned int*)(source0 + 4 * x) = c0;
3176						}
3177
3178						source0 += pitch;
3179						source1 += pitch;
3180						source2 += pitch;
3181						source3 += pitch;
3182					}
3183				}
3184				else if(internal.depth == 8)
3185				{
3186					for(int y = 0; y < height; y++)
3187					{
3188						for(int x = 0; x < width; x++)
3189						{
3190							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
3191							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
3192							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
3193							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
3194							unsigned int c4 = *(unsigned int*)(source4 + 4 * x);
3195							unsigned int c5 = *(unsigned int*)(source5 + 4 * x);
3196							unsigned int c6 = *(unsigned int*)(source6 + 4 * x);
3197							unsigned int c7 = *(unsigned int*)(source7 + 4 * x);
3198
3199							c0 = AVERAGE(c0, c1);
3200							c2 = AVERAGE(c2, c3);
3201							c4 = AVERAGE(c4, c5);
3202							c6 = AVERAGE(c6, c7);
3203							c0 = AVERAGE(c0, c2);
3204							c4 = AVERAGE(c4, c6);
3205							c0 = AVERAGE(c0, c4);
3206
3207							*(unsigned int*)(source0 + 4 * x) = c0;
3208						}
3209
3210						source0 += pitch;
3211						source1 += pitch;
3212						source2 += pitch;
3213						source3 += pitch;
3214						source4 += pitch;
3215						source5 += pitch;
3216						source6 += pitch;
3217						source7 += pitch;
3218					}
3219				}
3220				else if(internal.depth == 16)
3221				{
3222					for(int y = 0; y < height; y++)
3223					{
3224						for(int x = 0; x < width; x++)
3225						{
3226							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
3227							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
3228							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
3229							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
3230							unsigned int c4 = *(unsigned int*)(source4 + 4 * x);
3231							unsigned int c5 = *(unsigned int*)(source5 + 4 * x);
3232							unsigned int c6 = *(unsigned int*)(source6 + 4 * x);
3233							unsigned int c7 = *(unsigned int*)(source7 + 4 * x);
3234							unsigned int c8 = *(unsigned int*)(source8 + 4 * x);
3235							unsigned int c9 = *(unsigned int*)(source9 + 4 * x);
3236							unsigned int cA = *(unsigned int*)(sourceA + 4 * x);
3237							unsigned int cB = *(unsigned int*)(sourceB + 4 * x);
3238							unsigned int cC = *(unsigned int*)(sourceC + 4 * x);
3239							unsigned int cD = *(unsigned int*)(sourceD + 4 * x);
3240							unsigned int cE = *(unsigned int*)(sourceE + 4 * x);
3241							unsigned int cF = *(unsigned int*)(sourceF + 4 * x);
3242
3243							c0 = AVERAGE(c0, c1);
3244							c2 = AVERAGE(c2, c3);
3245							c4 = AVERAGE(c4, c5);
3246							c6 = AVERAGE(c6, c7);
3247							c8 = AVERAGE(c8, c9);
3248							cA = AVERAGE(cA, cB);
3249							cC = AVERAGE(cC, cD);
3250							cE = AVERAGE(cE, cF);
3251							c0 = AVERAGE(c0, c2);
3252							c4 = AVERAGE(c4, c6);
3253							c8 = AVERAGE(c8, cA);
3254							cC = AVERAGE(cC, cE);
3255							c0 = AVERAGE(c0, c4);
3256							c8 = AVERAGE(c8, cC);
3257							c0 = AVERAGE(c0, c8);
3258
3259							*(unsigned int*)(source0 + 4 * x) = c0;
3260						}
3261
3262						source0 += pitch;
3263						source1 += pitch;
3264						source2 += pitch;
3265						source3 += pitch;
3266						source4 += pitch;
3267						source5 += pitch;
3268						source6 += pitch;
3269						source7 += pitch;
3270						source8 += pitch;
3271						source9 += pitch;
3272						sourceA += pitch;
3273						sourceB += pitch;
3274						sourceC += pitch;
3275						sourceD += pitch;
3276						sourceE += pitch;
3277						sourceF += pitch;
3278					}
3279				}
3280				else ASSERT(false);
3281
3282				#undef AVERAGE
3283			}
3284		}
3285		else if(internal.format == FORMAT_G16R16)
3286		{
3287			if(CPUID::supportsSSE2() && (width % 4) == 0)
3288			{
3289				if(internal.depth == 2)
3290				{
3291					for(int y = 0; y < height; y++)
3292					{
3293						for(int x = 0; x < width; x += 4)
3294						{
3295							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
3296							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
3297
3298							c0 = _mm_avg_epu16(c0, c1);
3299
3300							_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
3301						}
3302
3303						source0 += pitch;
3304						source1 += pitch;
3305					}
3306				}
3307				else if(internal.depth == 4)
3308				{
3309					for(int y = 0; y < height; y++)
3310					{
3311						for(int x = 0; x < width; x += 4)
3312						{
3313							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
3314							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
3315							__m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x));
3316							__m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x));
3317
3318							c0 = _mm_avg_epu16(c0, c1);
3319							c2 = _mm_avg_epu16(c2, c3);
3320							c0 = _mm_avg_epu16(c0, c2);
3321
3322							_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
3323						}
3324
3325						source0 += pitch;
3326						source1 += pitch;
3327						source2 += pitch;
3328						source3 += pitch;
3329					}
3330				}
3331				else if(internal.depth == 8)
3332				{
3333					for(int y = 0; y < height; y++)
3334					{
3335						for(int x = 0; x < width; x += 4)
3336						{
3337							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
3338							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
3339							__m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x));
3340							__m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x));
3341							__m128i c4 = _mm_load_si128((__m128i*)(source4 + 4 * x));
3342							__m128i c5 = _mm_load_si128((__m128i*)(source5 + 4 * x));
3343							__m128i c6 = _mm_load_si128((__m128i*)(source6 + 4 * x));
3344							__m128i c7 = _mm_load_si128((__m128i*)(source7 + 4 * x));
3345
3346							c0 = _mm_avg_epu16(c0, c1);
3347							c2 = _mm_avg_epu16(c2, c3);
3348							c4 = _mm_avg_epu16(c4, c5);
3349							c6 = _mm_avg_epu16(c6, c7);
3350							c0 = _mm_avg_epu16(c0, c2);
3351							c4 = _mm_avg_epu16(c4, c6);
3352							c0 = _mm_avg_epu16(c0, c4);
3353
3354							_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
3355						}
3356
3357						source0 += pitch;
3358						source1 += pitch;
3359						source2 += pitch;
3360						source3 += pitch;
3361						source4 += pitch;
3362						source5 += pitch;
3363						source6 += pitch;
3364						source7 += pitch;
3365					}
3366				}
3367				else if(internal.depth == 16)
3368				{
3369					for(int y = 0; y < height; y++)
3370					{
3371						for(int x = 0; x < width; x += 4)
3372						{
3373							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
3374							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
3375							__m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x));
3376							__m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x));
3377							__m128i c4 = _mm_load_si128((__m128i*)(source4 + 4 * x));
3378							__m128i c5 = _mm_load_si128((__m128i*)(source5 + 4 * x));
3379							__m128i c6 = _mm_load_si128((__m128i*)(source6 + 4 * x));
3380							__m128i c7 = _mm_load_si128((__m128i*)(source7 + 4 * x));
3381							__m128i c8 = _mm_load_si128((__m128i*)(source8 + 4 * x));
3382							__m128i c9 = _mm_load_si128((__m128i*)(source9 + 4 * x));
3383							__m128i cA = _mm_load_si128((__m128i*)(sourceA + 4 * x));
3384							__m128i cB = _mm_load_si128((__m128i*)(sourceB + 4 * x));
3385							__m128i cC = _mm_load_si128((__m128i*)(sourceC + 4 * x));
3386							__m128i cD = _mm_load_si128((__m128i*)(sourceD + 4 * x));
3387							__m128i cE = _mm_load_si128((__m128i*)(sourceE + 4 * x));
3388							__m128i cF = _mm_load_si128((__m128i*)(sourceF + 4 * x));
3389
3390							c0 = _mm_avg_epu16(c0, c1);
3391							c2 = _mm_avg_epu16(c2, c3);
3392							c4 = _mm_avg_epu16(c4, c5);
3393							c6 = _mm_avg_epu16(c6, c7);
3394							c8 = _mm_avg_epu16(c8, c9);
3395							cA = _mm_avg_epu16(cA, cB);
3396							cC = _mm_avg_epu16(cC, cD);
3397							cE = _mm_avg_epu16(cE, cF);
3398							c0 = _mm_avg_epu16(c0, c2);
3399							c4 = _mm_avg_epu16(c4, c6);
3400							c8 = _mm_avg_epu16(c8, cA);
3401							cC = _mm_avg_epu16(cC, cE);
3402							c0 = _mm_avg_epu16(c0, c4);
3403							c8 = _mm_avg_epu16(c8, cC);
3404							c0 = _mm_avg_epu16(c0, c8);
3405
3406							_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
3407						}
3408
3409						source0 += pitch;
3410						source1 += pitch;
3411						source2 += pitch;
3412						source3 += pitch;
3413						source4 += pitch;
3414						source5 += pitch;
3415						source6 += pitch;
3416						source7 += pitch;
3417						source8 += pitch;
3418						source9 += pitch;
3419						sourceA += pitch;
3420						sourceB += pitch;
3421						sourceC += pitch;
3422						sourceD += pitch;
3423						sourceE += pitch;
3424						sourceF += pitch;
3425					}
3426				}
3427				else ASSERT(false);
3428			}
3429			else
3430			{
3431				#define AVERAGE(x, y) (((x) & (y)) + ((((x) ^ (y)) >> 1) & 0x7FFF7FFF) + (((x) ^ (y)) & 0x00010001))
3432
3433				if(internal.depth == 2)
3434				{
3435					for(int y = 0; y < height; y++)
3436					{
3437						for(int x = 0; x < width; x++)
3438						{
3439							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
3440							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
3441
3442							c0 = AVERAGE(c0, c1);
3443
3444							*(unsigned int*)(source0 + 4 * x) = c0;
3445						}
3446
3447						source0 += pitch;
3448						source1 += pitch;
3449					}
3450				}
3451				else if(internal.depth == 4)
3452				{
3453					for(int y = 0; y < height; y++)
3454					{
3455						for(int x = 0; x < width; x++)
3456						{
3457							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
3458							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
3459							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
3460							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
3461
3462							c0 = AVERAGE(c0, c1);
3463							c2 = AVERAGE(c2, c3);
3464							c0 = AVERAGE(c0, c2);
3465
3466							*(unsigned int*)(source0 + 4 * x) = c0;
3467						}
3468
3469						source0 += pitch;
3470						source1 += pitch;
3471						source2 += pitch;
3472						source3 += pitch;
3473					}
3474				}
3475				else if(internal.depth == 8)
3476				{
3477					for(int y = 0; y < height; y++)
3478					{
3479						for(int x = 0; x < width; x++)
3480						{
3481							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
3482							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
3483							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
3484							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
3485							unsigned int c4 = *(unsigned int*)(source4 + 4 * x);
3486							unsigned int c5 = *(unsigned int*)(source5 + 4 * x);
3487							unsigned int c6 = *(unsigned int*)(source6 + 4 * x);
3488							unsigned int c7 = *(unsigned int*)(source7 + 4 * x);
3489
3490							c0 = AVERAGE(c0, c1);
3491							c2 = AVERAGE(c2, c3);
3492							c4 = AVERAGE(c4, c5);
3493							c6 = AVERAGE(c6, c7);
3494							c0 = AVERAGE(c0, c2);
3495							c4 = AVERAGE(c4, c6);
3496							c0 = AVERAGE(c0, c4);
3497
3498							*(unsigned int*)(source0 + 4 * x) = c0;
3499						}
3500
3501						source0 += pitch;
3502						source1 += pitch;
3503						source2 += pitch;
3504						source3 += pitch;
3505						source4 += pitch;
3506						source5 += pitch;
3507						source6 += pitch;
3508						source7 += pitch;
3509					}
3510				}
3511				else if(internal.depth == 16)
3512				{
3513					for(int y = 0; y < height; y++)
3514					{
3515						for(int x = 0; x < width; x++)
3516						{
3517							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
3518							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
3519							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
3520							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
3521							unsigned int c4 = *(unsigned int*)(source4 + 4 * x);
3522							unsigned int c5 = *(unsigned int*)(source5 + 4 * x);
3523							unsigned int c6 = *(unsigned int*)(source6 + 4 * x);
3524							unsigned int c7 = *(unsigned int*)(source7 + 4 * x);
3525							unsigned int c8 = *(unsigned int*)(source8 + 4 * x);
3526							unsigned int c9 = *(unsigned int*)(source9 + 4 * x);
3527							unsigned int cA = *(unsigned int*)(sourceA + 4 * x);
3528							unsigned int cB = *(unsigned int*)(sourceB + 4 * x);
3529							unsigned int cC = *(unsigned int*)(sourceC + 4 * x);
3530							unsigned int cD = *(unsigned int*)(sourceD + 4 * x);
3531							unsigned int cE = *(unsigned int*)(sourceE + 4 * x);
3532							unsigned int cF = *(unsigned int*)(sourceF + 4 * x);
3533
3534							c0 = AVERAGE(c0, c1);
3535							c2 = AVERAGE(c2, c3);
3536							c4 = AVERAGE(c4, c5);
3537							c6 = AVERAGE(c6, c7);
3538							c8 = AVERAGE(c8, c9);
3539							cA = AVERAGE(cA, cB);
3540							cC = AVERAGE(cC, cD);
3541							cE = AVERAGE(cE, cF);
3542							c0 = AVERAGE(c0, c2);
3543							c4 = AVERAGE(c4, c6);
3544							c8 = AVERAGE(c8, cA);
3545							cC = AVERAGE(cC, cE);
3546							c0 = AVERAGE(c0, c4);
3547							c8 = AVERAGE(c8, cC);
3548							c0 = AVERAGE(c0, c8);
3549
3550							*(unsigned int*)(source0 + 4 * x) = c0;
3551						}
3552
3553						source0 += pitch;
3554						source1 += pitch;
3555						source2 += pitch;
3556						source3 += pitch;
3557						source4 += pitch;
3558						source5 += pitch;
3559						source6 += pitch;
3560						source7 += pitch;
3561						source8 += pitch;
3562						source9 += pitch;
3563						sourceA += pitch;
3564						sourceB += pitch;
3565						sourceC += pitch;
3566						sourceD += pitch;
3567						sourceE += pitch;
3568						sourceF += pitch;
3569					}
3570				}
3571				else ASSERT(false);
3572
3573				#undef AVERAGE
3574			}
3575		}
3576		else if(internal.format == FORMAT_A16B16G16R16)
3577		{
3578			if(CPUID::supportsSSE2() && (width % 2) == 0)
3579			{
3580				if(internal.depth == 2)
3581				{
3582					for(int y = 0; y < height; y++)
3583					{
3584						for(int x = 0; x < width; x += 2)
3585						{
3586							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 8 * x));
3587							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 8 * x));
3588
3589							c0 = _mm_avg_epu16(c0, c1);
3590
3591							_mm_store_si128((__m128i*)(source0 + 8 * x), c0);
3592						}
3593
3594						source0 += pitch;
3595						source1 += pitch;
3596					}
3597				}
3598				else if(internal.depth == 4)
3599				{
3600					for(int y = 0; y < height; y++)
3601					{
3602						for(int x = 0; x < width; x += 2)
3603						{
3604							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 8 * x));
3605							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 8 * x));
3606							__m128i c2 = _mm_load_si128((__m128i*)(source2 + 8 * x));
3607							__m128i c3 = _mm_load_si128((__m128i*)(source3 + 8 * x));
3608
3609							c0 = _mm_avg_epu16(c0, c1);
3610							c2 = _mm_avg_epu16(c2, c3);
3611							c0 = _mm_avg_epu16(c0, c2);
3612
3613							_mm_store_si128((__m128i*)(source0 + 8 * x), c0);
3614						}
3615
3616						source0 += pitch;
3617						source1 += pitch;
3618						source2 += pitch;
3619						source3 += pitch;
3620					}
3621				}
3622				else if(internal.depth == 8)
3623				{
3624					for(int y = 0; y < height; y++)
3625					{
3626						for(int x = 0; x < width; x += 2)
3627						{
3628							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 8 * x));
3629							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 8 * x));
3630							__m128i c2 = _mm_load_si128((__m128i*)(source2 + 8 * x));
3631							__m128i c3 = _mm_load_si128((__m128i*)(source3 + 8 * x));
3632							__m128i c4 = _mm_load_si128((__m128i*)(source4 + 8 * x));
3633							__m128i c5 = _mm_load_si128((__m128i*)(source5 + 8 * x));
3634							__m128i c6 = _mm_load_si128((__m128i*)(source6 + 8 * x));
3635							__m128i c7 = _mm_load_si128((__m128i*)(source7 + 8 * x));
3636
3637							c0 = _mm_avg_epu16(c0, c1);
3638							c2 = _mm_avg_epu16(c2, c3);
3639							c4 = _mm_avg_epu16(c4, c5);
3640							c6 = _mm_avg_epu16(c6, c7);
3641							c0 = _mm_avg_epu16(c0, c2);
3642							c4 = _mm_avg_epu16(c4, c6);
3643							c0 = _mm_avg_epu16(c0, c4);
3644
3645							_mm_store_si128((__m128i*)(source0 + 8 * x), c0);
3646						}
3647
3648						source0 += pitch;
3649						source1 += pitch;
3650						source2 += pitch;
3651						source3 += pitch;
3652						source4 += pitch;
3653						source5 += pitch;
3654						source6 += pitch;
3655						source7 += pitch;
3656					}
3657				}
3658				else if(internal.depth == 16)
3659				{
3660					for(int y = 0; y < height; y++)
3661					{
3662						for(int x = 0; x < width; x += 2)
3663						{
3664							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 8 * x));
3665							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 8 * x));
3666							__m128i c2 = _mm_load_si128((__m128i*)(source2 + 8 * x));
3667							__m128i c3 = _mm_load_si128((__m128i*)(source3 + 8 * x));
3668							__m128i c4 = _mm_load_si128((__m128i*)(source4 + 8 * x));
3669							__m128i c5 = _mm_load_si128((__m128i*)(source5 + 8 * x));
3670							__m128i c6 = _mm_load_si128((__m128i*)(source6 + 8 * x));
3671							__m128i c7 = _mm_load_si128((__m128i*)(source7 + 8 * x));
3672							__m128i c8 = _mm_load_si128((__m128i*)(source8 + 8 * x));
3673							__m128i c9 = _mm_load_si128((__m128i*)(source9 + 8 * x));
3674							__m128i cA = _mm_load_si128((__m128i*)(sourceA + 8 * x));
3675							__m128i cB = _mm_load_si128((__m128i*)(sourceB + 8 * x));
3676							__m128i cC = _mm_load_si128((__m128i*)(sourceC + 8 * x));
3677							__m128i cD = _mm_load_si128((__m128i*)(sourceD + 8 * x));
3678							__m128i cE = _mm_load_si128((__m128i*)(sourceE + 8 * x));
3679							__m128i cF = _mm_load_si128((__m128i*)(sourceF + 8 * x));
3680
3681							c0 = _mm_avg_epu16(c0, c1);
3682							c2 = _mm_avg_epu16(c2, c3);
3683							c4 = _mm_avg_epu16(c4, c5);
3684							c6 = _mm_avg_epu16(c6, c7);
3685							c8 = _mm_avg_epu16(c8, c9);
3686							cA = _mm_avg_epu16(cA, cB);
3687							cC = _mm_avg_epu16(cC, cD);
3688							cE = _mm_avg_epu16(cE, cF);
3689							c0 = _mm_avg_epu16(c0, c2);
3690							c4 = _mm_avg_epu16(c4, c6);
3691							c8 = _mm_avg_epu16(c8, cA);
3692							cC = _mm_avg_epu16(cC, cE);
3693							c0 = _mm_avg_epu16(c0, c4);
3694							c8 = _mm_avg_epu16(c8, cC);
3695							c0 = _mm_avg_epu16(c0, c8);
3696
3697							_mm_store_si128((__m128i*)(source0 + 8 * x), c0);
3698						}
3699
3700						source0 += pitch;
3701						source1 += pitch;
3702						source2 += pitch;
3703						source3 += pitch;
3704						source4 += pitch;
3705						source5 += pitch;
3706						source6 += pitch;
3707						source7 += pitch;
3708						source8 += pitch;
3709						source9 += pitch;
3710						sourceA += pitch;
3711						sourceB += pitch;
3712						sourceC += pitch;
3713						sourceD += pitch;
3714						sourceE += pitch;
3715						sourceF += pitch;
3716					}
3717				}
3718				else ASSERT(false);
3719			}
3720			else
3721			{
3722				#define AVERAGE(x, y) (((x) & (y)) + ((((x) ^ (y)) >> 1) & 0x7FFF7FFF) + (((x) ^ (y)) & 0x00010001))
3723
3724				if(internal.depth == 2)
3725				{
3726					for(int y = 0; y < height; y++)
3727					{
3728						for(int x = 0; x < 2 * width; x++)
3729						{
3730							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
3731							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
3732
3733							c0 = AVERAGE(c0, c1);
3734
3735							*(unsigned int*)(source0 + 4 * x) = c0;
3736						}
3737
3738						source0 += pitch;
3739						source1 += pitch;
3740					}
3741				}
3742				else if(internal.depth == 4)
3743				{
3744					for(int y = 0; y < height; y++)
3745					{
3746						for(int x = 0; x < 2 * width; x++)
3747						{
3748							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
3749							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
3750							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
3751							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
3752
3753							c0 = AVERAGE(c0, c1);
3754							c2 = AVERAGE(c2, c3);
3755							c0 = AVERAGE(c0, c2);
3756
3757							*(unsigned int*)(source0 + 4 * x) = c0;
3758						}
3759
3760						source0 += pitch;
3761						source1 += pitch;
3762						source2 += pitch;
3763						source3 += pitch;
3764					}
3765				}
3766				else if(internal.depth == 8)
3767				{
3768					for(int y = 0; y < height; y++)
3769					{
3770						for(int x = 0; x < 2 * width; x++)
3771						{
3772							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
3773							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
3774							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
3775							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
3776							unsigned int c4 = *(unsigned int*)(source4 + 4 * x);
3777							unsigned int c5 = *(unsigned int*)(source5 + 4 * x);
3778							unsigned int c6 = *(unsigned int*)(source6 + 4 * x);
3779							unsigned int c7 = *(unsigned int*)(source7 + 4 * x);
3780
3781							c0 = AVERAGE(c0, c1);
3782							c2 = AVERAGE(c2, c3);
3783							c4 = AVERAGE(c4, c5);
3784							c6 = AVERAGE(c6, c7);
3785							c0 = AVERAGE(c0, c2);
3786							c4 = AVERAGE(c4, c6);
3787							c0 = AVERAGE(c0, c4);
3788
3789							*(unsigned int*)(source0 + 4 * x) = c0;
3790						}
3791
3792						source0 += pitch;
3793						source1 += pitch;
3794						source2 += pitch;
3795						source3 += pitch;
3796						source4 += pitch;
3797						source5 += pitch;
3798						source6 += pitch;
3799						source7 += pitch;
3800					}
3801				}
3802				else if(internal.depth == 16)
3803				{
3804					for(int y = 0; y < height; y++)
3805					{
3806						for(int x = 0; x < 2 * width; x++)
3807						{
3808							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
3809							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
3810							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
3811							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
3812							unsigned int c4 = *(unsigned int*)(source4 + 4 * x);
3813							unsigned int c5 = *(unsigned int*)(source5 + 4 * x);
3814							unsigned int c6 = *(unsigned int*)(source6 + 4 * x);
3815							unsigned int c7 = *(unsigned int*)(source7 + 4 * x);
3816							unsigned int c8 = *(unsigned int*)(source8 + 4 * x);
3817							unsigned int c9 = *(unsigned int*)(source9 + 4 * x);
3818							unsigned int cA = *(unsigned int*)(sourceA + 4 * x);
3819							unsigned int cB = *(unsigned int*)(sourceB + 4 * x);
3820							unsigned int cC = *(unsigned int*)(sourceC + 4 * x);
3821							unsigned int cD = *(unsigned int*)(sourceD + 4 * x);
3822							unsigned int cE = *(unsigned int*)(sourceE + 4 * x);
3823							unsigned int cF = *(unsigned int*)(sourceF + 4 * x);
3824
3825							c0 = AVERAGE(c0, c1);
3826							c2 = AVERAGE(c2, c3);
3827							c4 = AVERAGE(c4, c5);
3828							c6 = AVERAGE(c6, c7);
3829							c8 = AVERAGE(c8, c9);
3830							cA = AVERAGE(cA, cB);
3831							cC = AVERAGE(cC, cD);
3832							cE = AVERAGE(cE, cF);
3833							c0 = AVERAGE(c0, c2);
3834							c4 = AVERAGE(c4, c6);
3835							c8 = AVERAGE(c8, cA);
3836							cC = AVERAGE(cC, cE);
3837							c0 = AVERAGE(c0, c4);
3838							c8 = AVERAGE(c8, cC);
3839							c0 = AVERAGE(c0, c8);
3840
3841							*(unsigned int*)(source0 + 4 * x) = c0;
3842						}
3843
3844						source0 += pitch;
3845						source1 += pitch;
3846						source2 += pitch;
3847						source3 += pitch;
3848						source4 += pitch;
3849						source5 += pitch;
3850						source6 += pitch;
3851						source7 += pitch;
3852						source8 += pitch;
3853						source9 += pitch;
3854						sourceA += pitch;
3855						sourceB += pitch;
3856						sourceC += pitch;
3857						sourceD += pitch;
3858						sourceE += pitch;
3859						sourceF += pitch;
3860					}
3861				}
3862				else ASSERT(false);
3863
3864				#undef AVERAGE
3865			}
3866		}
3867		else if(internal.format == FORMAT_R32F)
3868		{
3869			if(CPUID::supportsSSE() && (width % 4) == 0)
3870			{
3871				if(internal.depth == 2)
3872				{
3873					for(int y = 0; y < height; y++)
3874					{
3875						for(int x = 0; x < width; x += 4)
3876						{
3877							__m128 c0 = _mm_load_ps((float*)(source0 + 4 * x));
3878							__m128 c1 = _mm_load_ps((float*)(source1 + 4 * x));
3879
3880							c0 = _mm_add_ps(c0, c1);
3881							c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 2.0f));
3882
3883							_mm_store_ps((float*)(source0 + 4 * x), c0);
3884						}
3885
3886						source0 += pitch;
3887						source1 += pitch;
3888					}
3889				}
3890				else if(internal.depth == 4)
3891				{
3892					for(int y = 0; y < height; y++)
3893					{
3894						for(int x = 0; x < width; x += 4)
3895						{
3896							__m128 c0 = _mm_load_ps((float*)(source0 + 4 * x));
3897							__m128 c1 = _mm_load_ps((float*)(source1 + 4 * x));
3898							__m128 c2 = _mm_load_ps((float*)(source2 + 4 * x));
3899							__m128 c3 = _mm_load_ps((float*)(source3 + 4 * x));
3900
3901							c0 = _mm_add_ps(c0, c1);
3902							c2 = _mm_add_ps(c2, c3);
3903							c0 = _mm_add_ps(c0, c2);
3904							c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 4.0f));
3905
3906							_mm_store_ps((float*)(source0 + 4 * x), c0);
3907						}
3908
3909						source0 += pitch;
3910						source1 += pitch;
3911						source2 += pitch;
3912						source3 += pitch;
3913					}
3914				}
3915				else if(internal.depth == 8)
3916				{
3917					for(int y = 0; y < height; y++)
3918					{
3919						for(int x = 0; x < width; x += 4)
3920						{
3921							__m128 c0 = _mm_load_ps((float*)(source0 + 4 * x));
3922							__m128 c1 = _mm_load_ps((float*)(source1 + 4 * x));
3923							__m128 c2 = _mm_load_ps((float*)(source2 + 4 * x));
3924							__m128 c3 = _mm_load_ps((float*)(source3 + 4 * x));
3925							__m128 c4 = _mm_load_ps((float*)(source4 + 4 * x));
3926							__m128 c5 = _mm_load_ps((float*)(source5 + 4 * x));
3927							__m128 c6 = _mm_load_ps((float*)(source6 + 4 * x));
3928							__m128 c7 = _mm_load_ps((float*)(source7 + 4 * x));
3929
3930							c0 = _mm_add_ps(c0, c1);
3931							c2 = _mm_add_ps(c2, c3);
3932							c4 = _mm_add_ps(c4, c5);
3933							c6 = _mm_add_ps(c6, c7);
3934							c0 = _mm_add_ps(c0, c2);
3935							c4 = _mm_add_ps(c4, c6);
3936							c0 = _mm_add_ps(c0, c4);
3937							c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 8.0f));
3938
3939							_mm_store_ps((float*)(source0 + 4 * x), c0);
3940						}
3941
3942						source0 += pitch;
3943						source1 += pitch;
3944						source2 += pitch;
3945						source3 += pitch;
3946						source4 += pitch;
3947						source5 += pitch;
3948						source6 += pitch;
3949						source7 += pitch;
3950					}
3951				}
3952				else if(internal.depth == 16)
3953				{
3954					for(int y = 0; y < height; y++)
3955					{
3956						for(int x = 0; x < width; x += 4)
3957						{
3958							__m128 c0 = _mm_load_ps((float*)(source0 + 4 * x));
3959							__m128 c1 = _mm_load_ps((float*)(source1 + 4 * x));
3960							__m128 c2 = _mm_load_ps((float*)(source2 + 4 * x));
3961							__m128 c3 = _mm_load_ps((float*)(source3 + 4 * x));
3962							__m128 c4 = _mm_load_ps((float*)(source4 + 4 * x));
3963							__m128 c5 = _mm_load_ps((float*)(source5 + 4 * x));
3964							__m128 c6 = _mm_load_ps((float*)(source6 + 4 * x));
3965							__m128 c7 = _mm_load_ps((float*)(source7 + 4 * x));
3966							__m128 c8 = _mm_load_ps((float*)(source8 + 4 * x));
3967							__m128 c9 = _mm_load_ps((float*)(source9 + 4 * x));
3968							__m128 cA = _mm_load_ps((float*)(sourceA + 4 * x));
3969							__m128 cB = _mm_load_ps((float*)(sourceB + 4 * x));
3970							__m128 cC = _mm_load_ps((float*)(sourceC + 4 * x));
3971							__m128 cD = _mm_load_ps((float*)(sourceD + 4 * x));
3972							__m128 cE = _mm_load_ps((float*)(sourceE + 4 * x));
3973							__m128 cF = _mm_load_ps((float*)(sourceF + 4 * x));
3974
3975							c0 = _mm_add_ps(c0, c1);
3976							c2 = _mm_add_ps(c2, c3);
3977							c4 = _mm_add_ps(c4, c5);
3978							c6 = _mm_add_ps(c6, c7);
3979							c8 = _mm_add_ps(c8, c9);
3980							cA = _mm_add_ps(cA, cB);
3981							cC = _mm_add_ps(cC, cD);
3982							cE = _mm_add_ps(cE, cF);
3983							c0 = _mm_add_ps(c0, c2);
3984							c4 = _mm_add_ps(c4, c6);
3985							c8 = _mm_add_ps(c8, cA);
3986							cC = _mm_add_ps(cC, cE);
3987							c0 = _mm_add_ps(c0, c4);
3988							c8 = _mm_add_ps(c8, cC);
3989							c0 = _mm_add_ps(c0, c8);
3990							c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 16.0f));
3991
3992							_mm_store_ps((float*)(source0 + 4 * x), c0);
3993						}
3994
3995						source0 += pitch;
3996						source1 += pitch;
3997						source2 += pitch;
3998						source3 += pitch;
3999						source4 += pitch;
4000						source5 += pitch;
4001						source6 += pitch;
4002						source7 += pitch;
4003						source8 += pitch;
4004						source9 += pitch;
4005						sourceA += pitch;
4006						sourceB += pitch;
4007						sourceC += pitch;
4008						sourceD += pitch;
4009						sourceE += pitch;
4010						sourceF += pitch;
4011					}
4012				}
4013				else ASSERT(false);
4014			}
4015			else
4016			{
4017				if(internal.depth == 2)
4018				{
4019					for(int y = 0; y < height; y++)
4020					{
4021						for(int x = 0; x < width; x++)
4022						{
4023							float c0 = *(float*)(source0 + 4 * x);
4024							float c1 = *(float*)(source1 + 4 * x);
4025
4026							c0 = c0 + c1;
4027							c0 *= 1.0f / 2.0f;
4028
4029							*(float*)(source0 + 4 * x) = c0;
4030						}
4031
4032						source0 += pitch;
4033						source1 += pitch;
4034					}
4035				}
4036				else if(internal.depth == 4)
4037				{
4038					for(int y = 0; y < height; y++)
4039					{
4040						for(int x = 0; x < width; x++)
4041						{
4042							float c0 = *(float*)(source0 + 4 * x);
4043							float c1 = *(float*)(source1 + 4 * x);
4044							float c2 = *(float*)(source2 + 4 * x);
4045							float c3 = *(float*)(source3 + 4 * x);
4046
4047							c0 = c0 + c1;
4048							c2 = c2 + c3;
4049							c0 = c0 + c2;
4050							c0 *= 1.0f / 4.0f;
4051
4052							*(float*)(source0 + 4 * x) = c0;
4053						}
4054
4055						source0 += pitch;
4056						source1 += pitch;
4057						source2 += pitch;
4058						source3 += pitch;
4059					}
4060				}
4061				else if(internal.depth == 8)
4062				{
4063					for(int y = 0; y < height; y++)
4064					{
4065						for(int x = 0; x < width; x++)
4066						{
4067							float c0 = *(float*)(source0 + 4 * x);
4068							float c1 = *(float*)(source1 + 4 * x);
4069							float c2 = *(float*)(source2 + 4 * x);
4070							float c3 = *(float*)(source3 + 4 * x);
4071							float c4 = *(float*)(source4 + 4 * x);
4072							float c5 = *(float*)(source5 + 4 * x);
4073							float c6 = *(float*)(source6 + 4 * x);
4074							float c7 = *(float*)(source7 + 4 * x);
4075
4076							c0 = c0 + c1;
4077							c2 = c2 + c3;
4078							c4 = c4 + c5;
4079							c6 = c6 + c7;
4080							c0 = c0 + c2;
4081							c4 = c4 + c6;
4082							c0 = c0 + c4;
4083							c0 *= 1.0f / 8.0f;
4084
4085							*(float*)(source0 + 4 * x) = c0;
4086						}
4087
4088						source0 += pitch;
4089						source1 += pitch;
4090						source2 += pitch;
4091						source3 += pitch;
4092						source4 += pitch;
4093						source5 += pitch;
4094						source6 += pitch;
4095						source7 += pitch;
4096					}
4097				}
4098				else if(internal.depth == 16)
4099				{
4100					for(int y = 0; y < height; y++)
4101					{
4102						for(int x = 0; x < width; x++)
4103						{
4104							float c0 = *(float*)(source0 + 4 * x);
4105							float c1 = *(float*)(source1 + 4 * x);
4106							float c2 = *(float*)(source2 + 4 * x);
4107							float c3 = *(float*)(source3 + 4 * x);
4108							float c4 = *(float*)(source4 + 4 * x);
4109							float c5 = *(float*)(source5 + 4 * x);
4110							float c6 = *(float*)(source6 + 4 * x);
4111							float c7 = *(float*)(source7 + 4 * x);
4112							float c8 = *(float*)(source8 + 4 * x);
4113							float c9 = *(float*)(source9 + 4 * x);
4114							float cA = *(float*)(sourceA + 4 * x);
4115							float cB = *(float*)(sourceB + 4 * x);
4116							float cC = *(float*)(sourceC + 4 * x);
4117							float cD = *(float*)(sourceD + 4 * x);
4118							float cE = *(float*)(sourceE + 4 * x);
4119							float cF = *(float*)(sourceF + 4 * x);
4120
4121							c0 = c0 + c1;
4122							c2 = c2 + c3;
4123							c4 = c4 + c5;
4124							c6 = c6 + c7;
4125							c8 = c8 + c9;
4126							cA = cA + cB;
4127							cC = cC + cD;
4128							cE = cE + cF;
4129							c0 = c0 + c2;
4130							c4 = c4 + c6;
4131							c8 = c8 + cA;
4132							cC = cC + cE;
4133							c0 = c0 + c4;
4134							c8 = c8 + cC;
4135							c0 = c0 + c8;
4136							c0 *= 1.0f / 16.0f;
4137
4138							*(float*)(source0 + 4 * x) = c0;
4139						}
4140
4141						source0 += pitch;
4142						source1 += pitch;
4143						source2 += pitch;
4144						source3 += pitch;
4145						source4 += pitch;
4146						source5 += pitch;
4147						source6 += pitch;
4148						source7 += pitch;
4149						source8 += pitch;
4150						source9 += pitch;
4151						sourceA += pitch;
4152						sourceB += pitch;
4153						sourceC += pitch;
4154						sourceD += pitch;
4155						sourceE += pitch;
4156						sourceF += pitch;
4157					}
4158				}
4159				else ASSERT(false);
4160			}
4161		}
4162		else if(internal.format == FORMAT_G32R32F)
4163		{
4164			if(CPUID::supportsSSE() && (width % 2) == 0)
4165			{
4166				if(internal.depth == 2)
4167				{
4168					for(int y = 0; y < height; y++)
4169					{
4170						for(int x = 0; x < width; x += 2)
4171						{
4172							__m128 c0 = _mm_load_ps((float*)(source0 + 8 * x));
4173							__m128 c1 = _mm_load_ps((float*)(source1 + 8 * x));
4174
4175							c0 = _mm_add_ps(c0, c1);
4176							c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 2.0f));
4177
4178							_mm_store_ps((float*)(source0 + 8 * x), c0);
4179						}
4180
4181						source0 += pitch;
4182						source1 += pitch;
4183					}
4184				}
4185				else if(internal.depth == 4)
4186				{
4187					for(int y = 0; y < height; y++)
4188					{
4189						for(int x = 0; x < width; x += 2)
4190						{
4191							__m128 c0 = _mm_load_ps((float*)(source0 + 8 * x));
4192							__m128 c1 = _mm_load_ps((float*)(source1 + 8 * x));
4193							__m128 c2 = _mm_load_ps((float*)(source2 + 8 * x));
4194							__m128 c3 = _mm_load_ps((float*)(source3 + 8 * x));
4195
4196							c0 = _mm_add_ps(c0, c1);
4197							c2 = _mm_add_ps(c2, c3);
4198							c0 = _mm_add_ps(c0, c2);
4199							c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 4.0f));
4200
4201							_mm_store_ps((float*)(source0 + 8 * x), c0);
4202						}
4203
4204						source0 += pitch;
4205						source1 += pitch;
4206						source2 += pitch;
4207						source3 += pitch;
4208					}
4209				}
4210				else if(internal.depth == 8)
4211				{
4212					for(int y = 0; y < height; y++)
4213					{
4214						for(int x = 0; x < width; x += 2)
4215						{
4216							__m128 c0 = _mm_load_ps((float*)(source0 + 8 * x));
4217							__m128 c1 = _mm_load_ps((float*)(source1 + 8 * x));
4218							__m128 c2 = _mm_load_ps((float*)(source2 + 8 * x));
4219							__m128 c3 = _mm_load_ps((float*)(source3 + 8 * x));
4220							__m128 c4 = _mm_load_ps((float*)(source4 + 8 * x));
4221							__m128 c5 = _mm_load_ps((float*)(source5 + 8 * x));
4222							__m128 c6 = _mm_load_ps((float*)(source6 + 8 * x));
4223							__m128 c7 = _mm_load_ps((float*)(source7 + 8 * x));
4224
4225							c0 = _mm_add_ps(c0, c1);
4226							c2 = _mm_add_ps(c2, c3);
4227							c4 = _mm_add_ps(c4, c5);
4228							c6 = _mm_add_ps(c6, c7);
4229							c0 = _mm_add_ps(c0, c2);
4230							c4 = _mm_add_ps(c4, c6);
4231							c0 = _mm_add_ps(c0, c4);
4232							c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 8.0f));
4233
4234							_mm_store_ps((float*)(source0 + 8 * x), c0);
4235						}
4236
4237						source0 += pitch;
4238						source1 += pitch;
4239						source2 += pitch;
4240						source3 += pitch;
4241						source4 += pitch;
4242						source5 += pitch;
4243						source6 += pitch;
4244						source7 += pitch;
4245					}
4246				}
4247				else if(internal.depth == 16)
4248				{
4249					for(int y = 0; y < height; y++)
4250					{
4251						for(int x = 0; x < width; x += 2)
4252						{
4253							__m128 c0 = _mm_load_ps((float*)(source0 + 8 * x));
4254							__m128 c1 = _mm_load_ps((float*)(source1 + 8 * x));
4255							__m128 c2 = _mm_load_ps((float*)(source2 + 8 * x));
4256							__m128 c3 = _mm_load_ps((float*)(source3 + 8 * x));
4257							__m128 c4 = _mm_load_ps((float*)(source4 + 8 * x));
4258							__m128 c5 = _mm_load_ps((float*)(source5 + 8 * x));
4259							__m128 c6 = _mm_load_ps((float*)(source6 + 8 * x));
4260							__m128 c7 = _mm_load_ps((float*)(source7 + 8 * x));
4261							__m128 c8 = _mm_load_ps((float*)(source8 + 8 * x));
4262							__m128 c9 = _mm_load_ps((float*)(source9 + 8 * x));
4263							__m128 cA = _mm_load_ps((float*)(sourceA + 8 * x));
4264							__m128 cB = _mm_load_ps((float*)(sourceB + 8 * x));
4265							__m128 cC = _mm_load_ps((float*)(sourceC + 8 * x));
4266							__m128 cD = _mm_load_ps((float*)(sourceD + 8 * x));
4267							__m128 cE = _mm_load_ps((float*)(sourceE + 8 * x));
4268							__m128 cF = _mm_load_ps((float*)(sourceF + 8 * x));
4269
4270							c0 = _mm_add_ps(c0, c1);
4271							c2 = _mm_add_ps(c2, c3);
4272							c4 = _mm_add_ps(c4, c5);
4273							c6 = _mm_add_ps(c6, c7);
4274							c8 = _mm_add_ps(c8, c9);
4275							cA = _mm_add_ps(cA, cB);
4276							cC = _mm_add_ps(cC, cD);
4277							cE = _mm_add_ps(cE, cF);
4278							c0 = _mm_add_ps(c0, c2);
4279							c4 = _mm_add_ps(c4, c6);
4280							c8 = _mm_add_ps(c8, cA);
4281							cC = _mm_add_ps(cC, cE);
4282							c0 = _mm_add_ps(c0, c4);
4283							c8 = _mm_add_ps(c8, cC);
4284							c0 = _mm_add_ps(c0, c8);
4285							c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 16.0f));
4286
4287							_mm_store_ps((float*)(source0 + 8 * x), c0);
4288						}
4289
4290						source0 += pitch;
4291						source1 += pitch;
4292						source2 += pitch;
4293						source3 += pitch;
4294						source4 += pitch;
4295						source5 += pitch;
4296						source6 += pitch;
4297						source7 += pitch;
4298						source8 += pitch;
4299						source9 += pitch;
4300						sourceA += pitch;
4301						sourceB += pitch;
4302						sourceC += pitch;
4303						sourceD += pitch;
4304						sourceE += pitch;
4305						sourceF += pitch;
4306					}
4307				}
4308				else ASSERT(false);
4309			}
4310			else
4311			{
4312				if(internal.depth == 2)
4313				{
4314					for(int y = 0; y < height; y++)
4315					{
4316						for(int x = 0; x < 2 * width; x++)
4317						{
4318							float c0 = *(float*)(source0 + 4 * x);
4319							float c1 = *(float*)(source1 + 4 * x);
4320
4321							c0 = c0 + c1;
4322							c0 *= 1.0f / 2.0f;
4323
4324							*(float*)(source0 + 4 * x) = c0;
4325						}
4326
4327						source0 += pitch;
4328						source1 += pitch;
4329					}
4330				}
4331				else if(internal.depth == 4)
4332				{
4333					for(int y = 0; y < height; y++)
4334					{
4335						for(int x = 0; x < 2 * width; x++)
4336						{
4337							float c0 = *(float*)(source0 + 4 * x);
4338							float c1 = *(float*)(source1 + 4 * x);
4339							float c2 = *(float*)(source2 + 4 * x);
4340							float c3 = *(float*)(source3 + 4 * x);
4341
4342							c0 = c0 + c1;
4343							c2 = c2 + c3;
4344							c0 = c0 + c2;
4345							c0 *= 1.0f / 4.0f;
4346
4347							*(float*)(source0 + 4 * x) = c0;
4348						}
4349
4350						source0 += pitch;
4351						source1 += pitch;
4352						source2 += pitch;
4353						source3 += pitch;
4354					}
4355				}
4356				else if(internal.depth == 8)
4357				{
4358					for(int y = 0; y < height; y++)
4359					{
4360						for(int x = 0; x < 2 * width; x++)
4361						{
4362							float c0 = *(float*)(source0 + 4 * x);
4363							float c1 = *(float*)(source1 + 4 * x);
4364							float c2 = *(float*)(source2 + 4 * x);
4365							float c3 = *(float*)(source3 + 4 * x);
4366							float c4 = *(float*)(source4 + 4 * x);
4367							float c5 = *(float*)(source5 + 4 * x);
4368							float c6 = *(float*)(source6 + 4 * x);
4369							float c7 = *(float*)(source7 + 4 * x);
4370
4371							c0 = c0 + c1;
4372							c2 = c2 + c3;
4373							c4 = c4 + c5;
4374							c6 = c6 + c7;
4375							c0 = c0 + c2;
4376							c4 = c4 + c6;
4377							c0 = c0 + c4;
4378							c0 *= 1.0f / 8.0f;
4379
4380							*(float*)(source0 + 4 * x) = c0;
4381						}
4382
4383						source0 += pitch;
4384						source1 += pitch;
4385						source2 += pitch;
4386						source3 += pitch;
4387						source4 += pitch;
4388						source5 += pitch;
4389						source6 += pitch;
4390						source7 += pitch;
4391					}
4392				}
4393				else if(internal.depth == 16)
4394				{
4395					for(int y = 0; y < height; y++)
4396					{
4397						for(int x = 0; x < 2 * width; x++)
4398						{
4399							float c0 = *(float*)(source0 + 4 * x);
4400							float c1 = *(float*)(source1 + 4 * x);
4401							float c2 = *(float*)(source2 + 4 * x);
4402							float c3 = *(float*)(source3 + 4 * x);
4403							float c4 = *(float*)(source4 + 4 * x);
4404							float c5 = *(float*)(source5 + 4 * x);
4405							float c6 = *(float*)(source6 + 4 * x);
4406							float c7 = *(float*)(source7 + 4 * x);
4407							float c8 = *(float*)(source8 + 4 * x);
4408							float c9 = *(float*)(source9 + 4 * x);
4409							float cA = *(float*)(sourceA + 4 * x);
4410							float cB = *(float*)(sourceB + 4 * x);
4411							float cC = *(float*)(sourceC + 4 * x);
4412							float cD = *(float*)(sourceD + 4 * x);
4413							float cE = *(float*)(sourceE + 4 * x);
4414							float cF = *(float*)(sourceF + 4 * x);
4415
4416							c0 = c0 + c1;
4417							c2 = c2 + c3;
4418							c4 = c4 + c5;
4419							c6 = c6 + c7;
4420							c8 = c8 + c9;
4421							cA = cA + cB;
4422							cC = cC + cD;
4423							cE = cE + cF;
4424							c0 = c0 + c2;
4425							c4 = c4 + c6;
4426							c8 = c8 + cA;
4427							cC = cC + cE;
4428							c0 = c0 + c4;
4429							c8 = c8 + cC;
4430							c0 = c0 + c8;
4431							c0 *= 1.0f / 16.0f;
4432
4433							*(float*)(source0 + 4 * x) = c0;
4434						}
4435
4436						source0 += pitch;
4437						source1 += pitch;
4438						source2 += pitch;
4439						source3 += pitch;
4440						source4 += pitch;
4441						source5 += pitch;
4442						source6 += pitch;
4443						source7 += pitch;
4444						source8 += pitch;
4445						source9 += pitch;
4446						sourceA += pitch;
4447						sourceB += pitch;
4448						sourceC += pitch;
4449						sourceD += pitch;
4450						sourceE += pitch;
4451						sourceF += pitch;
4452					}
4453				}
4454				else ASSERT(false);
4455			}
4456		}
4457		else if(internal.format == FORMAT_A32B32G32R32F)
4458		{
4459			if(CPUID::supportsSSE())
4460			{
4461				if(internal.depth == 2)
4462				{
4463					for(int y = 0; y < height; y++)
4464					{
4465						for(int x = 0; x < width; x++)
4466						{
4467							__m128 c0 = _mm_load_ps((float*)(source0 + 16 * x));
4468							__m128 c1 = _mm_load_ps((float*)(source1 + 16 * x));
4469
4470							c0 = _mm_add_ps(c0, c1);
4471							c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 2.0f));
4472
4473							_mm_store_ps((float*)(source0 + 16 * x), c0);
4474						}
4475
4476						source0 += pitch;
4477						source1 += pitch;
4478					}
4479				}
4480				else if(internal.depth == 4)
4481				{
4482					for(int y = 0; y < height; y++)
4483					{
4484						for(int x = 0; x < width; x++)
4485						{
4486							__m128 c0 = _mm_load_ps((float*)(source0 + 16 * x));
4487							__m128 c1 = _mm_load_ps((float*)(source1 + 16 * x));
4488							__m128 c2 = _mm_load_ps((float*)(source2 + 16 * x));
4489							__m128 c3 = _mm_load_ps((float*)(source3 + 16 * x));
4490
4491							c0 = _mm_add_ps(c0, c1);
4492							c2 = _mm_add_ps(c2, c3);
4493							c0 = _mm_add_ps(c0, c2);
4494							c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 4.0f));
4495
4496							_mm_store_ps((float*)(source0 + 16 * x), c0);
4497						}
4498
4499						source0 += pitch;
4500						source1 += pitch;
4501						source2 += pitch;
4502						source3 += pitch;
4503					}
4504				}
4505				else if(internal.depth == 8)
4506				{
4507					for(int y = 0; y < height; y++)
4508					{
4509						for(int x = 0; x < width; x++)
4510						{
4511							__m128 c0 = _mm_load_ps((float*)(source0 + 16 * x));
4512							__m128 c1 = _mm_load_ps((float*)(source1 + 16 * x));
4513							__m128 c2 = _mm_load_ps((float*)(source2 + 16 * x));
4514							__m128 c3 = _mm_load_ps((float*)(source3 + 16 * x));
4515							__m128 c4 = _mm_load_ps((float*)(source4 + 16 * x));
4516							__m128 c5 = _mm_load_ps((float*)(source5 + 16 * x));
4517							__m128 c6 = _mm_load_ps((float*)(source6 + 16 * x));
4518							__m128 c7 = _mm_load_ps((float*)(source7 + 16 * x));
4519
4520							c0 = _mm_add_ps(c0, c1);
4521							c2 = _mm_add_ps(c2, c3);
4522							c4 = _mm_add_ps(c4, c5);
4523							c6 = _mm_add_ps(c6, c7);
4524							c0 = _mm_add_ps(c0, c2);
4525							c4 = _mm_add_ps(c4, c6);
4526							c0 = _mm_add_ps(c0, c4);
4527							c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 8.0f));
4528
4529							_mm_store_ps((float*)(source0 + 16 * x), c0);
4530						}
4531
4532						source0 += pitch;
4533						source1 += pitch;
4534						source2 += pitch;
4535						source3 += pitch;
4536						source4 += pitch;
4537						source5 += pitch;
4538						source6 += pitch;
4539						source7 += pitch;
4540					}
4541				}
4542				else if(internal.depth == 16)
4543				{
4544					for(int y = 0; y < height; y++)
4545					{
4546						for(int x = 0; x < width; x++)
4547						{
4548							__m128 c0 = _mm_load_ps((float*)(source0 + 16 * x));
4549							__m128 c1 = _mm_load_ps((float*)(source1 + 16 * x));
4550							__m128 c2 = _mm_load_ps((float*)(source2 + 16 * x));
4551							__m128 c3 = _mm_load_ps((float*)(source3 + 16 * x));
4552							__m128 c4 = _mm_load_ps((float*)(source4 + 16 * x));
4553							__m128 c5 = _mm_load_ps((float*)(source5 + 16 * x));
4554							__m128 c6 = _mm_load_ps((float*)(source6 + 16 * x));
4555							__m128 c7 = _mm_load_ps((float*)(source7 + 16 * x));
4556							__m128 c8 = _mm_load_ps((float*)(source8 + 16 * x));
4557							__m128 c9 = _mm_load_ps((float*)(source9 + 16 * x));
4558							__m128 cA = _mm_load_ps((float*)(sourceA + 16 * x));
4559							__m128 cB = _mm_load_ps((float*)(sourceB + 16 * x));
4560							__m128 cC = _mm_load_ps((float*)(sourceC + 16 * x));
4561							__m128 cD = _mm_load_ps((float*)(sourceD + 16 * x));
4562							__m128 cE = _mm_load_ps((float*)(sourceE + 16 * x));
4563							__m128 cF = _mm_load_ps((float*)(sourceF + 16 * x));
4564
4565							c0 = _mm_add_ps(c0, c1);
4566							c2 = _mm_add_ps(c2, c3);
4567							c4 = _mm_add_ps(c4, c5);
4568							c6 = _mm_add_ps(c6, c7);
4569							c8 = _mm_add_ps(c8, c9);
4570							cA = _mm_add_ps(cA, cB);
4571							cC = _mm_add_ps(cC, cD);
4572							cE = _mm_add_ps(cE, cF);
4573							c0 = _mm_add_ps(c0, c2);
4574							c4 = _mm_add_ps(c4, c6);
4575							c8 = _mm_add_ps(c8, cA);
4576							cC = _mm_add_ps(cC, cE);
4577							c0 = _mm_add_ps(c0, c4);
4578							c8 = _mm_add_ps(c8, cC);
4579							c0 = _mm_add_ps(c0, c8);
4580							c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 16.0f));
4581
4582							_mm_store_ps((float*)(source0 + 16 * x), c0);
4583						}
4584
4585						source0 += pitch;
4586						source1 += pitch;
4587						source2 += pitch;
4588						source3 += pitch;
4589						source4 += pitch;
4590						source5 += pitch;
4591						source6 += pitch;
4592						source7 += pitch;
4593						source8 += pitch;
4594						source9 += pitch;
4595						sourceA += pitch;
4596						sourceB += pitch;
4597						sourceC += pitch;
4598						sourceD += pitch;
4599						sourceE += pitch;
4600						sourceF += pitch;
4601					}
4602				}
4603				else ASSERT(false);
4604			}
4605			else
4606			{
4607				if(internal.depth == 2)
4608				{
4609					for(int y = 0; y < height; y++)
4610					{
4611						for(int x = 0; x < 4 * width; x++)
4612						{
4613							float c0 = *(float*)(source0 + 4 * x);
4614							float c1 = *(float*)(source1 + 4 * x);
4615
4616							c0 = c0 + c1;
4617							c0 *= 1.0f / 2.0f;
4618
4619							*(float*)(source0 + 4 * x) = c0;
4620						}
4621
4622						source0 += pitch;
4623						source1 += pitch;
4624					}
4625				}
4626				else if(internal.depth == 4)
4627				{
4628					for(int y = 0; y < height; y++)
4629					{
4630						for(int x = 0; x < 4 * width; x++)
4631						{
4632							float c0 = *(float*)(source0 + 4 * x);
4633							float c1 = *(float*)(source1 + 4 * x);
4634							float c2 = *(float*)(source2 + 4 * x);
4635							float c3 = *(float*)(source3 + 4 * x);
4636
4637							c0 = c0 + c1;
4638							c2 = c2 + c3;
4639							c0 = c0 + c2;
4640							c0 *= 1.0f / 4.0f;
4641
4642							*(float*)(source0 + 4 * x) = c0;
4643						}
4644
4645						source0 += pitch;
4646						source1 += pitch;
4647						source2 += pitch;
4648						source3 += pitch;
4649					}
4650				}
4651				else if(internal.depth == 8)
4652				{
4653					for(int y = 0; y < height; y++)
4654					{
4655						for(int x = 0; x < 4 * width; x++)
4656						{
4657							float c0 = *(float*)(source0 + 4 * x);
4658							float c1 = *(float*)(source1 + 4 * x);
4659							float c2 = *(float*)(source2 + 4 * x);
4660							float c3 = *(float*)(source3 + 4 * x);
4661							float c4 = *(float*)(source4 + 4 * x);
4662							float c5 = *(float*)(source5 + 4 * x);
4663							float c6 = *(float*)(source6 + 4 * x);
4664							float c7 = *(float*)(source7 + 4 * x);
4665
4666							c0 = c0 + c1;
4667							c2 = c2 + c3;
4668							c4 = c4 + c5;
4669							c6 = c6 + c7;
4670							c0 = c0 + c2;
4671							c4 = c4 + c6;
4672							c0 = c0 + c4;
4673							c0 *= 1.0f / 8.0f;
4674
4675							*(float*)(source0 + 4 * x) = c0;
4676						}
4677
4678						source0 += pitch;
4679						source1 += pitch;
4680						source2 += pitch;
4681						source3 += pitch;
4682						source4 += pitch;
4683						source5 += pitch;
4684						source6 += pitch;
4685						source7 += pitch;
4686					}
4687				}
4688				else if(internal.depth == 16)
4689				{
4690					for(int y = 0; y < height; y++)
4691					{
4692						for(int x = 0; x < 4 * width; x++)
4693						{
4694							float c0 = *(float*)(source0 + 4 * x);
4695							float c1 = *(float*)(source1 + 4 * x);
4696							float c2 = *(float*)(source2 + 4 * x);
4697							float c3 = *(float*)(source3 + 4 * x);
4698							float c4 = *(float*)(source4 + 4 * x);
4699							float c5 = *(float*)(source5 + 4 * x);
4700							float c6 = *(float*)(source6 + 4 * x);
4701							float c7 = *(float*)(source7 + 4 * x);
4702							float c8 = *(float*)(source8 + 4 * x);
4703							float c9 = *(float*)(source9 + 4 * x);
4704							float cA = *(float*)(sourceA + 4 * x);
4705							float cB = *(float*)(sourceB + 4 * x);
4706							float cC = *(float*)(sourceC + 4 * x);
4707							float cD = *(float*)(sourceD + 4 * x);
4708							float cE = *(float*)(sourceE + 4 * x);
4709							float cF = *(float*)(sourceF + 4 * x);
4710
4711							c0 = c0 + c1;
4712							c2 = c2 + c3;
4713							c4 = c4 + c5;
4714							c6 = c6 + c7;
4715							c8 = c8 + c9;
4716							cA = cA + cB;
4717							cC = cC + cD;
4718							cE = cE + cF;
4719							c0 = c0 + c2;
4720							c4 = c4 + c6;
4721							c8 = c8 + cA;
4722							cC = cC + cE;
4723							c0 = c0 + c4;
4724							c8 = c8 + cC;
4725							c0 = c0 + c8;
4726							c0 *= 1.0f / 16.0f;
4727
4728							*(float*)(source0 + 4 * x) = c0;
4729						}
4730
4731						source0 += pitch;
4732						source1 += pitch;
4733						source2 += pitch;
4734						source3 += pitch;
4735						source4 += pitch;
4736						source5 += pitch;
4737						source6 += pitch;
4738						source7 += pitch;
4739						source8 += pitch;
4740						source9 += pitch;
4741						sourceA += pitch;
4742						sourceB += pitch;
4743						sourceC += pitch;
4744						sourceD += pitch;
4745						sourceE += pitch;
4746						sourceF += pitch;
4747					}
4748				}
4749				else ASSERT(false);
4750			}
4751		}
4752		else
4753		{
4754		//	UNIMPLEMENTED();
4755		}
4756	}
4757}
4758