Surface.cpp revision 75b650f0e501750ae0ba66a435741731905dffc1
1// SwiftShader Software Renderer
2//
3// Copyright(c) 2005-2013 TransGaming Inc.
4//
5// All rights reserved. No part of this software may be copied, distributed, transmitted,
6// transcribed, stored in a retrieval system, translated into any human or computer
7// language by any means, or disclosed to third parties without the explicit written
8// agreement of TransGaming Inc. Without such an agreement, no rights or licenses, express
9// or implied, including but not limited to any patent rights, are granted to you.
10//
11
12#include "Surface.hpp"
13
14#include "Color.hpp"
15#include "Context.hpp"
16#include "ETC_Decoder.hpp"
17#include "Renderer.hpp"
18#include "Common/Half.hpp"
19#include "Common/Memory.hpp"
20#include "Common/CPUID.hpp"
21#include "Common/Resource.hpp"
22#include "Common/Debug.hpp"
23#include "Reactor/Reactor.hpp"
24
25#include <xmmintrin.h>
26#include <emmintrin.h>
27
28#undef min
29#undef max
30
31namespace sw
32{
33	extern bool quadLayoutEnabled;
34	extern bool complementaryDepthBuffer;
35	extern TranscendentalPrecision logPrecision;
36
37	unsigned int *Surface::palette = 0;
38	unsigned int Surface::paletteID = 0;
39
40	void Rect::clip(int minX, int minY, int maxX, int maxY)
41	{
42		x0 = clamp(x0, minX, maxX);
43		y0 = clamp(y0, minY, maxY);
44		x1 = clamp(x1, minX, maxX);
45		y1 = clamp(y1, minY, maxY);
46	}
47
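	// The write() overloads locate a texel from its coordinates using the per-pixel
	// byte size and the row/slice strides, then encode the floating-point color into
	// the buffer's format. For example, a FORMAT_A8R8G8B8 buffer (4 bytes per pixel)
	// stores texel (x, y) at buffer + 4 * x + y * pitchB.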
48	void Surface::Buffer::write(int x, int y, int z, const Color<float> &color)
49	{
50		void *element = (unsigned char*)buffer + x * bytes + y * pitchB + z * sliceB;
51
52		write(element, color);
53	}
54
55	void Surface::Buffer::write(int x, int y, const Color<float> &color)
56	{
57		void *element = (unsigned char*)buffer + x * bytes + y * pitchB;
58
59		write(element, color);
60	}
61
62	inline void Surface::Buffer::write(void *element, const Color<float> &color)
63	{
64		switch(format)
65		{
66		case FORMAT_A8:
67			*(unsigned char*)element = unorm<8>(color.a);
68			break;
69		case FORMAT_R8I_SNORM:
70			*(char*)element = snorm<8>(color.r);
71			break;
72		case FORMAT_R8:
73			*(unsigned char*)element = unorm<8>(color.r);
74			break;
75		case FORMAT_R8I:
76			*(char*)element = scast<8>(color.r);
77			break;
78		case FORMAT_R8UI:
79			*(unsigned char*)element = ucast<8>(color.r);
80			break;
81		case FORMAT_R16I:
82			*(short*)element = scast<16>(color.r);
83			break;
84		case FORMAT_R16UI:
85			*(unsigned short*)element = ucast<16>(color.r);
86			break;
87		case FORMAT_R32I:
88			*(int*)element = static_cast<int>(color.r);
89			break;
90		case FORMAT_R32UI:
91			*(unsigned int*)element = static_cast<unsigned int>(color.r);
92			break;
93		case FORMAT_R3G3B2:
94			*(unsigned char*)element = (unorm<3>(color.r) << 5) | (unorm<3>(color.g) << 2) | (unorm<2>(color.b) << 0);
95			break;
96		case FORMAT_A8R3G3B2:
97			*(unsigned short*)element = (unorm<8>(color.a) << 8) | (unorm<3>(color.r) << 5) | (unorm<3>(color.g) << 2) | (unorm<2>(color.b) << 0);
98			break;
99		case FORMAT_X4R4G4B4:
100			*(unsigned short*)element = 0xF000 | (unorm<4>(color.r) << 8) | (unorm<4>(color.g) << 4) | (unorm<4>(color.b) << 0);
101			break;
102		case FORMAT_A4R4G4B4:
103			*(unsigned short*)element = (unorm<4>(color.a) << 12) | (unorm<4>(color.r) << 8) | (unorm<4>(color.g) << 4) | (unorm<4>(color.b) << 0);
104			break;
105		case FORMAT_R4G4B4A4:
106			*(unsigned short*)element = (unorm<4>(color.r) << 12) | (unorm<4>(color.g) << 8) | (unorm<4>(color.b) << 4) | (unorm<4>(color.a) << 0);
107			break;
108		case FORMAT_R5G6B5:
109			*(unsigned short*)element = (unorm<5>(color.r) << 11) | (unorm<6>(color.g) << 5) | (unorm<5>(color.b) << 0);
110			break;
111		case FORMAT_A1R5G5B5:
112			*(unsigned short*)element = (unorm<1>(color.a) << 15) | (unorm<5>(color.r) << 10) | (unorm<5>(color.g) << 5) | (unorm<5>(color.b) << 0);
113			break;
114		case FORMAT_R5G5B5A1:
115			*(unsigned short*)element = (unorm<5>(color.r) << 11) | (unorm<5>(color.g) << 6) | (unorm<5>(color.b) << 1) | (unorm<1>(color.a) << 0);
116			break;
117		case FORMAT_X1R5G5B5:
118			*(unsigned short*)element = 0x8000 | (unorm<5>(color.r) << 10) | (unorm<5>(color.g) << 5) | (unorm<5>(color.b) << 0);
119			break;
120		case FORMAT_A8R8G8B8:
121			*(unsigned int*)element = (unorm<8>(color.a) << 24) | (unorm<8>(color.r) << 16) | (unorm<8>(color.g) << 8) | (unorm<8>(color.b) << 0);
122			break;
123		case FORMAT_X8R8G8B8:
124			*(unsigned int*)element = 0xFF000000 | (unorm<8>(color.r) << 16) | (unorm<8>(color.g) << 8) | (unorm<8>(color.b) << 0);
125			break;
126		case FORMAT_A8B8G8R8I_SNORM:
127			*(unsigned int*)element = (static_cast<unsigned int>(snorm<8>(color.a)) << 24) |
128			                          (static_cast<unsigned int>(snorm<8>(color.b)) << 16) |
129			                          (static_cast<unsigned int>(snorm<8>(color.g)) << 8) |
130			                          (static_cast<unsigned int>(snorm<8>(color.r)) << 0);
131			break;
132		case FORMAT_A8B8G8R8:
133			*(unsigned int*)element = (unorm<8>(color.a) << 24) | (unorm<8>(color.b) << 16) | (unorm<8>(color.g) << 8) | (unorm<8>(color.r) << 0);
134			break;
135		case FORMAT_A8B8G8R8I:
136			*(unsigned int*)element = (static_cast<unsigned int>(scast<8>(color.a)) << 24) |
137			                          (static_cast<unsigned int>(scast<8>(color.b)) << 16) |
138			                          (static_cast<unsigned int>(scast<8>(color.g)) << 8) |
139			                          (static_cast<unsigned int>(scast<8>(color.r)) << 0);
140			break;
141		case FORMAT_A8B8G8R8UI:
142			*(unsigned int*)element = (ucast<8>(color.a) << 24) | (ucast<8>(color.b) << 16) | (ucast<8>(color.g) << 8) | (ucast<8>(color.r) << 0);
143			break;
144		case FORMAT_X8B8G8R8I_SNORM:
145			*(unsigned int*)element = 0x7F000000 |
146			                          (static_cast<unsigned int>(snorm<8>(color.b)) << 16) |
147			                          (static_cast<unsigned int>(snorm<8>(color.g)) << 8) |
148			                          (static_cast<unsigned int>(snorm<8>(color.r)) << 0);
149			break;
150		case FORMAT_X8B8G8R8:
151			*(unsigned int*)element = 0xFF000000 | (unorm<8>(color.b) << 16) | (unorm<8>(color.g) << 8) | (unorm<8>(color.r) << 0);
152			break;
153		case FORMAT_X8B8G8R8I:
154			*(unsigned int*)element = 0x7F000000 |
155			                          (static_cast<unsigned int>(scast<8>(color.b)) << 16) |
156			                          (static_cast<unsigned int>(scast<8>(color.g)) << 8) |
157			                          (static_cast<unsigned int>(scast<8>(color.r)) << 0);
			break;
158		case FORMAT_X8B8G8R8UI:
159			*(unsigned int*)element = 0xFF000000 | (ucast<8>(color.b) << 16) | (ucast<8>(color.g) << 8) | (ucast<8>(color.r) << 0);
160			break;
161		case FORMAT_A2R10G10B10:
162			*(unsigned int*)element = (unorm<2>(color.a) << 30) | (unorm<10>(color.r) << 20) | (unorm<10>(color.g) << 10) | (unorm<10>(color.b) << 0);
163			break;
164		case FORMAT_A2B10G10R10:
165			*(unsigned int*)element = (unorm<2>(color.a) << 30) | (unorm<10>(color.b) << 20) | (unorm<10>(color.g) << 10) | (unorm<10>(color.r) << 0);
166			break;
167		case FORMAT_G8R8I_SNORM:
168			*(unsigned short*)element = (static_cast<unsigned short>(snorm<8>(color.g)) << 8) |
169			                            (static_cast<unsigned short>(snorm<8>(color.r)) << 0);
170			break;
171		case FORMAT_G8R8:
172			*(unsigned short*)element = (unorm<8>(color.g) << 8) | (unorm<8>(color.r) << 0);
173			break;
174		case FORMAT_G8R8I:
175			*(unsigned short*)element = (static_cast<unsigned short>(scast<8>(color.g)) << 8) |
176			                            (static_cast<unsigned short>(scast<8>(color.r)) << 0);
177			break;
178		case FORMAT_G8R8UI:
179			*(unsigned short*)element = (ucast<8>(color.g) << 8) | (ucast<8>(color.r) << 0);
180			break;
181		case FORMAT_G16R16:
182			*(unsigned int*)element = (unorm<16>(color.g) << 16) | (unorm<16>(color.r) << 0);
183			break;
184		case FORMAT_G16R16I:
185			*(unsigned int*)element = (static_cast<unsigned int>(scast<16>(color.g)) << 16) |
186			                          (static_cast<unsigned int>(scast<16>(color.r)) << 0);
187			break;
188		case FORMAT_G16R16UI:
189			*(unsigned int*)element = (ucast<16>(color.g) << 16) | (ucast<16>(color.r) << 0);
190			break;
191		case FORMAT_G32R32I:
192		case FORMAT_G32R32UI:
193			((unsigned int*)element)[0] = static_cast<unsigned int>(color.r);
194			((unsigned int*)element)[1] = static_cast<unsigned int>(color.g);
195			break;
196		case FORMAT_A16B16G16R16:
197			((unsigned short*)element)[0] = unorm<16>(color.r);
198			((unsigned short*)element)[1] = unorm<16>(color.g);
199			((unsigned short*)element)[2] = unorm<16>(color.b);
200			((unsigned short*)element)[3] = unorm<16>(color.a);
201			break;
202		case FORMAT_A16B16G16R16I:
203			((unsigned short*)element)[0] = static_cast<unsigned short>(scast<16>(color.r));
204			((unsigned short*)element)[1] = static_cast<unsigned short>(scast<16>(color.g));
205			((unsigned short*)element)[2] = static_cast<unsigned short>(scast<16>(color.b));
206			((unsigned short*)element)[3] = static_cast<unsigned short>(scast<16>(color.a));
207			break;
208		case FORMAT_A16B16G16R16UI:
209			((unsigned short*)element)[0] = static_cast<unsigned short>(ucast<16>(color.r));
210			((unsigned short*)element)[1] = static_cast<unsigned short>(ucast<16>(color.g));
211			((unsigned short*)element)[2] = static_cast<unsigned short>(ucast<16>(color.b));
212			((unsigned short*)element)[3] = static_cast<unsigned short>(ucast<16>(color.a));
213			break;
214		case FORMAT_X16B16G16R16I:
215			((unsigned short*)element)[0] = static_cast<unsigned short>(scast<16>(color.r));
216			((unsigned short*)element)[1] = static_cast<unsigned short>(scast<16>(color.g));
217			((unsigned short*)element)[2] = static_cast<unsigned short>(scast<16>(color.b));
218			break;
219		case FORMAT_X16B16G16R16UI:
220			((unsigned short*)element)[0] = static_cast<unsigned short>(ucast<16>(color.r));
221			((unsigned short*)element)[1] = static_cast<unsigned short>(ucast<16>(color.g));
222			((unsigned short*)element)[2] = static_cast<unsigned short>(ucast<16>(color.b));
223			break;
224		case FORMAT_A32B32G32R32I:
225		case FORMAT_A32B32G32R32UI:
226			((unsigned int*)element)[0] = static_cast<unsigned int>(color.r);
227			((unsigned int*)element)[1] = static_cast<unsigned int>(color.g);
228			((unsigned int*)element)[2] = static_cast<unsigned int>(color.b);
229			((unsigned int*)element)[3] = static_cast<unsigned int>(color.a);
230			break;
231		case FORMAT_X32B32G32R32I:
232		case FORMAT_X32B32G32R32UI:
233			((unsigned int*)element)[0] = static_cast<unsigned int>(color.r);
234			((unsigned int*)element)[1] = static_cast<unsigned int>(color.g);
235			((unsigned int*)element)[2] = static_cast<unsigned int>(color.b);
236			break;
237		case FORMAT_V8U8:
238			*(unsigned short*)element = (snorm<8>(color.g) << 8) | (snorm<8>(color.r) << 0);
239			break;
240		case FORMAT_L6V5U5:
241			*(unsigned short*)element = (unorm<6>(color.b) << 10) | (snorm<5>(color.g) << 5) | (snorm<5>(color.r) << 0);
242			break;
243		case FORMAT_Q8W8V8U8:
244			*(unsigned int*)element = (snorm<8>(color.a) << 24) | (snorm<8>(color.b) << 16) | (snorm<8>(color.g) << 8) | (snorm<8>(color.r) << 0);
245			break;
246		case FORMAT_X8L8V8U8:
247			*(unsigned int*)element = 0xFF000000 | (unorm<8>(color.b) << 16) | (snorm<8>(color.g) << 8) | (snorm<8>(color.r) << 0);
248			break;
249		case FORMAT_V16U16:
250			*(unsigned int*)element = (snorm<16>(color.g) << 16) | (snorm<16>(color.r) << 0);
251			break;
252		case FORMAT_A2W10V10U10:
253			*(unsigned int*)element = (unorm<2>(color.a) << 30) | (snorm<10>(color.b) << 20) | (snorm<10>(color.g) << 10) | (snorm<10>(color.r) << 0);
254			break;
255		case FORMAT_A16W16V16U16:
256			((unsigned short*)element)[0] = snorm<16>(color.r);
257			((unsigned short*)element)[1] = snorm<16>(color.g);
258			((unsigned short*)element)[2] = snorm<16>(color.b);
259			((unsigned short*)element)[3] = unorm<16>(color.a);
260			break;
261		case FORMAT_Q16W16V16U16:
262			((unsigned short*)element)[0] = snorm<16>(color.r);
263			((unsigned short*)element)[1] = snorm<16>(color.g);
264			((unsigned short*)element)[2] = snorm<16>(color.b);
265			((unsigned short*)element)[3] = snorm<16>(color.a);
266			break;
267		case FORMAT_R8G8B8:
268			((unsigned char*)element)[0] = unorm<8>(color.b);
269			((unsigned char*)element)[1] = unorm<8>(color.g);
270			((unsigned char*)element)[2] = unorm<8>(color.r);
271			break;
272		case FORMAT_B8G8R8:
273			((unsigned char*)element)[0] = unorm<8>(color.r);
274			((unsigned char*)element)[1] = unorm<8>(color.g);
275			((unsigned char*)element)[2] = unorm<8>(color.b);
276			break;
277		case FORMAT_R16F:
278			*(half*)element = (half)color.r;
279			break;
280		case FORMAT_A16F:
281			*(half*)element = (half)color.a;
282			break;
283		case FORMAT_G16R16F:
284			((half*)element)[0] = (half)color.r;
285			((half*)element)[1] = (half)color.g;
286			break;
287		case FORMAT_B16G16R16F:
288			((half*)element)[0] = (half)color.r;
289			((half*)element)[1] = (half)color.g;
290			((half*)element)[2] = (half)color.b;
291			break;
292		case FORMAT_A16B16G16R16F:
293			((half*)element)[0] = (half)color.r;
294			((half*)element)[1] = (half)color.g;
295			((half*)element)[2] = (half)color.b;
296			((half*)element)[3] = (half)color.a;
297			break;
298		case FORMAT_A32F:
299			*(float*)element = color.a;
300			break;
301		case FORMAT_R32F:
302			*(float*)element = color.r;
303			break;
304		case FORMAT_G32R32F:
305			((float*)element)[0] = color.r;
306			((float*)element)[1] = color.g;
307			break;
308		case FORMAT_B32G32R32F:
309			((float*)element)[0] = color.r;
310			((float*)element)[1] = color.g;
311			((float*)element)[2] = color.b;
312			break;
313		case FORMAT_A32B32G32R32F:
314			((float*)element)[0] = color.r;
315			((float*)element)[1] = color.g;
316			((float*)element)[2] = color.b;
317			((float*)element)[3] = color.a;
318			break;
319		case FORMAT_D32F:
320		case FORMAT_D32F_LOCKABLE:
321		case FORMAT_D32FS8_TEXTURE:
322		case FORMAT_D32FS8_SHADOW:
323			*((float*)element) = color.r;
324			break;
325		case FORMAT_D32F_COMPLEMENTARY:
326			*((float*)element) = 1 - color.r;
327			break;
328		case FORMAT_S8:
329			*((unsigned char*)element) = unorm<8>(color.r);
330			break;
331		case FORMAT_L8:
332			*(unsigned char*)element = unorm<8>(color.r);
333			break;
334		case FORMAT_A4L4:
335			*(unsigned char*)element = (unorm<4>(color.a) << 4) | (unorm<4>(color.r) << 0);
336			break;
337		case FORMAT_L16:
338			*(unsigned short*)element = unorm<16>(color.r);
339			break;
340		case FORMAT_A8L8:
341			*(unsigned short*)element = (unorm<8>(color.a) << 8) | (unorm<8>(color.r) << 0);
342			break;
343		case FORMAT_L16F:
344			*(half*)element = (half)color.r;
345			break;
346		case FORMAT_A16L16F:
347			((half*)element)[0] = (half)color.r;
348			((half*)element)[1] = (half)color.a;
349			break;
350		case FORMAT_L32F:
351			*(float*)element = color.r;
352			break;
353		case FORMAT_A32L32F:
354			((float*)element)[0] = color.r;
355			((float*)element)[1] = color.a;
356			break;
357		default:
358			ASSERT(false);
359		}
360	}
361
362	Color<float> Surface::Buffer::read(int x, int y, int z) const
363	{
364		void *element = (unsigned char*)buffer + x * bytes + y * pitchB + z * sliceB;
365
366		return read(element);
367	}
368
369	Color<float> Surface::Buffer::read(int x, int y) const
370	{
371		void *element = (unsigned char*)buffer + x * bytes + y * pitchB;
372
373		return read(element);
374	}
375
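	// Decodes one texel into a normalized floating-point color. Unsigned-normalized
	// fields are rescaled in place with a mask-and-multiply, e.g.
	// (argb & 0x00FF0000) * (1.0f / 0x00FF0000) maps the red byte to [0, 1] without
	// shifting it down first. Signed-normalized fields are clamped, e.g.
	// max(v * (1.0f / 0x7F), -1.0f) for 8-bit, so the most negative code stays at -1.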
376	inline Color<float> Surface::Buffer::read(void *element) const
377	{
378		float r = 0.0f;
379		float g = 0.0f;
380		float b = 0.0f;
381		float a = 1.0f;
382
383		switch(format)
384		{
385		case FORMAT_P8:
386			{
387				ASSERT(palette);
388
389				unsigned int abgr = palette[*(unsigned char*)element];
390
391				r = (abgr & 0x000000FF) * (1.0f / 0x000000FF);
392				g = (abgr & 0x0000FF00) * (1.0f / 0x0000FF00);
393				b = (abgr & 0x00FF0000) * (1.0f / 0x00FF0000);
394				a = (abgr & 0xFF000000) * (1.0f / 0xFF000000);
395			}
396			break;
397		case FORMAT_A8P8:
398			{
399				ASSERT(palette);
400
401				unsigned int bgr = palette[((unsigned char*)element)[0]];
402
403				r = (bgr & 0x000000FF) * (1.0f / 0x000000FF);
404				g = (bgr & 0x0000FF00) * (1.0f / 0x0000FF00);
405				b = (bgr & 0x00FF0000) * (1.0f / 0x00FF0000);
406				a = ((unsigned char*)element)[1] * (1.0f / 0xFF);
407			}
408			break;
409		case FORMAT_A8:
410			r = 0;
411			g = 0;
412			b = 0;
413			a = *(unsigned char*)element * (1.0f / 0xFF);
414			break;
415		case FORMAT_R8I_SNORM:
416			r = max((*(signed char*)element) * (1.0f / 0x7F), -1.0f);
417			break;
418		case FORMAT_R8:
419			r = *(unsigned char*)element * (1.0f / 0xFF);
420			break;
421		case FORMAT_R8I:
422			r = *(signed char*)element;
423			break;
424		case FORMAT_R8UI:
425			r = *(unsigned char*)element;
426			break;
427		case FORMAT_R3G3B2:
428			{
429				unsigned char rgb = *(unsigned char*)element;
430
431				r = (rgb & 0xE0) * (1.0f / 0xE0);
432				g = (rgb & 0x1C) * (1.0f / 0x1C);
433				b = (rgb & 0x03) * (1.0f / 0x03);
434			}
435			break;
436		case FORMAT_A8R3G3B2:
437			{
438				unsigned short argb = *(unsigned short*)element;
439
440				a = (argb & 0xFF00) * (1.0f / 0xFF00);
441				r = (argb & 0x00E0) * (1.0f / 0x00E0);
442				g = (argb & 0x001C) * (1.0f / 0x001C);
443				b = (argb & 0x0003) * (1.0f / 0x0003);
444			}
445			break;
446		case FORMAT_X4R4G4B4:
447			{
448				unsigned short rgb = *(unsigned short*)element;
449
450				r = (rgb & 0x0F00) * (1.0f / 0x0F00);
451				g = (rgb & 0x00F0) * (1.0f / 0x00F0);
452				b = (rgb & 0x000F) * (1.0f / 0x000F);
453			}
454			break;
455		case FORMAT_A4R4G4B4:
456			{
457				unsigned short argb = *(unsigned short*)element;
458
459				a = (argb & 0xF000) * (1.0f / 0xF000);
460				r = (argb & 0x0F00) * (1.0f / 0x0F00);
461				g = (argb & 0x00F0) * (1.0f / 0x00F0);
462				b = (argb & 0x000F) * (1.0f / 0x000F);
463			}
464			break;
465		case FORMAT_R4G4B4A4:
466			{
467				unsigned short rgba = *(unsigned short*)element;
468
469				r = (rgba & 0xF000) * (1.0f / 0xF000);
470				g = (rgba & 0x0F00) * (1.0f / 0x0F00);
471				b = (rgba & 0x00F0) * (1.0f / 0x00F0);
472				a = (rgba & 0x000F) * (1.0f / 0x000F);
473			}
474			break;
475		case FORMAT_R5G6B5:
476			{
477				unsigned short rgb = *(unsigned short*)element;
478
479				r = (rgb & 0xF800) * (1.0f / 0xF800);
480				g = (rgb & 0x07E0) * (1.0f / 0x07E0);
481				b = (rgb & 0x001F) * (1.0f / 0x001F);
482			}
483			break;
484		case FORMAT_A1R5G5B5:
485			{
486				unsigned short argb = *(unsigned short*)element;
487
488				a = (argb & 0x8000) * (1.0f / 0x8000);
489				r = (argb & 0x7C00) * (1.0f / 0x7C00);
490				g = (argb & 0x03E0) * (1.0f / 0x03E0);
491				b = (argb & 0x001F) * (1.0f / 0x001F);
492			}
493			break;
494		case FORMAT_R5G5B5A1:
495			{
496				unsigned short rgba = *(unsigned short*)element;
497
498				r = (rgba & 0xF800) * (1.0f / 0xF800);
499				g = (rgba & 0x07C0) * (1.0f / 0x07C0);
500				b = (rgba & 0x003E) * (1.0f / 0x003E);
501				a = (rgba & 0x0001) * (1.0f / 0x0001);
502			}
503			break;
504		case FORMAT_X1R5G5B5:
505			{
506				unsigned short xrgb = *(unsigned short*)element;
507
508				r = (xrgb & 0x7C00) * (1.0f / 0x7C00);
509				g = (xrgb & 0x03E0) * (1.0f / 0x03E0);
510				b = (xrgb & 0x001F) * (1.0f / 0x001F);
511			}
512			break;
513		case FORMAT_A8R8G8B8:
514			{
515				unsigned int argb = *(unsigned int*)element;
516
517				a = (argb & 0xFF000000) * (1.0f / 0xFF000000);
518				r = (argb & 0x00FF0000) * (1.0f / 0x00FF0000);
519				g = (argb & 0x0000FF00) * (1.0f / 0x0000FF00);
520				b = (argb & 0x000000FF) * (1.0f / 0x000000FF);
521			}
522			break;
523		case FORMAT_X8R8G8B8:
524			{
525				unsigned int xrgb = *(unsigned int*)element;
526
527				r = (xrgb & 0x00FF0000) * (1.0f / 0x00FF0000);
528				g = (xrgb & 0x0000FF00) * (1.0f / 0x0000FF00);
529				b = (xrgb & 0x000000FF) * (1.0f / 0x000000FF);
530			}
531			break;
532		case FORMAT_A8B8G8R8I_SNORM:
533			{
534				signed char* abgr = (signed char*)element;
535
536				r = max(abgr[0] * (1.0f / 0x7F), -1.0f);
537				g = max(abgr[1] * (1.0f / 0x7F), -1.0f);
538				b = max(abgr[2] * (1.0f / 0x7F), -1.0f);
539				a = max(abgr[3] * (1.0f / 0x7F), -1.0f);
540			}
541			break;
542		case FORMAT_A8B8G8R8:
543			{
544				unsigned int abgr = *(unsigned int*)element;
545
546				a = (abgr & 0xFF000000) * (1.0f / 0xFF000000);
547				b = (abgr & 0x00FF0000) * (1.0f / 0x00FF0000);
548				g = (abgr & 0x0000FF00) * (1.0f / 0x0000FF00);
549				r = (abgr & 0x000000FF) * (1.0f / 0x000000FF);
550			}
551			break;
552		case FORMAT_A8B8G8R8I:
553			{
554				signed char* abgr = (signed char*)element;
555
556				r = abgr[0];
557				g = abgr[1];
558				b = abgr[2];
559				a = abgr[3];
560			}
561			break;
562		case FORMAT_A8B8G8R8UI:
563			{
564				unsigned char* abgr = (unsigned char*)element;
565
566				r = abgr[0];
567				g = abgr[1];
568				b = abgr[2];
569				a = abgr[3];
570			}
571			break;
572		case FORMAT_X8B8G8R8I_SNORM:
573			{
574				signed char* bgr = (signed char*)element;
575
576				r = max(bgr[0] * (1.0f / 0x7F), -1.0f);
577				g = max(bgr[1] * (1.0f / 0x7F), -1.0f);
578				b = max(bgr[2] * (1.0f / 0x7F), -1.0f);
579			}
580			break;
581		case FORMAT_X8B8G8R8:
582			{
583				unsigned int xbgr = *(unsigned int*)element;
584
585				b = (xbgr & 0x00FF0000) * (1.0f / 0x00FF0000);
586				g = (xbgr & 0x0000FF00) * (1.0f / 0x0000FF00);
587				r = (xbgr & 0x000000FF) * (1.0f / 0x000000FF);
588			}
589			break;
590		case FORMAT_X8B8G8R8I:
591			{
592				signed char* bgr = (signed char*)element;
593
594				r = bgr[0];
595				g = bgr[1];
596				b = bgr[2];
597			}
598			break;
599		case FORMAT_X8B8G8R8UI:
600			{
601				unsigned char* bgr = (unsigned char*)element;
602
603				r = bgr[0];
604				g = bgr[1];
605				b = bgr[2];
606			}
607			break;
608		case FORMAT_G8R8I_SNORM:
609			{
610				signed char* gr = (signed char*)element;
611
612				r = max(gr[0] * (1.0f / 0x7F), -1.0f);
613				g = max(gr[1] * (1.0f / 0x7F), -1.0f);
614			}
615			break;
616		case FORMAT_G8R8:
617			{
618				unsigned short gr = *(unsigned short*)element;
619
620				g = (gr & 0xFF00) * (1.0f / 0xFF00);
621				r = (gr & 0x00FF) * (1.0f / 0x00FF);
622			}
623			break;
624		case FORMAT_G8R8I:
625			{
626				signed char* gr = (signed char*)element;
627
628				r = gr[0];
629				g = gr[1];
630			}
631			break;
632		case FORMAT_G8R8UI:
633			{
634				unsigned char* gr = (unsigned char*)element;
635
636				r = gr[0];
637				g = gr[1];
638			}
639			break;
640		case FORMAT_R16I:
641			r = *((short*)element);
642			break;
643		case FORMAT_R16UI:
644			r = *((unsigned short*)element);
645			break;
646		case FORMAT_G16R16I:
647			{
648				short* gr = (short*)element;
649
650				r = gr[0];
651				g = gr[1];
652			}
653			break;
654		case FORMAT_G16R16:
655			{
656				unsigned int gr = *(unsigned int*)element;
657
658				g = (gr & 0xFFFF0000) * (1.0f / 0xFFFF0000);
659				r = (gr & 0x0000FFFF) * (1.0f / 0x0000FFFF);
660			}
661			break;
662		case FORMAT_G16R16UI:
663			{
664				unsigned short* gr = (unsigned short*)element;
665
666				r = gr[0];
667				g = gr[1];
668			}
669			break;
670		case FORMAT_A2R10G10B10:
671			{
672				unsigned int argb = *(unsigned int*)element;
673
674				a = (argb & 0xC0000000) * (1.0f / 0xC0000000);
675				r = (argb & 0x3FF00000) * (1.0f / 0x3FF00000);
676				g = (argb & 0x000FFC00) * (1.0f / 0x000FFC00);
677				b = (argb & 0x000003FF) * (1.0f / 0x000003FF);
678			}
679			break;
680		case FORMAT_A2B10G10R10:
681			{
682				unsigned int abgr = *(unsigned int*)element;
683
684				a = (abgr & 0xC0000000) * (1.0f / 0xC0000000);
685				b = (abgr & 0x3FF00000) * (1.0f / 0x3FF00000);
686				g = (abgr & 0x000FFC00) * (1.0f / 0x000FFC00);
687				r = (abgr & 0x000003FF) * (1.0f / 0x000003FF);
688			}
689			break;
690		case FORMAT_A16B16G16R16I:
691			{
692				short* abgr = (short*)element;
693
694				r = abgr[0];
695				g = abgr[1];
696				b = abgr[2];
697				a = abgr[3];
698			}
699			break;
700		case FORMAT_A16B16G16R16:
701			r = ((unsigned short*)element)[0] * (1.0f / 0xFFFF);
702			g = ((unsigned short*)element)[1] * (1.0f / 0xFFFF);
703			b = ((unsigned short*)element)[2] * (1.0f / 0xFFFF);
704			a = ((unsigned short*)element)[3] * (1.0f / 0xFFFF);
705			break;
706		case FORMAT_A16B16G16R16UI:
707			{
708				unsigned short* abgr = (unsigned short*)element;
709
710				r = abgr[0];
711				g = abgr[1];
712				b = abgr[2];
713				a = abgr[3];
714			}
715			break;
716		case FORMAT_X16B16G16R16I:
717			{
718				short* bgr = (short*)element;
719
720				r = bgr[0];
721				g = bgr[1];
722				b = bgr[2];
723			}
724			break;
725		case FORMAT_X16B16G16R16UI:
726			{
727				unsigned short* bgr = (unsigned short*)element;
728
729				r = bgr[0];
730				g = bgr[1];
731				b = bgr[2];
732			}
733			break;
734		case FORMAT_A32B32G32R32I:
735			{
736				int* abgr = (int*)element;
737
738				r = static_cast<float>(abgr[0]);
739				g = static_cast<float>(abgr[1]);
740				b = static_cast<float>(abgr[2]);
741				a = static_cast<float>(abgr[3]);
742			}
743			break;
744		case FORMAT_A32B32G32R32UI:
745			{
746				unsigned int* abgr = (unsigned int*)element;
747
748				r = static_cast<float>(abgr[0]);
749				g = static_cast<float>(abgr[1]);
750				b = static_cast<float>(abgr[2]);
751				a = static_cast<float>(abgr[3]);
752			}
753			break;
754		case FORMAT_X32B32G32R32I:
755			{
756				int* bgr = (int*)element;
757
758				r = static_cast<float>(bgr[0]);
759				g = static_cast<float>(bgr[1]);
760				b = static_cast<float>(bgr[2]);
761			}
762			break;
763		case FORMAT_X32B32G32R32UI:
764			{
765				unsigned int* bgr = (unsigned int*)element;
766
767				r = static_cast<float>(bgr[0]);
768				g = static_cast<float>(bgr[1]);
769				b = static_cast<float>(bgr[2]);
770			}
771			break;
772		case FORMAT_G32R32I:
773			{
774				int* gr = (int*)element;
775
776				r = static_cast<float>(gr[0]);
777				g = static_cast<float>(gr[1]);
778			}
779			break;
780		case FORMAT_G32R32UI:
781			{
782				unsigned int* gr = (unsigned int*)element;
783
784				r = static_cast<float>(gr[0]);
785				g = static_cast<float>(gr[1]);
786			}
787			break;
788		case FORMAT_R32I:
789			r = static_cast<float>(*((int*)element));
790			break;
791		case FORMAT_R32UI:
792			r = static_cast<float>(*((unsigned int*)element));
793			break;
794		case FORMAT_V8U8:
795			{
796				unsigned short vu = *(unsigned short*)element;
797
798				r = ((int)(vu & 0x00FF) << 24) * (1.0f / 0x7F000000);
799				g = ((int)(vu & 0xFF00) << 16) * (1.0f / 0x7F000000);
800			}
801			break;
802		case FORMAT_L6V5U5:
803			{
804				unsigned short lvu = *(unsigned short*)element;
805
806				r = ((int)(lvu & 0x001F) << 27) * (1.0f / 0x78000000);
807				g = ((int)(lvu & 0x03E0) << 22) * (1.0f / 0x78000000);
808				b = (lvu & 0xFC00) * (1.0f / 0xFC00);
809			}
810			break;
811		case FORMAT_Q8W8V8U8:
812			{
813				unsigned int qwvu = *(unsigned int*)element;
814
815				r = ((int)(qwvu & 0x000000FF) << 24) * (1.0f / 0x7F000000);
816				g = ((int)(qwvu & 0x0000FF00) << 16) * (1.0f / 0x7F000000);
817				b = ((int)(qwvu & 0x00FF0000) << 8)  * (1.0f / 0x7F000000);
818				a = ((int)(qwvu & 0xFF000000) << 0)  * (1.0f / 0x7F000000);
819			}
820			break;
821		case FORMAT_X8L8V8U8:
822			{
823				unsigned int xlvu = *(unsigned int*)element;
824
825				r = ((int)(xlvu & 0x000000FF) << 24) * (1.0f / 0x7F000000);
826				g = ((int)(xlvu & 0x0000FF00) << 16) * (1.0f / 0x7F000000);
827				b = (xlvu & 0x00FF0000) * (1.0f / 0x00FF0000);
828			}
829			break;
830		case FORMAT_R8G8B8:
831			r = ((unsigned char*)element)[2] * (1.0f / 0xFF);
832			g = ((unsigned char*)element)[1] * (1.0f / 0xFF);
833			b = ((unsigned char*)element)[0] * (1.0f / 0xFF);
834			break;
835		case FORMAT_B8G8R8:
836			r = ((unsigned char*)element)[0] * (1.0f / 0xFF);
837			g = ((unsigned char*)element)[1] * (1.0f / 0xFF);
838			b = ((unsigned char*)element)[2] * (1.0f / 0xFF);
839			break;
840		case FORMAT_V16U16:
841			{
842				unsigned int vu = *(unsigned int*)element;
843
844				r = ((int)(vu & 0x0000FFFF) << 16) * (1.0f / 0x7FFF0000);
845				g = ((int)(vu & 0xFFFF0000) << 0)  * (1.0f / 0x7FFF0000);
846			}
847			break;
848		case FORMAT_A2W10V10U10:
849			{
850				unsigned int awvu = *(unsigned int*)element;
851
852				r = ((int)(awvu & 0x000003FF) << 22) * (1.0f / 0x7FC00000);
853				g = ((int)(awvu & 0x000FFC00) << 12) * (1.0f / 0x7FC00000);
854				b = ((int)(awvu & 0x3FF00000) << 2)  * (1.0f / 0x7FC00000);
855				a = (awvu & 0xC0000000) * (1.0f / 0xC0000000);
856			}
857			break;
858		case FORMAT_A16W16V16U16:
859			r = ((signed short*)element)[0] * (1.0f / 0x7FFF);
860			g = ((signed short*)element)[1] * (1.0f / 0x7FFF);
861			b = ((signed short*)element)[2] * (1.0f / 0x7FFF);
862			a = ((unsigned short*)element)[3] * (1.0f / 0xFFFF);
863			break;
864		case FORMAT_Q16W16V16U16:
865			r = ((signed short*)element)[0] * (1.0f / 0x7FFF);
866			g = ((signed short*)element)[1] * (1.0f / 0x7FFF);
867			b = ((signed short*)element)[2] * (1.0f / 0x7FFF);
868			a = ((signed short*)element)[3] * (1.0f / 0x7FFF);
869			break;
870		case FORMAT_L8:
871			r =
872			g =
873			b = *(unsigned char*)element * (1.0f / 0xFF);
874			break;
875		case FORMAT_A4L4:
876			{
877				unsigned char al = *(unsigned char*)element;
878
879				r =
880				g =
881				b = (al & 0x0F) * (1.0f / 0x0F);
882				a = (al & 0xF0) * (1.0f / 0xF0);
883			}
884			break;
885		case FORMAT_L16:
886			r =
887			g =
888			b = *(unsigned short*)element * (1.0f / 0xFFFF);
889			break;
890		case FORMAT_A8L8:
891			r =
892			g =
893			b = ((unsigned char*)element)[0] * (1.0f / 0xFF);
894			a = ((unsigned char*)element)[1] * (1.0f / 0xFF);
895			break;
896		case FORMAT_L16F:
897			r =
898			g =
899			b = *(half*)element;
900			break;
901		case FORMAT_A16L16F:
902			r =
903			g =
904			b = ((half*)element)[0];
905			a = ((half*)element)[1];
906			break;
907		case FORMAT_L32F:
908			r =
909			g =
910			b = *(float*)element;
911			break;
912		case FORMAT_A32L32F:
913			r =
914			g =
915			b = ((float*)element)[0];
916			a = ((float*)element)[1];
917			break;
918		case FORMAT_A16F:
919			a = *(half*)element;
920			break;
921		case FORMAT_R16F:
922			r = *(half*)element;
923			break;
924		case FORMAT_G16R16F:
925			r = ((half*)element)[0];
926			g = ((half*)element)[1];
927			break;
928		case FORMAT_B16G16R16F:
929			r = ((half*)element)[0];
930			g = ((half*)element)[1];
931			b = ((half*)element)[2];
932			break;
933		case FORMAT_A16B16G16R16F:
934			r = ((half*)element)[0];
935			g = ((half*)element)[1];
936			b = ((half*)element)[2];
937			a = ((half*)element)[3];
938			break;
939		case FORMAT_A32F:
940			a = *(float*)element;
941			break;
942		case FORMAT_R32F:
943			r = *(float*)element;
944			break;
945		case FORMAT_G32R32F:
946			r = ((float*)element)[0];
947			g = ((float*)element)[1];
948			break;
949		case FORMAT_B32G32R32F:
950			r = ((float*)element)[0];
951			g = ((float*)element)[1];
952			b = ((float*)element)[2];
953			break;
954		case FORMAT_A32B32G32R32F:
955			r = ((float*)element)[0];
956			g = ((float*)element)[1];
957			b = ((float*)element)[2];
958			a = ((float*)element)[3];
959			break;
960		case FORMAT_D32F:
961		case FORMAT_D32F_LOCKABLE:
962		case FORMAT_D32FS8_TEXTURE:
963		case FORMAT_D32FS8_SHADOW:
964			r = *(float*)element;
965			g = r;
966			b = r;
967			a = r;
968			break;
969		case FORMAT_D32F_COMPLEMENTARY:
970			r = 1.0f - *(float*)element;
971			g = r;
972			b = r;
973			a = r;
974			break;
975		case FORMAT_S8:
976			r = *(unsigned char*)element * (1.0f / 0xFF);
977			break;
978		default:
979			ASSERT(false);
980		}
981
982	//	if(sRGB)
983	//	{
984	//		r = sRGBtoLinear(r);
985	//		g = sRGBtoLinear(g);
986	//		b = sRGBtoLinear(b);
987	//	}
988
989		return Color<float>(r, g, b, a);
990	}
991
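	// Trilinear sample at unnormalized texel coordinates. Subtracting 0.5 moves from
	// texel centers to the integer lattice; e.g. x = 2.3 becomes 1.8, so x0 = 1,
	// x1 = 2 and fx = 0.8. The eight clamped neighbors are then blended with the
	// (1 - fx)(1 - fy)(1 - fz) ... fx * fy * fz weights. The two-argument overload
	// below does the same bilinearly.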
992	Color<float> Surface::Buffer::sample(float x, float y, float z) const
993	{
994		x -= 0.5f;
995		y -= 0.5f;
996		z -= 0.5f;
997
998		int x0 = clamp((int)x, 0, width - 1);
999		int x1 = (x0 + 1 >= width) ? x0 : x0 + 1;
1000
1001		int y0 = clamp((int)y, 0, height - 1);
1002		int y1 = (y0 + 1 >= height) ? y0 : y0 + 1;
1003
1004		int z0 = clamp((int)z, 0, depth - 1);
1005		int z1 = (z0 + 1 >= depth) ? z0 : z0 + 1;
1006
1007		Color<float> c000 = read(x0, y0, z0);
1008		Color<float> c100 = read(x1, y0, z0);
1009		Color<float> c010 = read(x0, y1, z0);
1010		Color<float> c110 = read(x1, y1, z0);
1011		Color<float> c001 = read(x0, y0, z1);
1012		Color<float> c101 = read(x1, y0, z1);
1013		Color<float> c011 = read(x0, y1, z1);
1014		Color<float> c111 = read(x1, y1, z1);
1015
1016		float fx = x - x0;
1017		float fy = y - y0;
1018		float fz = z - z0;
1019
1020		c000 *= (1 - fx) * (1 - fy) * (1 - fz);
1021		c100 *= fx * (1 - fy) * (1 - fz);
1022		c010 *= (1 - fx) * fy * (1 - fz);
1023		c110 *= fx * fy * (1 - fz);
1024		c001 *= (1 - fx) * (1 - fy) * fz;
1025		c101 *= fx * (1 - fy) * fz;
1026		c011 *= (1 - fx) * fy * fz;
1027		c111 *= fx * fy * fz;
1028
1029		return c000 + c100 + c010 + c110 + c001 + c101 + c011 + c111;
1030	}
1031
1032	Color<float> Surface::Buffer::sample(float x, float y) const
1033	{
1034		x -= 0.5f;
1035		y -= 0.5f;
1036
1037		int x0 = clamp((int)x, 0, width - 1);
1038		int x1 = (x0 + 1 >= width) ? x0 : x0 + 1;
1039
1040		int y0 = clamp((int)y, 0, height - 1);
1041		int y1 = (y0 + 1 >= height) ? y0 : y0 + 1;
1042
1043		Color<float> c00 = read(x0, y0);
1044		Color<float> c10 = read(x1, y0);
1045		Color<float> c01 = read(x0, y1);
1046		Color<float> c11 = read(x1, y1);
1047
1048		float fx = x - x0;
1049		float fy = y - y0;
1050
1051		c00 *= (1 - fx) * (1 - fy);
1052		c10 *= fx * (1 - fy);
1053		c01 *= (1 - fx) * fy;
1054		c11 *= fx * fy;
1055
1056		return c00 + c10 + c01 + c11;
1057	}
1058
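	// Returns a pointer to the locked texel, or, for block-compressed formats, to the
	// compressed block containing (x, y): the coordinates are divided by the block
	// footprint and scaled by the block size in bytes (8 for 64-bit blocks, 16 for
	// 128-bit blocks), so they are effectively rounded down to a block boundary.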
1059	void *Surface::Buffer::lockRect(int x, int y, int z, Lock lock)
1060	{
1061		this->lock = lock;
1062
1063		switch(lock)
1064		{
1065		case LOCK_UNLOCKED:
1066		case LOCK_READONLY:
1067			break;
1068		case LOCK_WRITEONLY:
1069		case LOCK_READWRITE:
1070		case LOCK_DISCARD:
1071			dirty = true;
1072			break;
1073		default:
1074			ASSERT(false);
1075		}
1076
1077		if(buffer)
1078		{
1079			switch(format)
1080			{
1081			#if S3TC_SUPPORT
1082			case FORMAT_DXT1:
1083			#endif
1084			case FORMAT_ATI1:
1085			case FORMAT_ETC1:
1086			case FORMAT_R11_EAC:
1087			case FORMAT_SIGNED_R11_EAC:
1088			case FORMAT_RGB8_ETC2:
1089			case FORMAT_SRGB8_ETC2:
1090			case FORMAT_RGB8_PUNCHTHROUGH_ALPHA1_ETC2:
1091			case FORMAT_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2:
1092				return (unsigned char*)buffer + 8 * (x / 4) + (y / 4) * pitchB + z * sliceB;
1093			case FORMAT_RG11_EAC:
1094			case FORMAT_SIGNED_RG11_EAC:
1095			case FORMAT_RGBA8_ETC2_EAC:
1096			case FORMAT_SRGB8_ALPHA8_ETC2_EAC:
1097			case FORMAT_RGBA_ASTC_4x4_KHR:
1098			case FORMAT_SRGB8_ALPHA8_ASTC_4x4_KHR:
1099				return (unsigned char*)buffer + 16 * (x / 4) + (y / 4) * pitchB + z * sliceB;
1100			case FORMAT_RGBA_ASTC_5x4_KHR:
1101			case FORMAT_SRGB8_ALPHA8_ASTC_5x4_KHR:
1102				return (unsigned char*)buffer + 16 * (x / 5) + (y / 4) * pitchB + z * sliceB;
1103			case FORMAT_RGBA_ASTC_5x5_KHR:
1104			case FORMAT_SRGB8_ALPHA8_ASTC_5x5_KHR:
1105				return (unsigned char*)buffer + 16 * (x / 5) + (y / 5) * pitchB + z * sliceB;
1106			case FORMAT_RGBA_ASTC_6x5_KHR:
1107			case FORMAT_SRGB8_ALPHA8_ASTC_6x5_KHR:
1108				return (unsigned char*)buffer + 16 * (x / 6) + (y / 5) * pitchB + z * sliceB;
1109			case FORMAT_RGBA_ASTC_6x6_KHR:
1110			case FORMAT_SRGB8_ALPHA8_ASTC_6x6_KHR:
1111				return (unsigned char*)buffer + 16 * (x / 6) + (y / 6) * pitchB + z * sliceB;
1112			case FORMAT_RGBA_ASTC_8x5_KHR:
1113			case FORMAT_SRGB8_ALPHA8_ASTC_8x5_KHR:
1114				return (unsigned char*)buffer + 16 * (x / 8) + (y / 5) * pitchB + z * sliceB;
1115			case FORMAT_RGBA_ASTC_8x6_KHR:
1116			case FORMAT_SRGB8_ALPHA8_ASTC_8x6_KHR:
1117				return (unsigned char*)buffer + 16 * (x / 8) + (y / 6) * pitchB + z * sliceB;
1118			case FORMAT_RGBA_ASTC_8x8_KHR:
1119			case FORMAT_SRGB8_ALPHA8_ASTC_8x8_KHR:
1120				return (unsigned char*)buffer + 16 * (x / 8) + (y / 8) * pitchB + z * sliceB;
1121			case FORMAT_RGBA_ASTC_10x5_KHR:
1122			case FORMAT_SRGB8_ALPHA8_ASTC_10x5_KHR:
1123				return (unsigned char*)buffer + 16 * (x / 10) + (y / 5) * pitchB + z * sliceB;
1124			case FORMAT_RGBA_ASTC_10x6_KHR:
1125			case FORMAT_SRGB8_ALPHA8_ASTC_10x6_KHR:
1126				return (unsigned char*)buffer + 16 * (x / 10) + (y / 6) * pitchB + z * sliceB;
1127			case FORMAT_RGBA_ASTC_10x8_KHR:
1128			case FORMAT_SRGB8_ALPHA8_ASTC_10x8_KHR:
1129				return (unsigned char*)buffer + 16 * (x / 10) + (y / 8) * pitchB + z * sliceB;
1130			case FORMAT_RGBA_ASTC_10x10_KHR:
1131			case FORMAT_SRGB8_ALPHA8_ASTC_10x10_KHR:
1132				return (unsigned char*)buffer + 16 * (x / 10) + (y / 10) * pitchB + z * sliceB;
1133			case FORMAT_RGBA_ASTC_12x10_KHR:
1134			case FORMAT_SRGB8_ALPHA8_ASTC_12x10_KHR:
1135				return (unsigned char*)buffer + 16 * (x / 12) + (y / 10) * pitchB + z * sliceB;
1136			case FORMAT_RGBA_ASTC_12x12_KHR:
1137			case FORMAT_SRGB8_ALPHA8_ASTC_12x12_KHR:
1138				return (unsigned char*)buffer + 16 * (x / 12) + (y / 12) * pitchB + z * sliceB;
1139			#if S3TC_SUPPORT
1140			case FORMAT_DXT3:
1141			case FORMAT_DXT5:
1142			#endif
1143			case FORMAT_ATI2:
1144				return (unsigned char*)buffer + 16 * (x / 4) + (y / 4) * pitchB + z * sliceB;
1145			default:
1146				return (unsigned char*)buffer + x * bytes + y * pitchB + z * sliceB;
1147			}
1148		}
1149
1150		return 0;
1151	}
1152
1153	void Surface::Buffer::unlockRect()
1154	{
1155		lock = LOCK_UNLOCKED;
1156	}
1157
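	// Wraps caller-provided pixel data. 'external' describes the client's view of the
	// surface (the given format, pitch and slice), 'internal' is a copy in the format
	// the renderer prefers (selectInternalFormat), allocated on demand, and 'stencil'
	// is a separate 8-bit plane for the stencil component of depth/stencil formats.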
1158	Surface::Surface(int width, int height, int depth, Format format, void *pixels, int pitch, int slice) : lockable(true), renderTarget(false)
1159	{
1160		resource = new Resource(0);
1161		hasParent = false;
1162		ownExternal = false;
1163		depth = max(1, depth);
1164
1165		external.buffer = pixels;
1166		external.width = width;
1167		external.height = height;
1168		external.depth = depth;
1169		external.format = format;
1170		external.bytes = bytes(external.format);
1171		external.pitchB = pitch;
1172		external.pitchP = pitch / external.bytes;
1173		external.sliceB = slice;
1174		external.sliceP = slice / external.bytes;
1175		external.lock = LOCK_UNLOCKED;
1176		external.dirty = true;
1177
1178		internal.buffer = 0;
1179		internal.width = width;
1180		internal.height = height;
1181		internal.depth = depth;
1182		internal.format = selectInternalFormat(format);
1183		internal.bytes = bytes(internal.format);
1184		internal.pitchB = pitchB(internal.width, internal.format, false);
1185		internal.pitchP = pitchP(internal.width, internal.format, false);
1186		internal.sliceB = sliceB(internal.width, internal.height, internal.format, false);
1187		internal.sliceP = sliceP(internal.width, internal.height, internal.format, false);
1188		internal.lock = LOCK_UNLOCKED;
1189		internal.dirty = false;
1190
1191		stencil.buffer = 0;
1192		stencil.width = width;
1193		stencil.height = height;
1194		stencil.depth = depth;
1195		stencil.format = FORMAT_S8;
1196		stencil.bytes = bytes(stencil.format);
1197		stencil.pitchB = pitchB(stencil.width, stencil.format, false);
1198		stencil.pitchP = pitchP(stencil.width, stencil.format, false);
1199		stencil.sliceB = sliceB(stencil.width, stencil.height, stencil.format, false);
1200		stencil.sliceP = sliceP(stencil.width, stencil.height, stencil.format, false);
1201		stencil.lock = LOCK_UNLOCKED;
1202		stencil.dirty = false;
1203
1204		dirtyMipmaps = true;
1205		paletteUsed = 0;
1206	}
1207
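	// Creates a surface that owns its external storage. When a parent 'texture'
	// resource is given, the surface attaches to it for synchronization instead of
	// creating its own Resource. Render targets get padded pitches (see pitchB).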
1208	Surface::Surface(Resource *texture, int width, int height, int depth, Format format, bool lockable, bool renderTarget) : lockable(lockable), renderTarget(renderTarget)
1209	{
1210		resource = texture ? texture : new Resource(0);
1211		hasParent = texture != 0;
1212		ownExternal = true;
1213		depth = max(1, depth);
1214
1215		external.buffer = 0;
1216		external.width = width;
1217		external.height = height;
1218		external.depth = depth;
1219		external.format = format;
1220		external.bytes = bytes(external.format);
1221		external.pitchB = pitchB(external.width, external.format, renderTarget && !texture);
1222		external.pitchP = pitchP(external.width, external.format, renderTarget && !texture);
1223		external.sliceB = sliceB(external.width, external.height, external.format, renderTarget && !texture);
1224		external.sliceP = sliceP(external.width, external.height, external.format, renderTarget && !texture);
1225		external.lock = LOCK_UNLOCKED;
1226		external.dirty = false;
1227
1228		internal.buffer = 0;
1229		internal.width = width;
1230		internal.height = height;
1231		internal.depth = depth;
1232		internal.format = selectInternalFormat(format);
1233		internal.bytes = bytes(internal.format);
1234		internal.pitchB = pitchB(internal.width, internal.format, renderTarget);
1235		internal.pitchP = pitchP(internal.width, internal.format, renderTarget);
1236		internal.sliceB = sliceB(internal.width, internal.height, internal.format, renderTarget);
1237		internal.sliceP = sliceP(internal.width, internal.height, internal.format, renderTarget);
1238		internal.lock = LOCK_UNLOCKED;
1239		internal.dirty = false;
1240
1241		stencil.buffer = 0;
1242		stencil.width = width;
1243		stencil.height = height;
1244		stencil.depth = depth;
1245		stencil.format = FORMAT_S8;
1246		stencil.bytes = bytes(stencil.format);
1247		stencil.pitchB = pitchB(stencil.width, stencil.format, renderTarget);
1248		stencil.pitchP = pitchP(stencil.width, stencil.format, renderTarget);
1249		stencil.sliceB = sliceB(stencil.width, stencil.height, stencil.format, renderTarget);
1250		stencil.sliceP = sliceP(stencil.width, stencil.height, stencil.format, renderTarget);
1251		stencil.lock = LOCK_UNLOCKED;
1252		stencil.dirty = false;
1253
1254		dirtyMipmaps = true;
1255		paletteUsed = 0;
1256	}
1257
1258	Surface::~Surface()
1259	{
1260		// Synchronize so we can deallocate the buffers below
1261		resource->lock(DESTRUCT);
1262		resource->unlock();
1263
1264		if(!hasParent)
1265		{
1266			resource->destruct();
1267		}
1268
1269		if(ownExternal)
1270		{
1271			deallocate(external.buffer);
1272		}
1273
1274		if(internal.buffer != external.buffer)
1275		{
1276			deallocate(internal.buffer);
1277		}
1278
1279		deallocate(stencil.buffer);
1280
1281		external.buffer = 0;
1282		internal.buffer = 0;
1283		stencil.buffer = 0;
1284	}
1285
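	// Locks the surface in its external (client-visible) format. If the internal copy
	// holds newer data it is converted back into the external buffer first, unless
	// the lock discards the contents. Any write lock marks the mipmap chain dirty.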
1286	void *Surface::lockExternal(int x, int y, int z, Lock lock, Accessor client)
1287	{
1288		resource->lock(client);
1289
1290		if(!external.buffer)
1291		{
1292			if(internal.buffer && identicalFormats())
1293			{
1294				external.buffer = internal.buffer;
1295			}
1296			else
1297			{
1298				external.buffer = allocateBuffer(external.width, external.height, external.depth, external.format);
1299			}
1300		}
1301
1302		if(internal.dirty)
1303		{
1304			if(lock != LOCK_DISCARD)
1305			{
1306				update(external, internal);
1307			}
1308
1309			internal.dirty = false;
1310		}
1311
1312		switch(lock)
1313		{
1314		case LOCK_READONLY:
1315			break;
1316		case LOCK_WRITEONLY:
1317		case LOCK_READWRITE:
1318		case LOCK_DISCARD:
1319			dirtyMipmaps = true;
1320			break;
1321		default:
1322			ASSERT(false);
1323		}
1324
1325		return external.lockRect(x, y, z, lock);
1326	}
1327
1328	void Surface::unlockExternal()
1329	{
1330		resource->unlock();
1331
1332		external.unlockRect();
1333	}
1334
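	// Locks the renderer's native-format copy. Stale data is pulled in from the
	// external buffer when it is dirty or when the global palette has changed since
	// the last update; a read-only lock by the PUBLIC client also triggers resolve()
	// before returning the pointer.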
1335	void *Surface::lockInternal(int x, int y, int z, Lock lock, Accessor client)
1336	{
1337		if(lock != LOCK_UNLOCKED)
1338		{
1339			resource->lock(client);
1340		}
1341
1342		if(!internal.buffer)
1343		{
1344			if(external.buffer && identicalFormats())
1345			{
1346				internal.buffer = external.buffer;
1347			}
1348			else
1349			{
1350				internal.buffer = allocateBuffer(internal.width, internal.height, internal.depth, internal.format);
1351			}
1352		}
1353
1354		// FIXME: WHQL requires conversion to lower external precision and back
1355		if(logPrecision >= WHQL)
1356		{
1357			if(internal.dirty && renderTarget && internal.format != external.format)
1358			{
1359				if(lock != LOCK_DISCARD)
1360				{
1361					switch(external.format)
1362					{
1363					case FORMAT_R3G3B2:
1364					case FORMAT_A8R3G3B2:
1365					case FORMAT_A1R5G5B5:
1366					case FORMAT_A2R10G10B10:
1367					case FORMAT_A2B10G10R10:
1368						lockExternal(0, 0, 0, LOCK_READWRITE, client);
1369						unlockExternal();
1370						break;
1371					default:
1372						// Difference passes WHQL
1373						break;
1374					}
1375				}
1376			}
1377		}
1378
1379		if(external.dirty || (isPalette(external.format) && paletteUsed != Surface::paletteID))
1380		{
1381			if(lock != LOCK_DISCARD)
1382			{
1383				update(internal, external);
1384			}
1385
1386			external.dirty = false;
1387			paletteUsed = Surface::paletteID;
1388		}
1389
1390		switch(lock)
1391		{
1392		case LOCK_UNLOCKED:
1393		case LOCK_READONLY:
1394			break;
1395		case LOCK_WRITEONLY:
1396		case LOCK_READWRITE:
1397		case LOCK_DISCARD:
1398			dirtyMipmaps = true;
1399			break;
1400		default:
1401			ASSERT(false);
1402		}
1403
1404		if(lock == LOCK_READONLY && client == PUBLIC)
1405		{
1406			resolve();
1407		}
1408
1409		return internal.lockRect(x, y, z, lock);
1410	}
1411
1412	void Surface::unlockInternal()
1413	{
1414		resource->unlock();
1415
1416		internal.unlockRect();
1417	}
1418
1419	void *Surface::lockStencil(int front, Accessor client)
1420	{
1421		resource->lock(client);
1422
1423		if(!stencil.buffer)
1424		{
1425			stencil.buffer = allocateBuffer(stencil.width, stencil.height, stencil.depth, stencil.format);
1426		}
1427
1428		return stencil.lockRect(0, 0, front, LOCK_READWRITE);   // FIXME
1429	}
1430
1431	void Surface::unlockStencil()
1432	{
1433		resource->unlock();
1434
1435		stencil.unlockRect();
1436	}
1437
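	// Bytes per pixel for uncompressed formats. For the block-compressed formats this
	// is the size of a single column of four pixels within a block (DXT1 blocks are
	// 8 bytes per 4x4, hence 2), and 0 where no meaningful per-pixel size exists yet.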
1438	int Surface::bytes(Format format)
1439	{
1440		switch(format)
1441		{
1442		case FORMAT_NULL:				return 0;
1443		case FORMAT_P8:					return 1;
1444		case FORMAT_A8P8:				return 2;
1445		case FORMAT_A8:					return 1;
1446		case FORMAT_R8I:				return 1;
1447		case FORMAT_R8:					return 1;
1448		case FORMAT_R3G3B2:				return 1;
1449		case FORMAT_R16I:				return 2;
1450		case FORMAT_R16UI:				return 2;
1451		case FORMAT_A8R3G3B2:			return 2;
1452		case FORMAT_R5G6B5:				return 2;
1453		case FORMAT_A1R5G5B5:			return 2;
1454		case FORMAT_X1R5G5B5:			return 2;
1455		case FORMAT_R5G5B5A1:           return 2;
1456		case FORMAT_X4R4G4B4:			return 2;
1457		case FORMAT_A4R4G4B4:			return 2;
1458		case FORMAT_R4G4B4A4:           return 2;
1459		case FORMAT_R8G8B8:				return 3;
1460		case FORMAT_B8G8R8:             return 3;
1461		case FORMAT_R32I:				return 4;
1462		case FORMAT_R32UI:				return 4;
1463		case FORMAT_X8R8G8B8:			return 4;
1464	//	case FORMAT_X8G8R8B8Q:			return 4;
1465		case FORMAT_A8R8G8B8:			return 4;
1466	//	case FORMAT_A8G8R8B8Q:			return 4;
1467		case FORMAT_X8B8G8R8I:			return 4;
1468		case FORMAT_X8B8G8R8:			return 4;
1469		case FORMAT_A8B8G8R8I:			return 4;
1470		case FORMAT_R8UI:				return 1;
1471		case FORMAT_G8R8UI:				return 2;
1472		case FORMAT_X8B8G8R8UI:			return 4;
1473		case FORMAT_A8B8G8R8UI:			return 4;
1474		case FORMAT_A8B8G8R8:			return 4;
1475		case FORMAT_R8I_SNORM:			return 1;
1476		case FORMAT_G8R8I_SNORM:		return 2;
1477		case FORMAT_X8B8G8R8I_SNORM:	return 4;
1478		case FORMAT_A8B8G8R8I_SNORM:	return 4;
1479		case FORMAT_A2R10G10B10:		return 4;
1480		case FORMAT_A2B10G10R10:		return 4;
1481		case FORMAT_G8R8I:				return 2;
1482		case FORMAT_G8R8:				return 2;
1483		case FORMAT_G16R16I:			return 4;
1484		case FORMAT_G16R16UI:			return 4;
1485		case FORMAT_G16R16:				return 4;
1486		case FORMAT_G32R32I:			return 8;
1487		case FORMAT_G32R32UI:			return 8;
1488		case FORMAT_X16B16G16R16I:		return 8;
1489		case FORMAT_X16B16G16R16UI:		return 8;
1490		case FORMAT_A16B16G16R16I:		return 8;
1491		case FORMAT_A16B16G16R16UI:		return 8;
1492		case FORMAT_A16B16G16R16:		return 8;
1493		case FORMAT_X32B32G32R32I:		return 16;
1494		case FORMAT_X32B32G32R32UI:		return 16;
1495		case FORMAT_A32B32G32R32I:		return 16;
1496		case FORMAT_A32B32G32R32UI:		return 16;
1497		// Compressed formats
1498		#if S3TC_SUPPORT
1499		case FORMAT_DXT1:				return 2;   // Column of four pixels
1500		case FORMAT_DXT3:				return 4;   // Column of four pixels
1501		case FORMAT_DXT5:				return 4;   // Column of four pixels
1502		#endif
1503		case FORMAT_ATI1:				return 2;   // Column of four pixels
1504		case FORMAT_ATI2:				return 4;   // Column of four pixels
1505		case FORMAT_ETC1:				return 2;   // Column of four pixels
1506		case FORMAT_R11_EAC:			return 2;
1507		case FORMAT_SIGNED_R11_EAC:		return 2;
1508		case FORMAT_RG11_EAC:			return 4;
1509		case FORMAT_SIGNED_RG11_EAC:	return 4;
1510		case FORMAT_RGB8_ETC2:			return 2;
1511		case FORMAT_SRGB8_ETC2:			return 2;
1512		case FORMAT_RGB8_PUNCHTHROUGH_ALPHA1_ETC2:	return 2;
1513		case FORMAT_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2:	return 2;
1514		case FORMAT_RGBA8_ETC2_EAC:			return 4;
1515		case FORMAT_SRGB8_ALPHA8_ETC2_EAC:	return 4;
1516		case FORMAT_RGBA_ASTC_4x4_KHR:
1517		case FORMAT_RGBA_ASTC_5x4_KHR:
1518		case FORMAT_RGBA_ASTC_5x5_KHR:
1519		case FORMAT_RGBA_ASTC_6x5_KHR:
1520		case FORMAT_RGBA_ASTC_6x6_KHR:
1521		case FORMAT_RGBA_ASTC_8x5_KHR:
1522		case FORMAT_RGBA_ASTC_8x6_KHR:
1523		case FORMAT_RGBA_ASTC_8x8_KHR:
1524		case FORMAT_RGBA_ASTC_10x5_KHR:
1525		case FORMAT_RGBA_ASTC_10x6_KHR:
1526		case FORMAT_RGBA_ASTC_10x8_KHR:
1527		case FORMAT_RGBA_ASTC_10x10_KHR:
1528		case FORMAT_RGBA_ASTC_12x10_KHR:
1529		case FORMAT_RGBA_ASTC_12x12_KHR:
1530		case FORMAT_SRGB8_ALPHA8_ASTC_4x4_KHR:
1531		case FORMAT_SRGB8_ALPHA8_ASTC_5x4_KHR:
1532		case FORMAT_SRGB8_ALPHA8_ASTC_5x5_KHR:
1533		case FORMAT_SRGB8_ALPHA8_ASTC_6x5_KHR:
1534		case FORMAT_SRGB8_ALPHA8_ASTC_6x6_KHR:
1535		case FORMAT_SRGB8_ALPHA8_ASTC_8x5_KHR:
1536		case FORMAT_SRGB8_ALPHA8_ASTC_8x6_KHR:
1537		case FORMAT_SRGB8_ALPHA8_ASTC_8x8_KHR:
1538		case FORMAT_SRGB8_ALPHA8_ASTC_10x5_KHR:
1539		case FORMAT_SRGB8_ALPHA8_ASTC_10x6_KHR:
1540		case FORMAT_SRGB8_ALPHA8_ASTC_10x8_KHR:
1541		case FORMAT_SRGB8_ALPHA8_ASTC_10x10_KHR:
1542		case FORMAT_SRGB8_ALPHA8_ASTC_12x10_KHR:
1543		case FORMAT_SRGB8_ALPHA8_ASTC_12x12_KHR: return 0; // FIXME
1544		// Bumpmap formats
1545		case FORMAT_V8U8:				return 2;
1546		case FORMAT_L6V5U5:				return 2;
1547		case FORMAT_Q8W8V8U8:			return 4;
1548		case FORMAT_X8L8V8U8:			return 4;
1549		case FORMAT_A2W10V10U10:		return 4;
1550		case FORMAT_V16U16:				return 4;
1551		case FORMAT_A16W16V16U16:		return 8;
1552		case FORMAT_Q16W16V16U16:		return 8;
1553		// Luminance formats
1554		case FORMAT_L8:					return 1;
1555		case FORMAT_A4L4:				return 1;
1556		case FORMAT_L16:				return 2;
1557		case FORMAT_A8L8:				return 2;
1558		case FORMAT_L16F:               return 2;
1559		case FORMAT_A16L16F:            return 4;
1560		case FORMAT_L32F:               return 4;
1561		case FORMAT_A32L32F:            return 8;
1562		// Floating-point formats
1563		case FORMAT_A16F:				return 2;
1564		case FORMAT_R16F:				return 2;
1565		case FORMAT_G16R16F:			return 4;
1566		case FORMAT_B16G16R16F:			return 6;
1567		case FORMAT_A16B16G16R16F:		return 8;
1568		case FORMAT_A32F:				return 4;
1569		case FORMAT_R32F:				return 4;
1570		case FORMAT_G32R32F:			return 8;
1571		case FORMAT_B32G32R32F:			return 12;
1572		case FORMAT_A32B32G32R32F:		return 16;
1573		// Depth/stencil formats
1574		case FORMAT_D16:				return 2;
1575		case FORMAT_D32:				return 4;
1576		case FORMAT_D24X8:				return 4;
1577		case FORMAT_D24S8:				return 4;
1578		case FORMAT_D24FS8:				return 4;
1579		case FORMAT_D32F:				return 4;
1580		case FORMAT_D32F_COMPLEMENTARY:	return 4;
1581		case FORMAT_D32F_LOCKABLE:		return 4;
1582		case FORMAT_D32FS8_TEXTURE:		return 4;
1583		case FORMAT_D32FS8_SHADOW:		return 4;
1584		case FORMAT_DF24S8:				return 4;
1585		case FORMAT_DF16S8:				return 2;
1586		case FORMAT_INTZ:				return 4;
1587		case FORMAT_S8:					return 1;
1588		case FORMAT_YV12_BT601:         return 1;   // Y plane only
1589		case FORMAT_YV12_BT709:         return 1;   // Y plane only
1590		case FORMAT_YV12_JFIF:          return 1;   // Y plane only
1591		default:
1592			ASSERT(false);
1593		}
1594
1595		return 0;
1596	}
1597
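	// Byte stride between rows, or between rows of blocks for compressed formats.
	// For example, a 10 texel wide ETC1 surface occupies ceil(10 / 4) = 3 blocks of
	// 8 bytes, giving a 24 byte pitch. Render targets and depth/stencil surfaces
	// round the width up to a multiple of 2, presumably so that 2x2 quads never have
	// to straddle the last column.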
1598	int Surface::pitchB(int width, Format format, bool target)
1599	{
1600		if(target || isDepth(format) || isStencil(format))
1601		{
1602			width = align(width, 2);
1603		}
1604
1605		switch(format)
1606		{
1607		#if S3TC_SUPPORT
1608		case FORMAT_DXT1:
1609		#endif
1610		case FORMAT_ETC1:
1611		case FORMAT_R11_EAC:
1612		case FORMAT_SIGNED_R11_EAC:
1613		case FORMAT_RGB8_ETC2:
1614		case FORMAT_SRGB8_ETC2:
1615		case FORMAT_RGB8_PUNCHTHROUGH_ALPHA1_ETC2:
1616		case FORMAT_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2:
1617			return 8 * ((width + 3) / 4);    // 64 bit per 4x4 block, computed per 4 rows
1618		case FORMAT_RG11_EAC:
1619		case FORMAT_SIGNED_RG11_EAC:
1620		case FORMAT_RGBA8_ETC2_EAC:
1621		case FORMAT_SRGB8_ALPHA8_ETC2_EAC:
1622		case FORMAT_RGBA_ASTC_4x4_KHR:
1623		case FORMAT_SRGB8_ALPHA8_ASTC_4x4_KHR:
1624			return 16 * ((width + 3) / 4);    // 128 bit per 4x4 block, computed per 4 rows
1625		case FORMAT_RGBA_ASTC_5x4_KHR:
1626		case FORMAT_SRGB8_ALPHA8_ASTC_5x4_KHR:
1627		case FORMAT_RGBA_ASTC_5x5_KHR:
1628		case FORMAT_SRGB8_ALPHA8_ASTC_5x5_KHR:
1629			return 16 * ((width + 4) / 5);
1630		case FORMAT_RGBA_ASTC_6x5_KHR:
1631		case FORMAT_SRGB8_ALPHA8_ASTC_6x5_KHR:
1632		case FORMAT_RGBA_ASTC_6x6_KHR:
1633		case FORMAT_SRGB8_ALPHA8_ASTC_6x6_KHR:
1634			return 16 * ((width + 5) / 6);
1635		case FORMAT_RGBA_ASTC_8x5_KHR:
1636		case FORMAT_SRGB8_ALPHA8_ASTC_8x5_KHR:
1637		case FORMAT_RGBA_ASTC_8x6_KHR:
1638		case FORMAT_SRGB8_ALPHA8_ASTC_8x6_KHR:
1639		case FORMAT_RGBA_ASTC_8x8_KHR:
1640		case FORMAT_SRGB8_ALPHA8_ASTC_8x8_KHR:
1641			return 16 * ((width + 7) / 8);
1642		case FORMAT_RGBA_ASTC_10x5_KHR:
1643		case FORMAT_SRGB8_ALPHA8_ASTC_10x5_KHR:
1644		case FORMAT_RGBA_ASTC_10x6_KHR:
1645		case FORMAT_SRGB8_ALPHA8_ASTC_10x6_KHR:
1646		case FORMAT_RGBA_ASTC_10x8_KHR:
1647		case FORMAT_SRGB8_ALPHA8_ASTC_10x8_KHR:
1648		case FORMAT_RGBA_ASTC_10x10_KHR:
1649		case FORMAT_SRGB8_ALPHA8_ASTC_10x10_KHR:
1650			return 16 * ((width + 9) / 10);
1651		case FORMAT_RGBA_ASTC_12x10_KHR:
1652		case FORMAT_SRGB8_ALPHA8_ASTC_12x10_KHR:
1653		case FORMAT_RGBA_ASTC_12x12_KHR:
1654		case FORMAT_SRGB8_ALPHA8_ASTC_12x12_KHR:
1655			return 16 * ((width + 11) / 12);
1656		#if S3TC_SUPPORT
1657		case FORMAT_DXT3:
1658		case FORMAT_DXT5:
1659			return 16 * ((width + 3) / 4);   // 128 bit per 4x4 block, computed per 4 rows
1660		#endif
1661		case FORMAT_ATI1:
1662			return 2 * ((width + 3) / 4);    // 64 bit per 4x4 block, computed per row
1663		case FORMAT_ATI2:
1664			return 4 * ((width + 3) / 4);    // 128 bit per 4x4 block, computed per row
1665		case FORMAT_YV12_BT601:
1666		case FORMAT_YV12_BT709:
1667		case FORMAT_YV12_JFIF:
1668			return align(width, 16);
1669		default:
1670			return bytes(format) * width;
1671		}
1672	}
1673
1674	int Surface::pitchP(int width, Format format, bool target)
1675	{
1676		int B = bytes(format);
1677
1678		return B > 0 ? pitchB(width, format, target) / B : 0;
1679	}
1680
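	// Bytes per 2D slice: the pitch times the number of rows (or rows of blocks)
	// covering the height. Targets and depth/stencil surfaces round the height up to
	// a multiple of 2, matching pitchB.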
1681	int Surface::sliceB(int width, int height, Format format, bool target)
1682	{
1683		if(target || isDepth(format) || isStencil(format))
1684		{
1685			height = ((height + 1) & ~1);
1686		}
1687
1688		switch(format)
1689		{
1690		#if S3TC_SUPPORT
1691		case FORMAT_DXT1:
1692		case FORMAT_DXT3:
1693		case FORMAT_DXT5:
1694		#endif
1695		case FORMAT_ETC1:
1696		case FORMAT_R11_EAC:
1697		case FORMAT_SIGNED_R11_EAC:
1698		case FORMAT_RG11_EAC:
1699		case FORMAT_SIGNED_RG11_EAC:
1700		case FORMAT_RGB8_ETC2:
1701		case FORMAT_SRGB8_ETC2:
1702		case FORMAT_RGB8_PUNCHTHROUGH_ALPHA1_ETC2:
1703		case FORMAT_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2:
1704		case FORMAT_RGBA8_ETC2_EAC:
1705		case FORMAT_SRGB8_ALPHA8_ETC2_EAC:
1706		case FORMAT_RGBA_ASTC_4x4_KHR:
1707		case FORMAT_SRGB8_ALPHA8_ASTC_4x4_KHR:
1708		case FORMAT_RGBA_ASTC_5x4_KHR:
1709		case FORMAT_SRGB8_ALPHA8_ASTC_5x4_KHR:
1710			return pitchB(width, format, target) * ((height + 3) / 4);   // Pitch computed per 4 rows
1711		case FORMAT_RGBA_ASTC_5x5_KHR:
1712		case FORMAT_SRGB8_ALPHA8_ASTC_5x5_KHR:
1713		case FORMAT_RGBA_ASTC_6x5_KHR:
1714		case FORMAT_SRGB8_ALPHA8_ASTC_6x5_KHR:
1715		case FORMAT_RGBA_ASTC_8x5_KHR:
1716		case FORMAT_SRGB8_ALPHA8_ASTC_8x5_KHR:
1717		case FORMAT_RGBA_ASTC_10x5_KHR:
1718		case FORMAT_SRGB8_ALPHA8_ASTC_10x5_KHR:
1719			return pitchB(width, format, target) * ((height + 4) / 5);   // Pitch computed per 5 rows
1720		case FORMAT_RGBA_ASTC_6x6_KHR:
1721		case FORMAT_SRGB8_ALPHA8_ASTC_6x6_KHR:
1722		case FORMAT_RGBA_ASTC_8x6_KHR:
1723		case FORMAT_SRGB8_ALPHA8_ASTC_8x6_KHR:
1724		case FORMAT_RGBA_ASTC_10x6_KHR:
1725		case FORMAT_SRGB8_ALPHA8_ASTC_10x6_KHR:
1726			return pitchB(width, format, target) * ((height + 5) / 6);   // Pitch computed per 6 rows
1727		case FORMAT_RGBA_ASTC_8x8_KHR:
1728		case FORMAT_SRGB8_ALPHA8_ASTC_8x8_KHR:
1729		case FORMAT_RGBA_ASTC_10x8_KHR:
1730		case FORMAT_SRGB8_ALPHA8_ASTC_10x8_KHR:
1731			return pitchB(width, format, target) * ((height + 7) / 8);   // Pitch computed per 8 rows
1732		case FORMAT_RGBA_ASTC_10x10_KHR:
1733		case FORMAT_SRGB8_ALPHA8_ASTC_10x10_KHR:
1734		case FORMAT_RGBA_ASTC_12x10_KHR:
1735		case FORMAT_SRGB8_ALPHA8_ASTC_12x10_KHR:
1736			return pitchB(width, format, target) * ((height + 9) / 10);   // Pitch computed per 10 rows
1737		case FORMAT_RGBA_ASTC_12x12_KHR:
1738		case FORMAT_SRGB8_ALPHA8_ASTC_12x12_KHR:
1739			return pitchB(width, format, target) * ((height + 11) / 12);   // Pitch computed per 12 rows
1740		case FORMAT_ATI1:
1741		case FORMAT_ATI2:
1742		default:
1743			return pitchB(width, format, target) * height;   // Pitch computed per row
1744		}
1745	}
1746
1747	int Surface::sliceP(int width, int height, Format format, bool target)
1748	{
1749		int B = bytes(format);
1750
1751		return B > 0 ? sliceB(width, height, format, target) / B : 0;
1752	}
1753
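	// Synchronizes two buffers of the same surface: as used by the decoders
	// below, the source is typically the external buffer and the destination
	// the internal one. Compressed and legacy formats are expanded by a
	// dedicated decode routine; everything else goes through genericUpdate().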
1754	void Surface::update(Buffer &destination, Buffer &source)
1755	{
1756	//	ASSERT(source.lock != LOCK_UNLOCKED);
1757	//	ASSERT(destination.lock != LOCK_UNLOCKED);
1758
1759		if(destination.buffer != source.buffer)
1760		{
1761			ASSERT(source.dirty && !destination.dirty);
1762
1763			switch(source.format)
1764			{
1765			case FORMAT_R8G8B8:		decodeR8G8B8(destination, source);		break;   // FIXME: Check destination format
1766			case FORMAT_X1R5G5B5:	decodeX1R5G5B5(destination, source);	break;   // FIXME: Check destination format
1767			case FORMAT_A1R5G5B5:	decodeA1R5G5B5(destination, source);	break;   // FIXME: Check destination format
1768			case FORMAT_X4R4G4B4:	decodeX4R4G4B4(destination, source);	break;   // FIXME: Check destination format
1769			case FORMAT_A4R4G4B4:	decodeA4R4G4B4(destination, source);	break;   // FIXME: Check destination format
1770			case FORMAT_P8:			decodeP8(destination, source);			break;   // FIXME: Check destination format
1771			#if S3TC_SUPPORT
1772			case FORMAT_DXT1:		decodeDXT1(destination, source);		break;   // FIXME: Check destination format
1773			case FORMAT_DXT3:		decodeDXT3(destination, source);		break;   // FIXME: Check destination format
1774			case FORMAT_DXT5:		decodeDXT5(destination, source);		break;   // FIXME: Check destination format
1775			#endif
1776			case FORMAT_ATI1:		decodeATI1(destination, source);		break;   // FIXME: Check destination format
1777			case FORMAT_ATI2:		decodeATI2(destination, source);		break;   // FIXME: Check destination format
1778			case FORMAT_R11_EAC:         decodeEAC(destination, source, 1, false); break; // FIXME: Check destination format
1779			case FORMAT_SIGNED_R11_EAC:  decodeEAC(destination, source, 1, true);  break; // FIXME: Check destination format
1780			case FORMAT_RG11_EAC:        decodeEAC(destination, source, 2, false); break; // FIXME: Check destination format
1781			case FORMAT_SIGNED_RG11_EAC: decodeEAC(destination, source, 2, true);  break; // FIXME: Check destination format
1782			case FORMAT_ETC1:
1783			case FORMAT_RGB8_ETC2:                      decodeETC2(destination, source, 0, false); break; // FIXME: Check destination format
1784			case FORMAT_SRGB8_ETC2:                     decodeETC2(destination, source, 0, true);  break; // FIXME: Check destination format
1785			case FORMAT_RGB8_PUNCHTHROUGH_ALPHA1_ETC2:  decodeETC2(destination, source, 1, false); break; // FIXME: Check destination format
1786			case FORMAT_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2: decodeETC2(destination, source, 1, true);  break; // FIXME: Check destination format
1787			case FORMAT_RGBA8_ETC2_EAC:                 decodeETC2(destination, source, 8, false); break; // FIXME: Check destination format
1788			case FORMAT_SRGB8_ALPHA8_ETC2_EAC:          decodeETC2(destination, source, 8, true);  break; // FIXME: Check destination format
1789			case FORMAT_RGBA_ASTC_4x4_KHR:           decodeASTC(destination, source, 4,  4,  1, false); break; // FIXME: Check destination format
1790			case FORMAT_RGBA_ASTC_5x4_KHR:           decodeASTC(destination, source, 5,  4,  1, false); break; // FIXME: Check destination format
1791			case FORMAT_RGBA_ASTC_5x5_KHR:           decodeASTC(destination, source, 5,  5,  1, false); break; // FIXME: Check destination format
1792			case FORMAT_RGBA_ASTC_6x5_KHR:           decodeASTC(destination, source, 6,  5,  1, false); break; // FIXME: Check destination format
1793			case FORMAT_RGBA_ASTC_6x6_KHR:           decodeASTC(destination, source, 6,  6,  1, false); break; // FIXME: Check destination format
1794			case FORMAT_RGBA_ASTC_8x5_KHR:           decodeASTC(destination, source, 8,  5,  1, false); break; // FIXME: Check destination format
1795			case FORMAT_RGBA_ASTC_8x6_KHR:           decodeASTC(destination, source, 8,  6,  1, false); break; // FIXME: Check destination format
1796			case FORMAT_RGBA_ASTC_8x8_KHR:           decodeASTC(destination, source, 8,  8,  1, false); break; // FIXME: Check destination format
1797			case FORMAT_RGBA_ASTC_10x5_KHR:          decodeASTC(destination, source, 10, 5,  1, false); break; // FIXME: Check destination format
1798			case FORMAT_RGBA_ASTC_10x6_KHR:          decodeASTC(destination, source, 10, 6,  1, false); break; // FIXME: Check destination format
1799			case FORMAT_RGBA_ASTC_10x8_KHR:          decodeASTC(destination, source, 10, 8,  1, false); break; // FIXME: Check destination format
1800			case FORMAT_RGBA_ASTC_10x10_KHR:         decodeASTC(destination, source, 10, 10, 1, false); break; // FIXME: Check destination format
1801			case FORMAT_RGBA_ASTC_12x10_KHR:         decodeASTC(destination, source, 12, 10, 1, false); break; // FIXME: Check destination format
1802			case FORMAT_RGBA_ASTC_12x12_KHR:         decodeASTC(destination, source, 12, 12, 1, false); break; // FIXME: Check destination format
1803			case FORMAT_SRGB8_ALPHA8_ASTC_4x4_KHR:   decodeASTC(destination, source, 4,  4,  1, true);  break; // FIXME: Check destination format
1804			case FORMAT_SRGB8_ALPHA8_ASTC_5x4_KHR:   decodeASTC(destination, source, 5,  4,  1, true);  break; // FIXME: Check destination format
1805			case FORMAT_SRGB8_ALPHA8_ASTC_5x5_KHR:   decodeASTC(destination, source, 5,  5,  1, true);  break; // FIXME: Check destination format
1806			case FORMAT_SRGB8_ALPHA8_ASTC_6x5_KHR:   decodeASTC(destination, source, 6,  5,  1, true);  break; // FIXME: Check destination format
1807			case FORMAT_SRGB8_ALPHA8_ASTC_6x6_KHR:   decodeASTC(destination, source, 6,  6,  1, true);  break; // FIXME: Check destination format
1808			case FORMAT_SRGB8_ALPHA8_ASTC_8x5_KHR:   decodeASTC(destination, source, 8,  5,  1, true);  break; // FIXME: Check destination format
1809			case FORMAT_SRGB8_ALPHA8_ASTC_8x6_KHR:   decodeASTC(destination, source, 8,  6,  1, true);  break; // FIXME: Check destination format
1810			case FORMAT_SRGB8_ALPHA8_ASTC_8x8_KHR:   decodeASTC(destination, source, 8,  8,  1, true);  break; // FIXME: Check destination format
1811			case FORMAT_SRGB8_ALPHA8_ASTC_10x5_KHR:  decodeASTC(destination, source, 10, 5,  1, true);  break; // FIXME: Check destination format
1812			case FORMAT_SRGB8_ALPHA8_ASTC_10x6_KHR:  decodeASTC(destination, source, 10, 6,  1, true);  break; // FIXME: Check destination format
1813			case FORMAT_SRGB8_ALPHA8_ASTC_10x8_KHR:  decodeASTC(destination, source, 10, 8,  1, true);  break; // FIXME: Check destination format
1814			case FORMAT_SRGB8_ALPHA8_ASTC_10x10_KHR: decodeASTC(destination, source, 10, 10, 1, true);  break; // FIXME: Check destination format
1815			case FORMAT_SRGB8_ALPHA8_ASTC_12x10_KHR: decodeASTC(destination, source, 12, 10, 1, true);  break; // FIXME: Check destination format
1816			case FORMAT_SRGB8_ALPHA8_ASTC_12x12_KHR: decodeASTC(destination, source, 12, 12, 1, true);  break; // FIXME: Check destination format
1817			default:				genericUpdate(destination, source);		break;
1818			}
1819		}
1820	}
1821
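	// Fallback conversion path: rows are copied with memcpy when both buffers
	// share a format, otherwise each texel is read as a normalized
	// Color<float> and rewritten in the destination format.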
1822	void Surface::genericUpdate(Buffer &destination, Buffer &source)
1823	{
1824		unsigned char *sourceSlice = (unsigned char*)source.buffer;
1825		unsigned char *destinationSlice = (unsigned char*)destination.buffer;
1826
1827		int depth = min(destination.depth, source.depth);
1828		int height = min(destination.height, source.height);
1829		int width = min(destination.width, source.width);
1830		int rowBytes = width * source.bytes;
1831
1832		for(int z = 0; z < depth; z++)
1833		{
1834			unsigned char *sourceRow = sourceSlice;
1835			unsigned char *destinationRow = destinationSlice;
1836
1837			for(int y = 0; y < height; y++)
1838			{
1839				if(source.format == destination.format)
1840				{
1841					memcpy(destinationRow, sourceRow, rowBytes);
1842				}
1843				else
1844				{
1845					unsigned char *sourceElement = sourceRow;
1846					unsigned char *destinationElement = destinationRow;
1847
1848					for(int x = 0; x < width; x++)
1849					{
1850						Color<float> color = source.read(sourceElement);
1851						destination.write(destinationElement, color);
1852
1853						sourceElement += source.bytes;
1854						destinationElement += destination.bytes;
1855					}
1856				}
1857
1858				sourceRow += source.pitchB;
1859				destinationRow += destination.pitchB;
1860			}
1861
1862			sourceSlice += source.sliceB;
1863			destinationSlice += destination.sliceB;
1864		}
1865	}
1866
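	// Expands packed 24-bit B/G/R triplets into 32-bit X8R8G8B8 words with
	// the alpha byte forced to 0xFF.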
1867	void Surface::decodeR8G8B8(Buffer &destination, const Buffer &source)
1868	{
1869		unsigned char *sourceSlice = (unsigned char*)source.buffer;
1870		unsigned char *destinationSlice = (unsigned char*)destination.buffer;
1871
1872		for(int z = 0; z < destination.depth && z < source.depth; z++)
1873		{
1874			unsigned char *sourceRow = sourceSlice;
1875			unsigned char *destinationRow = destinationSlice;
1876
1877			for(int y = 0; y < destination.height && y < source.height; y++)
1878			{
1879				unsigned char *sourceElement = sourceRow;
1880				unsigned char *destinationElement = destinationRow;
1881
1882				for(int x = 0; x < destination.width && x < source.width; x++)
1883				{
1884					unsigned int b = sourceElement[0];
1885					unsigned int g = sourceElement[1];
1886					unsigned int r = sourceElement[2];
1887
1888					*(unsigned int*)destinationElement = 0xFF000000 | (r << 16) | (g << 8) | (b << 0);
1889
1890					sourceElement += source.bytes;
1891					destinationElement += destination.bytes;
1892				}
1893
1894				sourceRow += source.pitchB;
1895				destinationRow += destination.pitchB;
1896			}
1897
1898			sourceSlice += source.sliceB;
1899			destinationSlice += destination.sliceB;
1900		}
1901	}
1902
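	// Expands X1R5G5B5 to 8 bits per channel. The multiplications are
	// fixed-point scalings by 255/31 (with rounding) that place each 5-bit
	// channel directly into its destination byte; alpha is forced to 0xFF.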
1903	void Surface::decodeX1R5G5B5(Buffer &destination, const Buffer &source)
1904	{
1905		unsigned char *sourceSlice = (unsigned char*)source.buffer;
1906		unsigned char *destinationSlice = (unsigned char*)destination.buffer;
1907
1908		for(int z = 0; z < destination.depth && z < source.depth; z++)
1909		{
1910			unsigned char *sourceRow = sourceSlice;
1911			unsigned char *destinationRow = destinationSlice;
1912
1913			for(int y = 0; y < destination.height && y < source.height; y++)
1914			{
1915				unsigned char *sourceElement = sourceRow;
1916				unsigned char *destinationElement = destinationRow;
1917
1918				for(int x = 0; x < destination.width && x < source.width; x++)
1919				{
1920					unsigned int xrgb = *(unsigned short*)sourceElement;
1921
1922					unsigned int r = (((xrgb & 0x7C00) * 134771 + 0x800000) >> 8) & 0x00FF0000;
1923					unsigned int g = (((xrgb & 0x03E0) * 16846 + 0x8000) >> 8) & 0x0000FF00;
1924					unsigned int b = (((xrgb & 0x001F) * 2106  + 0x80) >> 8);
1925
1926					*(unsigned int*)destinationElement = 0xFF000000 | r | g | b;
1927
1928					sourceElement += source.bytes;
1929					destinationElement += destination.bytes;
1930				}
1931
1932				sourceRow += source.pitchB;
1933				destinationRow += destination.pitchB;
1934			}
1935
1936			sourceSlice += source.sliceB;
1937			destinationSlice += destination.sliceB;
1938		}
1939	}
1940
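	// Same 5-bit channel expansion as decodeX1R5G5B5, with the single alpha
	// bit stretched to 0x00 or 0xFF (0x8000 * 130560 == 0xFF000000).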
1941	void Surface::decodeA1R5G5B5(Buffer &destination, const Buffer &source)
1942	{
1943		unsigned char *sourceSlice = (unsigned char*)source.buffer;
1944		unsigned char *destinationSlice = (unsigned char*)destination.buffer;
1945
1946		for(int z = 0; z < destination.depth && z < source.depth; z++)
1947		{
1948			unsigned char *sourceRow = sourceSlice;
1949			unsigned char *destinationRow = destinationSlice;
1950
1951			for(int y = 0; y < destination.height && y < source.height; y++)
1952			{
1953				unsigned char *sourceElement = sourceRow;
1954				unsigned char *destinationElement = destinationRow;
1955
1956				for(int x = 0; x < destination.width && x < source.width; x++)
1957				{
1958					unsigned int argb = *(unsigned short*)sourceElement;
1959
1960					unsigned int a =   (argb & 0x8000) * 130560;
1961					unsigned int r = (((argb & 0x7C00) * 134771 + 0x800000) >> 8) & 0x00FF0000;
1962					unsigned int g = (((argb & 0x03E0) * 16846  + 0x8000) >> 8) & 0x0000FF00;
1963					unsigned int b = (((argb & 0x001F) * 2106   + 0x80) >> 8);
1964
1965					*(unsigned int*)destinationElement = a | r | g | b;
1966
1967					sourceElement += source.bytes;
1968					destinationElement += destination.bytes;
1969				}
1970
1971				sourceRow += source.pitchB;
1972				destinationRow += destination.pitchB;
1973			}
1974
1975			sourceSlice += source.sliceB;
1976			destinationSlice += destination.sliceB;
1977		}
1978	}
1979
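	// Expands X4R4G4B4 to 8 bits per channel by replicating each 4-bit value
	// into both nibbles of its destination byte; alpha is forced to 0xFF.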
1980	void Surface::decodeX4R4G4B4(Buffer &destination, const Buffer &source)
1981	{
1982		unsigned char *sourceSlice = (unsigned char*)source.buffer;
1983		unsigned char *destinationSlice = (unsigned char*)destination.buffer;
1984
1985		for(int z = 0; z < destination.depth && z < source.depth; z++)
1986		{
1987			unsigned char *sourceRow = sourceSlice;
1988			unsigned char *destinationRow = destinationSlice;
1989
1990			for(int y = 0; y < destination.height && y < source.height; y++)
1991			{
1992				unsigned char *sourceElement = sourceRow;
1993				unsigned char *destinationElement = destinationRow;
1994
1995				for(int x = 0; x < destination.width && x < source.width; x++)
1996				{
1997					unsigned int xrgb = *(unsigned short*)sourceElement;
1998
1999					unsigned int r = ((xrgb & 0x0F00) * 0x00001100) & 0x00FF0000;
2000					unsigned int g = ((xrgb & 0x00F0) * 0x00000110) & 0x0000FF00;
2001					unsigned int b =  (xrgb & 0x000F) * 0x00000011;
2002
2003					*(unsigned int*)destinationElement = 0xFF000000 | r | g | b;
2004
2005					sourceElement += source.bytes;
2006					destinationElement += destination.bytes;
2007				}
2008
2009				sourceRow += source.pitchB;
2010				destinationRow += destination.pitchB;
2011			}
2012
2013			sourceSlice += source.sliceB;
2014			destinationSlice += destination.sliceB;
2015		}
2016	}
2017
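	// Same nibble replication as decodeX4R4G4B4, including the alpha channel.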
2018	void Surface::decodeA4R4G4B4(Buffer &destination, const Buffer &source)
2019	{
2020		unsigned char *sourceSlice = (unsigned char*)source.buffer;
2021		unsigned char *destinationSlice = (unsigned char*)destination.buffer;
2022
2023		for(int z = 0; z < destination.depth && z < source.depth; z++)
2024		{
2025			unsigned char *sourceRow = sourceSlice;
2026			unsigned char *destinationRow = destinationSlice;
2027
2028			for(int y = 0; y < destination.height && y < source.height; y++)
2029			{
2030				unsigned char *sourceElement = sourceRow;
2031				unsigned char *destinationElement = destinationRow;
2032
2033				for(int x = 0; x < destination.width && x < source.width; x++)
2034				{
2035					unsigned int argb = *(unsigned short*)sourceElement;
2036
2037					unsigned int a = ((argb & 0xF000) * 0x00011000) & 0xFF000000;
2038					unsigned int r = ((argb & 0x0F00) * 0x00001100) & 0x00FF0000;
2039					unsigned int g = ((argb & 0x00F0) * 0x00000110) & 0x0000FF00;
2040					unsigned int b =  (argb & 0x000F) * 0x00000011;
2041
2042					*(unsigned int*)destinationElement = a | r | g | b;
2043
2044					sourceElement += source.bytes;
2045					destinationElement += destination.bytes;
2046				}
2047
2048				sourceRow += source.pitchB;
2049				destinationRow += destination.pitchB;
2050			}
2051
2052			sourceSlice += source.sliceB;
2053			destinationSlice += destination.sliceB;
2054		}
2055	}
2056
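	// Resolves 8-bit palette indices through the current palette (stored as
	// A8B8G8R8) and swizzles the result into A8R8G8B8.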
2057	void Surface::decodeP8(Buffer &destination, const Buffer &source)
2058	{
2059		unsigned char *sourceSlice = (unsigned char*)source.buffer;
2060		unsigned char *destinationSlice = (unsigned char*)destination.buffer;
2061
2062		for(int z = 0; z < destination.depth && z < source.depth; z++)
2063		{
2064			unsigned char *sourceRow = sourceSlice;
2065			unsigned char *destinationRow = destinationSlice;
2066
2067			for(int y = 0; y < destination.height && y < source.height; y++)
2068			{
2069				unsigned char *sourceElement = sourceRow;
2070				unsigned char *destinationElement = destinationRow;
2071
2072				for(int x = 0; x < destination.width && x < source.width; x++)
2073				{
2074					unsigned int abgr = palette[*(unsigned char*)sourceElement];
2075
2076					unsigned int r = (abgr & 0x000000FF) << 16;
2077					unsigned int g = (abgr & 0x0000FF00) << 0;
2078					unsigned int b = (abgr & 0x00FF0000) >> 16;
2079					unsigned int a = (abgr & 0xFF000000) >> 0;
2080
2081					*(unsigned int*)destinationElement = a | r | g | b;
2082
2083					sourceElement += source.bytes;
2084					destinationElement += destination.bytes;
2085				}
2086
2087				sourceRow += source.pitchB;
2088				destinationRow += destination.pitchB;
2089			}
2090
2091			sourceSlice += source.sliceB;
2092			destinationSlice += destination.sliceB;
2093		}
2094	}
2095
2096#if S3TC_SUPPORT
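	// DXT1/BC1: each 8-byte block holds two R5G6B5 endpoints and a 2-bit
	// index per texel. When c0 > c1 the two intermediate colors lie at thirds
	// between the endpoints; otherwise c2 is their average and index 3 is
	// transparent black.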
2097	void Surface::decodeDXT1(Buffer &internal, const Buffer &external)
2098	{
2099		unsigned int *destSlice = (unsigned int*)internal.buffer;
2100		const DXT1 *source = (const DXT1*)external.buffer;
2101
2102		for(int z = 0; z < external.depth; z++)
2103		{
2104			unsigned int *dest = destSlice;
2105
2106			for(int y = 0; y < external.height; y += 4)
2107			{
2108				for(int x = 0; x < external.width; x += 4)
2109				{
2110					Color<byte> c[4];
2111
2112					c[0] = source->c0;
2113					c[1] = source->c1;
2114
2115					if(source->c0 > source->c1)   // No transparency
2116					{
2117						// c2 = 2 / 3 * c0 + 1 / 3 * c1
2118						c[2].r = (byte)((2 * (word)c[0].r + (word)c[1].r + 1) / 3);
2119						c[2].g = (byte)((2 * (word)c[0].g + (word)c[1].g + 1) / 3);
2120						c[2].b = (byte)((2 * (word)c[0].b + (word)c[1].b + 1) / 3);
2121						c[2].a = 0xFF;
2122
2123						// c3 = 1 / 3 * c0 + 2 / 3 * c1
2124						c[3].r = (byte)(((word)c[0].r + 2 * (word)c[1].r + 1) / 3);
2125						c[3].g = (byte)(((word)c[0].g + 2 * (word)c[1].g + 1) / 3);
2126						c[3].b = (byte)(((word)c[0].b + 2 * (word)c[1].b + 1) / 3);
2127						c[3].a = 0xFF;
2128					}
2129					else   // c3 transparent
2130					{
2131						// c2 = 1 / 2 * c0 + 1 / 2 * c1
2132						c[2].r = (byte)(((word)c[0].r + (word)c[1].r) / 2);
2133						c[2].g = (byte)(((word)c[0].g + (word)c[1].g) / 2);
2134						c[2].b = (byte)(((word)c[0].b + (word)c[1].b) / 2);
2135						c[2].a = 0xFF;
2136
2137						c[3].r = 0;
2138						c[3].g = 0;
2139						c[3].b = 0;
2140						c[3].a = 0;
2141					}
2142
2143					for(int j = 0; j < 4 && (y + j) < internal.height; j++)
2144					{
2145						for(int i = 0; i < 4 && (x + i) < internal.width; i++)
2146						{
2147							dest[(x + i) + (y + j) * internal.width] = c[(unsigned int)(source->lut >> 2 * (i + j * 4)) % 4];
2148						}
2149					}
2150
2151					source++;
2152				}
2153			}
2154
2155			(byte*&)destSlice += internal.sliceB;
2156		}
2157	}
2158
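	// DXT3/BC2: colors are interpolated as in the opaque DXT1 mode, and each
	// texel has an explicit 4-bit alpha that is replicated into a full byte
	// via (a << 28) + (a << 24).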
2159	void Surface::decodeDXT3(Buffer &internal, const Buffer &external)
2160	{
2161		unsigned int *destSlice = (unsigned int*)internal.buffer;
2162		const DXT3 *source = (const DXT3*)external.buffer;
2163
2164		for(int z = 0; z < external.depth; z++)
2165		{
2166			unsigned int *dest = destSlice;
2167
2168			for(int y = 0; y < external.height; y += 4)
2169			{
2170				for(int x = 0; x < external.width; x += 4)
2171				{
2172					Color<byte> c[4];
2173
2174					c[0] = source->c0;
2175					c[1] = source->c1;
2176
2177					// c2 = 2 / 3 * c0 + 1 / 3 * c1
2178					c[2].r = (byte)((2 * (word)c[0].r + (word)c[1].r + 1) / 3);
2179					c[2].g = (byte)((2 * (word)c[0].g + (word)c[1].g + 1) / 3);
2180					c[2].b = (byte)((2 * (word)c[0].b + (word)c[1].b + 1) / 3);
2181
2182					// c3 = 1 / 3 * c0 + 2 / 3 * c1
2183					c[3].r = (byte)(((word)c[0].r + 2 * (word)c[1].r + 1) / 3);
2184					c[3].g = (byte)(((word)c[0].g + 2 * (word)c[1].g + 1) / 3);
2185					c[3].b = (byte)(((word)c[0].b + 2 * (word)c[1].b + 1) / 3);
2186
2187					for(int j = 0; j < 4 && (y + j) < internal.height; j++)
2188					{
2189						for(int i = 0; i < 4 && (x + i) < internal.width; i++)
2190						{
2191							unsigned int a = (unsigned int)(source->a >> 4 * (i + j * 4)) & 0x0F;
2192							unsigned int color = (c[(unsigned int)(source->lut >> 2 * (i + j * 4)) % 4] & 0x00FFFFFF) | ((a << 28) + (a << 24));
2193
2194							dest[(x + i) + (y + j) * internal.width] = color;
2195						}
2196					}
2197
2198					source++;
2199				}
2200			}
2201
2202			(byte*&)destSlice += internal.sliceB;
2203		}
2204	}
2205
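	// DXT5/BC3: colors as in DXT3; alpha uses two 8-bit endpoints with 3-bit
	// indices. When a0 > a1 six interpolated values are generated, otherwise
	// four interpolated values plus explicit 0x00 and 0xFF.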
2206	void Surface::decodeDXT5(Buffer &internal, const Buffer &external)
2207	{
2208		unsigned int *destSlice = (unsigned int*)internal.buffer;
2209		const DXT5 *source = (const DXT5*)external.buffer;
2210
2211		for(int z = 0; z < external.depth; z++)
2212		{
2213			unsigned int *dest = destSlice;
2214
2215			for(int y = 0; y < external.height; y += 4)
2216			{
2217				for(int x = 0; x < external.width; x += 4)
2218				{
2219					Color<byte> c[4];
2220
2221					c[0] = source->c0;
2222					c[1] = source->c1;
2223
2224					// c2 = 2 / 3 * c0 + 1 / 3 * c1
2225					c[2].r = (byte)((2 * (word)c[0].r + (word)c[1].r + 1) / 3);
2226					c[2].g = (byte)((2 * (word)c[0].g + (word)c[1].g + 1) / 3);
2227					c[2].b = (byte)((2 * (word)c[0].b + (word)c[1].b + 1) / 3);
2228
2229					// c3 = 1 / 3 * c0 + 2 / 3 * c1
2230					c[3].r = (byte)(((word)c[0].r + 2 * (word)c[1].r + 1) / 3);
2231					c[3].g = (byte)(((word)c[0].g + 2 * (word)c[1].g + 1) / 3);
2232					c[3].b = (byte)(((word)c[0].b + 2 * (word)c[1].b + 1) / 3);
2233
2234					byte a[8];
2235
2236					a[0] = source->a0;
2237					a[1] = source->a1;
2238
2239					if(a[0] > a[1])
2240					{
2241						a[2] = (byte)((6 * (word)a[0] + 1 * (word)a[1] + 3) / 7);
2242						a[3] = (byte)((5 * (word)a[0] + 2 * (word)a[1] + 3) / 7);
2243						a[4] = (byte)((4 * (word)a[0] + 3 * (word)a[1] + 3) / 7);
2244						a[5] = (byte)((3 * (word)a[0] + 4 * (word)a[1] + 3) / 7);
2245						a[6] = (byte)((2 * (word)a[0] + 5 * (word)a[1] + 3) / 7);
2246						a[7] = (byte)((1 * (word)a[0] + 6 * (word)a[1] + 3) / 7);
2247					}
2248					else
2249					{
2250						a[2] = (byte)((4 * (word)a[0] + 1 * (word)a[1] + 2) / 5);
2251						a[3] = (byte)((3 * (word)a[0] + 2 * (word)a[1] + 2) / 5);
2252						a[4] = (byte)((2 * (word)a[0] + 3 * (word)a[1] + 2) / 5);
2253						a[5] = (byte)((1 * (word)a[0] + 4 * (word)a[1] + 2) / 5);
2254						a[6] = 0;
2255						a[7] = 0xFF;
2256					}
2257
2258					for(int j = 0; j < 4 && (y + j) < internal.height; j++)
2259					{
2260						for(int i = 0; i < 4 && (x + i) < internal.width; i++)
2261						{
2262							unsigned int alpha = (unsigned int)a[(unsigned int)(source->alut >> (16 + 3 * (i + j * 4))) % 8] << 24;
2263							unsigned int color = (c[(source->clut >> 2 * (i + j * 4)) % 4] & 0x00FFFFFF) | alpha;
2264
2265							dest[(x + i) + (y + j) * internal.width] = color;
2266						}
2267					}
2268
2269					source++;
2270				}
2271			}
2272
2273			(byte*&)destSlice += internal.sliceB;
2274		}
2275	}
2276#endif
2277
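	// ATI1/BC4: a single-channel variant of the DXT5 alpha block, decoded
	// into an 8-bit red channel.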
2278	void Surface::decodeATI1(Buffer &internal, const Buffer &external)
2279	{
2280		byte *destSlice = (byte*)internal.buffer;
2281		const ATI1 *source = (const ATI1*)external.buffer;
2282
2283		for(int z = 0; z < external.depth; z++)
2284		{
2285			byte *dest = destSlice;
2286
2287			for(int y = 0; y < external.height; y += 4)
2288			{
2289				for(int x = 0; x < external.width; x += 4)
2290				{
2291					byte r[8];
2292
2293					r[0] = source->r0;
2294					r[1] = source->r1;
2295
2296					if(r[0] > r[1])
2297					{
2298						r[2] = (byte)((6 * (word)r[0] + 1 * (word)r[1] + 3) / 7);
2299						r[3] = (byte)((5 * (word)r[0] + 2 * (word)r[1] + 3) / 7);
2300						r[4] = (byte)((4 * (word)r[0] + 3 * (word)r[1] + 3) / 7);
2301						r[5] = (byte)((3 * (word)r[0] + 4 * (word)r[1] + 3) / 7);
2302						r[6] = (byte)((2 * (word)r[0] + 5 * (word)r[1] + 3) / 7);
2303						r[7] = (byte)((1 * (word)r[0] + 6 * (word)r[1] + 3) / 7);
2304					}
2305					else
2306					{
2307						r[2] = (byte)((4 * (word)r[0] + 1 * (word)r[1] + 2) / 5);
2308						r[3] = (byte)((3 * (word)r[0] + 2 * (word)r[1] + 2) / 5);
2309						r[4] = (byte)((2 * (word)r[0] + 3 * (word)r[1] + 2) / 5);
2310						r[5] = (byte)((1 * (word)r[0] + 4 * (word)r[1] + 2) / 5);
2311						r[6] = 0;
2312						r[7] = 0xFF;
2313					}
2314
2315					for(int j = 0; j < 4 && (y + j) < internal.height; j++)
2316					{
2317						for(int i = 0; i < 4 && (x + i) < internal.width; i++)
2318						{
2319							dest[(x + i) + (y + j) * internal.width] = r[(unsigned int)(source->rlut >> (16 + 3 * (i + j * 4))) % 8];
2320						}
2321					}
2322
2323					source++;
2324				}
2325			}
2326
2327			destSlice += internal.sliceB;
2328		}
2329	}
2330
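	// ATI2/BC5 (3Dc): two independent BC4-style blocks decoded into an 8-bit
	// R/G pair per texel.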
2331	void Surface::decodeATI2(Buffer &internal, const Buffer &external)
2332	{
2333		word *destSlice = (word*)internal.buffer;
2334		const ATI2 *source = (const ATI2*)external.buffer;
2335
2336		for(int z = 0; z < external.depth; z++)
2337		{
2338			word *dest = destSlice;
2339
2340			for(int y = 0; y < external.height; y += 4)
2341			{
2342				for(int x = 0; x < external.width; x += 4)
2343				{
2344					byte X[8];
2345
2346					X[0] = source->x0;
2347					X[1] = source->x1;
2348
2349					if(X[0] > X[1])
2350					{
2351						X[2] = (byte)((6 * (word)X[0] + 1 * (word)X[1] + 3) / 7);
2352						X[3] = (byte)((5 * (word)X[0] + 2 * (word)X[1] + 3) / 7);
2353						X[4] = (byte)((4 * (word)X[0] + 3 * (word)X[1] + 3) / 7);
2354						X[5] = (byte)((3 * (word)X[0] + 4 * (word)X[1] + 3) / 7);
2355						X[6] = (byte)((2 * (word)X[0] + 5 * (word)X[1] + 3) / 7);
2356						X[7] = (byte)((1 * (word)X[0] + 6 * (word)X[1] + 3) / 7);
2357					}
2358					else
2359					{
2360						X[2] = (byte)((4 * (word)X[0] + 1 * (word)X[1] + 2) / 5);
2361						X[3] = (byte)((3 * (word)X[0] + 2 * (word)X[1] + 2) / 5);
2362						X[4] = (byte)((2 * (word)X[0] + 3 * (word)X[1] + 2) / 5);
2363						X[5] = (byte)((1 * (word)X[0] + 4 * (word)X[1] + 2) / 5);
2364						X[6] = 0;
2365						X[7] = 0xFF;
2366					}
2367
2368					byte Y[8];
2369
2370					Y[0] = source->y0;
2371					Y[1] = source->y1;
2372
2373					if(Y[0] > Y[1])
2374					{
2375						Y[2] = (byte)((6 * (word)Y[0] + 1 * (word)Y[1] + 3) / 7);
2376						Y[3] = (byte)((5 * (word)Y[0] + 2 * (word)Y[1] + 3) / 7);
2377						Y[4] = (byte)((4 * (word)Y[0] + 3 * (word)Y[1] + 3) / 7);
2378						Y[5] = (byte)((3 * (word)Y[0] + 4 * (word)Y[1] + 3) / 7);
2379						Y[6] = (byte)((2 * (word)Y[0] + 5 * (word)Y[1] + 3) / 7);
2380						Y[7] = (byte)((1 * (word)Y[0] + 6 * (word)Y[1] + 3) / 7);
2381					}
2382					else
2383					{
2384						Y[2] = (byte)((4 * (word)Y[0] + 1 * (word)Y[1] + 2) / 5);
2385						Y[3] = (byte)((3 * (word)Y[0] + 2 * (word)Y[1] + 2) / 5);
2386						Y[4] = (byte)((2 * (word)Y[0] + 3 * (word)Y[1] + 2) / 5);
2387						Y[5] = (byte)((1 * (word)Y[0] + 4 * (word)Y[1] + 2) / 5);
2388						Y[6] = 0;
2389						Y[7] = 0xFF;
2390					}
2391
2392					for(int j = 0; j < 4 && (y + j) < internal.height; j++)
2393					{
2394						for(int i = 0; i < 4 && (x + i) < internal.width; i++)
2395						{
2396							word r = X[(unsigned int)(source->xlut >> (16 + 3 * (i + j * 4))) % 8];
2397							word g = Y[(unsigned int)(source->ylut >> (16 + 3 * (i + j * 4))) % 8];
2398
2399							dest[(x + i) + (y + j) * internal.width] = (g << 8) + r;
2400						}
2401					}
2402
2403					source++;
2404				}
2405			}
2406
2407			(byte*&)destSlice += internal.sliceB;
2408		}
2409	}
2410
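	// Decodes ETC1/ETC2 blocks through ETC_Decoder. For the sRGB variants the
	// decoded RGB bytes are converted to linear space in place, using a
	// lazily initialized 256-entry lookup table.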
2411	void Surface::decodeETC2(Buffer &internal, const Buffer &external, int nbAlphaBits, bool isSRGB)
2412	{
2413		ETC_Decoder::Decode((const byte*)external.buffer, (byte*)internal.buffer, external.width, external.height, internal.width, internal.height, internal.pitchB, internal.bytes,
2414		                    (nbAlphaBits == 8) ? ETC_Decoder::ETC_RGBA : ((nbAlphaBits == 1) ? ETC_Decoder::ETC_RGB_PUNCHTHROUGH_ALPHA : ETC_Decoder::ETC_RGB));
2415
2416		if(isSRGB)
2417		{
2418			static byte sRGBtoLinearTable[256];
2419			static bool sRGBtoLinearTableDirty = true;
2420			if(sRGBtoLinearTableDirty)
2421			{
2422				for(int i = 0; i < 256; i++)
2423				{
2424					sRGBtoLinearTable[i] = static_cast<byte>(sRGBtoLinear(static_cast<float>(i) / 255.0f) * 255.0f + 0.5f);
2425				}
2426				sRGBtoLinearTableDirty = false;
2427			}
2428
2429			// Perform sRGB conversion in place after decoding
2430			byte* src = (byte*)internal.buffer;
2431			for(int y = 0; y < internal.height; y++)
2432			{
2433				byte* srcRow = src + y * internal.pitchB;
2434				for(int x = 0; x < internal.width; x++)
2435				{
2436					byte* srcPix = srcRow + x * internal.bytes;
2437					for(int i = 0; i < 3; i++)
2438					{
2439						srcPix[i] = sRGBtoLinearTable[srcPix[i]];
2440					}
2441				}
2442			}
2443		}
2444	}
2445
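	// Decodes EAC R11/RG11 blocks through ETC_Decoder. Signed data is then
	// expanded in place from 8-bit signed components to normalized floats,
	// iterating backwards so source bytes are not overwritten before they are
	// read.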
2446	void Surface::decodeEAC(Buffer &internal, const Buffer &external, int nbChannels, bool isSigned)
2447	{
2448		ASSERT(nbChannels == 1 || nbChannels == 2);
2449
2450		ETC_Decoder::Decode((const byte*)external.buffer, (byte*)internal.buffer, external.width, external.height, internal.width, internal.height, internal.pitchB, internal.bytes,
2451		                    (nbChannels == 1) ? (isSigned ? ETC_Decoder::ETC_R_SIGNED : ETC_Decoder::ETC_R_UNSIGNED) : (isSigned ? ETC_Decoder::ETC_RG_SIGNED : ETC_Decoder::ETC_RG_UNSIGNED));
2452
2453		// FIXME: We convert signed data to float, until signed integer internal formats are supported
2454		//        This code can be removed if signed ETC2 images are decoded to internal 8 bit signed R/RG formats
2455		if(isSigned)
2456		{
2457			sbyte* src = (sbyte*)internal.buffer;
2458
2459			for(int y = 0; y < internal.height; y++)
2460			{
2461				sbyte* srcRow = src + y * internal.pitchB;
2462				for(int x = internal.width - 1; x >= 0; x--)
2463				{
2464					int dx = x & 0xFFFFFFFC;
2465					int mx = x - dx;
2466					sbyte* srcPix = srcRow + dx * internal.bytes + mx * nbChannels;
2467					float* dstPix = (float*)(srcRow + x * internal.bytes);
2468					for(int c = nbChannels - 1; c >= 0; c--)
2469					{
2470						static const float normalization = 1.0f / 127.875f;
2471						dstPix[c] = clamp(static_cast<float>(srcPix[c]) * normalization, -1.0f, 1.0f);
2472					}
2473				}
2474			}
2475		}
2476	}
2477
2478	void Surface::decodeASTC(Buffer &internal, const Buffer &external, int xBlockSize, int yBlockSize, int zBlockSize, bool isSRGB)
2479	{
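		// ASTC decoding is not implemented; the destination buffer is left unmodified.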
2480	}
2481
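	// Total storage in bytes for a width x height x depth image. Compressed
	// formats round the dimensions up to the block size; the planar YV12
	// formats add two half-resolution chroma planes with 16-byte-aligned
	// strides.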
2482	unsigned int Surface::size(int width, int height, int depth, Format format)
2483	{
2484		// Dimensions rounded up to multiples of 4, used for compressed formats
2485		int width4 = align(width, 4);
2486		int height4 = align(height, 4);
2487
2488		switch(format)
2489		{
2490		#if S3TC_SUPPORT
2491		case FORMAT_DXT1:
2492		#endif
2493		case FORMAT_ATI1:
2494		case FORMAT_ETC1:
2495		case FORMAT_R11_EAC:
2496		case FORMAT_SIGNED_R11_EAC:
2497		case FORMAT_RGB8_ETC2:
2498		case FORMAT_SRGB8_ETC2:
2499		case FORMAT_RGB8_PUNCHTHROUGH_ALPHA1_ETC2:
2500		case FORMAT_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2:
2501			return width4 * height4 * depth / 2;
2502		#if S3TC_SUPPORT
2503		case FORMAT_DXT3:
2504		case FORMAT_DXT5:
2505		#endif
2506		case FORMAT_ATI2:
2507		case FORMAT_RG11_EAC:
2508		case FORMAT_SIGNED_RG11_EAC:
2509		case FORMAT_RGBA8_ETC2_EAC:
2510		case FORMAT_SRGB8_ALPHA8_ETC2_EAC:
2511		case FORMAT_RGBA_ASTC_4x4_KHR:
2512		case FORMAT_SRGB8_ALPHA8_ASTC_4x4_KHR:
2513			return width4 * height4 * depth;
2514		case FORMAT_RGBA_ASTC_5x4_KHR:
2515		case FORMAT_SRGB8_ALPHA8_ASTC_5x4_KHR:
2516			return align(width, 5) * height4 * depth;
2517		case FORMAT_RGBA_ASTC_5x5_KHR:
2518		case FORMAT_SRGB8_ALPHA8_ASTC_5x5_KHR:
2519			return align(width, 5) * align(height, 5) * depth;
2520		case FORMAT_RGBA_ASTC_6x5_KHR:
2521		case FORMAT_SRGB8_ALPHA8_ASTC_6x5_KHR:
2522			return align(width, 6) * align(height, 5) * depth;
2523		case FORMAT_RGBA_ASTC_6x6_KHR:
2524		case FORMAT_SRGB8_ALPHA8_ASTC_6x6_KHR:
2525			return align(width, 6) * align(height, 6) * depth;
2526		case FORMAT_RGBA_ASTC_8x5_KHR:
2527		case FORMAT_SRGB8_ALPHA8_ASTC_8x5_KHR:
2528			return align(width, 8) * align(height, 5) * depth;
2529		case FORMAT_RGBA_ASTC_8x6_KHR:
2530		case FORMAT_SRGB8_ALPHA8_ASTC_8x6_KHR:
2531			return align(width, 8) * align(height, 6) * depth;
2532		case FORMAT_RGBA_ASTC_8x8_KHR:
2533		case FORMAT_SRGB8_ALPHA8_ASTC_8x8_KHR:
2534			return align(width, 8) * align(height, 8) * depth;
2535		case FORMAT_RGBA_ASTC_10x5_KHR:
2536		case FORMAT_SRGB8_ALPHA8_ASTC_10x5_KHR:
2537			return align(width, 10) * align(height, 5) * depth;
2538		case FORMAT_RGBA_ASTC_10x6_KHR:
2539		case FORMAT_SRGB8_ALPHA8_ASTC_10x6_KHR:
2540			return align(width, 10) * align(height, 6) * depth;
2541		case FORMAT_RGBA_ASTC_10x8_KHR:
2542		case FORMAT_SRGB8_ALPHA8_ASTC_10x8_KHR:
2543			return align(width, 10) * align(height, 8) * depth;
2544		case FORMAT_RGBA_ASTC_10x10_KHR:
2545		case FORMAT_SRGB8_ALPHA8_ASTC_10x10_KHR:
2546			return align(width, 10) * align(height, 10) * depth;
2547		case FORMAT_RGBA_ASTC_12x10_KHR:
2548		case FORMAT_SRGB8_ALPHA8_ASTC_12x10_KHR:
2549			return align(width, 12) * align(height, 10) * depth;
2550		case FORMAT_RGBA_ASTC_12x12_KHR:
2551		case FORMAT_SRGB8_ALPHA8_ASTC_12x12_KHR:
2552			return align(width, 12) * align(height, 12) * depth;
2553		case FORMAT_YV12_BT601:
2554		case FORMAT_YV12_BT709:
2555		case FORMAT_YV12_JFIF:
2556			{
2557				unsigned int YStride = align(width, 16);
2558				unsigned int YSize = YStride * height;
2559				unsigned int CStride = align(YStride / 2, 16);
2560				unsigned int CSize = CStride * height / 2;
2561
2562				return YSize + 2 * CSize;
2563			}
2564		default:
2565			return bytes(format) * width * height * depth;
2566		}
2567
2568		return 0;
2569	}
2570
2571	bool Surface::isStencil(Format format)
2572	{
2573		switch(format)
2574		{
2575		case FORMAT_D32:
2576		case FORMAT_D16:
2577		case FORMAT_D24X8:
2578		case FORMAT_D32F:
2579		case FORMAT_D32F_COMPLEMENTARY:
2580		case FORMAT_D32F_LOCKABLE:
2581			return false;
2582		case FORMAT_D24S8:
2583		case FORMAT_D24FS8:
2584		case FORMAT_S8:
2585		case FORMAT_DF24S8:
2586		case FORMAT_DF16S8:
2587		case FORMAT_D32FS8_TEXTURE:
2588		case FORMAT_D32FS8_SHADOW:
2589		case FORMAT_INTZ:
2590			return true;
2591		default:
2592			return false;
2593		}
2594	}
2595
2596	bool Surface::isDepth(Format format)
2597	{
2598		switch(format)
2599		{
2600		case FORMAT_D32:
2601		case FORMAT_D16:
2602		case FORMAT_D24X8:
2603		case FORMAT_D24S8:
2604		case FORMAT_D24FS8:
2605		case FORMAT_D32F:
2606		case FORMAT_D32F_COMPLEMENTARY:
2607		case FORMAT_D32F_LOCKABLE:
2608		case FORMAT_DF24S8:
2609		case FORMAT_DF16S8:
2610		case FORMAT_D32FS8_TEXTURE:
2611		case FORMAT_D32FS8_SHADOW:
2612		case FORMAT_INTZ:
2613			return true;
2614		case FORMAT_S8:
2615			return false;
2616		default:
2617			return false;
2618		}
2619	}
2620
2621	bool Surface::isPalette(Format format)
2622	{
2623		switch(format)
2624		{
2625		case FORMAT_P8:
2626		case FORMAT_A8P8:
2627			return true;
2628		default:
2629			return false;
2630		}
2631	}
2632
2633	bool Surface::isFloatFormat(Format format)
2634	{
2635		switch(format)
2636		{
2637		case FORMAT_R5G6B5:
2638		case FORMAT_X8R8G8B8:
2639		case FORMAT_X8B8G8R8I:
2640		case FORMAT_X8B8G8R8:
2641		case FORMAT_A8R8G8B8:
2642		case FORMAT_A8B8G8R8I:
2643		case FORMAT_R8UI:
2644		case FORMAT_G8R8UI:
2645		case FORMAT_X8B8G8R8UI:
2646		case FORMAT_A8B8G8R8UI:
2647		case FORMAT_A8B8G8R8:
2648		case FORMAT_G8R8I:
2649		case FORMAT_G8R8:
2650		case FORMAT_R8I_SNORM:
2651		case FORMAT_G8R8I_SNORM:
2652		case FORMAT_X8B8G8R8I_SNORM:
2653		case FORMAT_A8B8G8R8I_SNORM:
2654		case FORMAT_R16I:
2655		case FORMAT_R16UI:
2656		case FORMAT_G16R16I:
2657		case FORMAT_G16R16UI:
2658		case FORMAT_G16R16:
2659		case FORMAT_X16B16G16R16I:
2660		case FORMAT_X16B16G16R16UI:
2661		case FORMAT_A16B16G16R16I:
2662		case FORMAT_A16B16G16R16UI:
2663		case FORMAT_A16B16G16R16:
2664		case FORMAT_V8U8:
2665		case FORMAT_Q8W8V8U8:
2666		case FORMAT_X8L8V8U8:
2667		case FORMAT_V16U16:
2668		case FORMAT_A16W16V16U16:
2669		case FORMAT_Q16W16V16U16:
2670		case FORMAT_A8:
2671		case FORMAT_R8I:
2672		case FORMAT_R8:
2673		case FORMAT_L8:
2674		case FORMAT_L16:
2675		case FORMAT_A8L8:
2676		case FORMAT_YV12_BT601:
2677		case FORMAT_YV12_BT709:
2678		case FORMAT_YV12_JFIF:
2679		case FORMAT_R32I:
2680		case FORMAT_R32UI:
2681		case FORMAT_G32R32I:
2682		case FORMAT_G32R32UI:
2683		case FORMAT_X32B32G32R32I:
2684		case FORMAT_X32B32G32R32UI:
2685		case FORMAT_A32B32G32R32I:
2686		case FORMAT_A32B32G32R32UI:
2687			return false;
2688		case FORMAT_R32F:
2689		case FORMAT_G32R32F:
2690		case FORMAT_A32B32G32R32F:
2691		case FORMAT_D32F:
2692		case FORMAT_D32F_COMPLEMENTARY:
2693		case FORMAT_D32F_LOCKABLE:
2694		case FORMAT_D32FS8_TEXTURE:
2695		case FORMAT_D32FS8_SHADOW:
2696		case FORMAT_L16F:
2697		case FORMAT_A16L16F:
2698		case FORMAT_L32F:
2699		case FORMAT_A32L32F:
2700			return true;
2701		default:
2702			ASSERT(false);
2703		}
2704
2705		return false;
2706	}
2707
2708	bool Surface::isUnsignedComponent(Format format, int component)
2709	{
2710		switch(format)
2711		{
2712		case FORMAT_NULL:
2713		case FORMAT_R5G6B5:
2714		case FORMAT_X8R8G8B8:
2715		case FORMAT_X8B8G8R8:
2716		case FORMAT_A8R8G8B8:
2717		case FORMAT_A8B8G8R8:
2718		case FORMAT_G8R8:
2719		case FORMAT_R16UI:
2720		case FORMAT_G16R16:
2721		case FORMAT_G16R16UI:
2722		case FORMAT_X16B16G16R16UI:
2723		case FORMAT_A16B16G16R16:
2724		case FORMAT_A16B16G16R16UI:
2725		case FORMAT_R32UI:
2726		case FORMAT_G32R32UI:
2727		case FORMAT_X32B32G32R32UI:
2728		case FORMAT_A32B32G32R32UI:
2729		case FORMAT_R8UI:
2730		case FORMAT_G8R8UI:
2731		case FORMAT_X8B8G8R8UI:
2732		case FORMAT_A8B8G8R8UI:
2733		case FORMAT_D32F:
2734		case FORMAT_D32F_COMPLEMENTARY:
2735		case FORMAT_D32F_LOCKABLE:
2736		case FORMAT_D32FS8_TEXTURE:
2737		case FORMAT_D32FS8_SHADOW:
2738		case FORMAT_A8:
2739		case FORMAT_R8:
2740		case FORMAT_L8:
2741		case FORMAT_L16:
2742		case FORMAT_A8L8:
2743		case FORMAT_YV12_BT601:
2744		case FORMAT_YV12_BT709:
2745		case FORMAT_YV12_JFIF:
2746			return true;
2747		case FORMAT_A8B8G8R8I:
2748		case FORMAT_A16B16G16R16I:
2749		case FORMAT_A32B32G32R32I:
2750		case FORMAT_A8B8G8R8I_SNORM:
2751		case FORMAT_Q8W8V8U8:
2752		case FORMAT_Q16W16V16U16:
2753		case FORMAT_A32B32G32R32F:
2754			return false;
2755		case FORMAT_R32F:
2756		case FORMAT_R8I:
2757		case FORMAT_R16I:
2758		case FORMAT_R32I:
2759		case FORMAT_R8I_SNORM:
2760			return component >= 1;
2761		case FORMAT_V8U8:
2762		case FORMAT_X8L8V8U8:
2763		case FORMAT_V16U16:
2764		case FORMAT_G32R32F:
2765		case FORMAT_G8R8I:
2766		case FORMAT_G16R16I:
2767		case FORMAT_G32R32I:
2768		case FORMAT_G8R8I_SNORM:
2769			return component >= 2;
2770		case FORMAT_A16W16V16U16:
2771		case FORMAT_X8B8G8R8I:
2772		case FORMAT_X16B16G16R16I:
2773		case FORMAT_X32B32G32R32I:
2774		case FORMAT_X8B8G8R8I_SNORM:
2775			return component >= 3;
2776		default:
2777			ASSERT(false);
2778		}
2779
2780		return false;
2781	}
2782
2783	bool Surface::isSRGBreadable(Format format)
2784	{
2785		// Keep in sync with Capabilities::isSRGBreadable
2786		switch(format)
2787		{
2788		case FORMAT_L8:
2789		case FORMAT_A8L8:
2790		case FORMAT_R8G8B8:
2791		case FORMAT_A8R8G8B8:
2792		case FORMAT_X8R8G8B8:
2793		case FORMAT_A8B8G8R8:
2794		case FORMAT_X8B8G8R8:
2795		case FORMAT_R5G6B5:
2796		case FORMAT_X1R5G5B5:
2797		case FORMAT_A1R5G5B5:
2798		case FORMAT_A4R4G4B4:
2799		#if S3TC_SUPPORT
2800		case FORMAT_DXT1:
2801		case FORMAT_DXT3:
2802		case FORMAT_DXT5:
2803		#endif
2804		case FORMAT_ATI1:
2805		case FORMAT_ATI2:
2806			return true;
2807		default:
2808			return false;
2809		}
2810
2811		return false;
2812	}
2813
2814	bool Surface::isSRGBwritable(Format format)
2815	{
2816		// Keep in sync with Capabilities::isSRGBwritable
2817		switch(format)
2818		{
2819		case FORMAT_NULL:
2820		case FORMAT_A8R8G8B8:
2821		case FORMAT_X8R8G8B8:
2822		case FORMAT_A8B8G8R8:
2823		case FORMAT_X8B8G8R8:
2824		case FORMAT_R5G6B5:
2825			return true;
2826		default:
2827			return false;
2828		}
2829	}
2830
2831	bool Surface::isCompressed(Format format)
2832	{
2833		switch(format)
2834		{
2835		#if S3TC_SUPPORT
2836		case FORMAT_DXT1:
2837		case FORMAT_DXT3:
2838		case FORMAT_DXT5:
2839		#endif
2840		case FORMAT_ATI1:
2841		case FORMAT_ATI2:
2842		case FORMAT_ETC1:
2843		case FORMAT_R11_EAC:
2844		case FORMAT_SIGNED_R11_EAC:
2845		case FORMAT_RG11_EAC:
2846		case FORMAT_SIGNED_RG11_EAC:
2847		case FORMAT_RGB8_ETC2:
2848		case FORMAT_SRGB8_ETC2:
2849		case FORMAT_RGB8_PUNCHTHROUGH_ALPHA1_ETC2:
2850		case FORMAT_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2:
2851		case FORMAT_RGBA8_ETC2_EAC:
2852		case FORMAT_SRGB8_ALPHA8_ETC2_EAC:
2853		case FORMAT_RGBA_ASTC_4x4_KHR:
2854		case FORMAT_RGBA_ASTC_5x4_KHR:
2855		case FORMAT_RGBA_ASTC_5x5_KHR:
2856		case FORMAT_RGBA_ASTC_6x5_KHR:
2857		case FORMAT_RGBA_ASTC_6x6_KHR:
2858		case FORMAT_RGBA_ASTC_8x5_KHR:
2859		case FORMAT_RGBA_ASTC_8x6_KHR:
2860		case FORMAT_RGBA_ASTC_8x8_KHR:
2861		case FORMAT_RGBA_ASTC_10x5_KHR:
2862		case FORMAT_RGBA_ASTC_10x6_KHR:
2863		case FORMAT_RGBA_ASTC_10x8_KHR:
2864		case FORMAT_RGBA_ASTC_10x10_KHR:
2865		case FORMAT_RGBA_ASTC_12x10_KHR:
2866		case FORMAT_RGBA_ASTC_12x12_KHR:
2867		case FORMAT_SRGB8_ALPHA8_ASTC_4x4_KHR:
2868		case FORMAT_SRGB8_ALPHA8_ASTC_5x4_KHR:
2869		case FORMAT_SRGB8_ALPHA8_ASTC_5x5_KHR:
2870		case FORMAT_SRGB8_ALPHA8_ASTC_6x5_KHR:
2871		case FORMAT_SRGB8_ALPHA8_ASTC_6x6_KHR:
2872		case FORMAT_SRGB8_ALPHA8_ASTC_8x5_KHR:
2873		case FORMAT_SRGB8_ALPHA8_ASTC_8x6_KHR:
2874		case FORMAT_SRGB8_ALPHA8_ASTC_8x8_KHR:
2875		case FORMAT_SRGB8_ALPHA8_ASTC_10x5_KHR:
2876		case FORMAT_SRGB8_ALPHA8_ASTC_10x6_KHR:
2877		case FORMAT_SRGB8_ALPHA8_ASTC_10x8_KHR:
2878		case FORMAT_SRGB8_ALPHA8_ASTC_10x10_KHR:
2879		case FORMAT_SRGB8_ALPHA8_ASTC_12x10_KHR:
2880		case FORMAT_SRGB8_ALPHA8_ASTC_12x12_KHR:
2881			return true;
2882		default:
2883			return false;
2884		}
2885	}
2886
2887	bool Surface::isNonNormalizedInteger(Format format)
2888	{
2889		switch(format)
2890		{
2891		case FORMAT_A8B8G8R8I:
2892		case FORMAT_X8B8G8R8I:
2893		case FORMAT_G8R8I:
2894		case FORMAT_R8I:
2895		case FORMAT_A8B8G8R8UI:
2896		case FORMAT_X8B8G8R8UI:
2897		case FORMAT_G8R8UI:
2898		case FORMAT_R8UI:
2899		case FORMAT_A16B16G16R16I:
2900		case FORMAT_X16B16G16R16I:
2901		case FORMAT_G16R16I:
2902		case FORMAT_R16I:
2903		case FORMAT_A16B16G16R16UI:
2904		case FORMAT_X16B16G16R16UI:
2905		case FORMAT_G16R16UI:
2906		case FORMAT_R16UI:
2907		case FORMAT_A32B32G32R32I:
2908		case FORMAT_X32B32G32R32I:
2909		case FORMAT_G32R32I:
2910		case FORMAT_R32I:
2911		case FORMAT_A32B32G32R32UI:
2912		case FORMAT_X32B32G32R32UI:
2913		case FORMAT_G32R32UI:
2914		case FORMAT_R32UI:
2915			return true;
2916		default:
2917			return false;
2918		}
2919	}
2920
2921	int Surface::componentCount(Format format)
2922	{
2923		switch(format)
2924		{
2925		case FORMAT_R5G6B5:         return 3;
2926		case FORMAT_X8R8G8B8:       return 3;
2927		case FORMAT_X8B8G8R8I:      return 3;
2928		case FORMAT_X8B8G8R8:       return 3;
2929		case FORMAT_A8R8G8B8:       return 4;
2930		case FORMAT_A8B8G8R8I:      return 4;
2931		case FORMAT_A8B8G8R8:       return 4;
2932		case FORMAT_G8R8I:          return 2;
2933		case FORMAT_G8R8:           return 2;
2934		case FORMAT_R8I_SNORM:      return 1;
2935		case FORMAT_G8R8I_SNORM:    return 2;
2936		case FORMAT_X8B8G8R8I_SNORM:return 3;
2937		case FORMAT_A8B8G8R8I_SNORM:return 4;
2938		case FORMAT_R8UI:           return 1;
2939		case FORMAT_G8R8UI:         return 2;
2940		case FORMAT_X8B8G8R8UI:     return 3;
2941		case FORMAT_A8B8G8R8UI:     return 4;
2942		case FORMAT_G16R16I:        return 2;
2943		case FORMAT_G16R16UI:       return 2;
2944		case FORMAT_G16R16:         return 2;
2945		case FORMAT_G32R32I:        return 2;
2946		case FORMAT_G32R32UI:       return 2;
2947		case FORMAT_X16B16G16R16I:  return 3;
2948		case FORMAT_X16B16G16R16UI: return 3;
2949		case FORMAT_A16B16G16R16I:  return 4;
2950		case FORMAT_A16B16G16R16UI: return 4;
2951		case FORMAT_A16B16G16R16:   return 4;
2952		case FORMAT_X32B32G32R32I:  return 3;
2953		case FORMAT_X32B32G32R32UI: return 3;
2954		case FORMAT_A32B32G32R32I:  return 4;
2955		case FORMAT_A32B32G32R32UI: return 4;
2956		case FORMAT_V8U8:           return 2;
2957		case FORMAT_Q8W8V8U8:       return 4;
2958		case FORMAT_X8L8V8U8:       return 3;
2959		case FORMAT_V16U16:         return 2;
2960		case FORMAT_A16W16V16U16:   return 4;
2961		case FORMAT_Q16W16V16U16:   return 4;
2962		case FORMAT_R32F:           return 1;
2963		case FORMAT_G32R32F:        return 2;
2964		case FORMAT_A32B32G32R32F:  return 4;
2965		case FORMAT_D32F:           return 1;
2966		case FORMAT_D32F_LOCKABLE:  return 1;
2967		case FORMAT_D32FS8_TEXTURE: return 1;
2968		case FORMAT_D32FS8_SHADOW:  return 1;
2969		case FORMAT_A8:             return 1;
2970		case FORMAT_R8I:            return 1;
2971		case FORMAT_R8:             return 1;
2972		case FORMAT_R16I:           return 1;
2973		case FORMAT_R16UI:          return 1;
2974		case FORMAT_R32I:           return 1;
2975		case FORMAT_R32UI:          return 1;
2976		case FORMAT_L8:             return 1;
2977		case FORMAT_L16:            return 1;
2978		case FORMAT_A8L8:           return 2;
2979		case FORMAT_YV12_BT601:     return 3;
2980		case FORMAT_YV12_BT709:     return 3;
2981		case FORMAT_YV12_JFIF:      return 3;
2982		default:
2983			ASSERT(false);
2984		}
2985
2986		return 1;
2987	}
2988
2989	void *Surface::allocateBuffer(int width, int height, int depth, Format format)
2990	{
2991		// Render targets require 2x2 quads
2992		int width2 = (width + 1) & ~1;
2993		int height2 = (height + 1) & ~1;
2994
2995		// FIXME: Unpacking byte4 to short4 in the sampler currently involves reading 8 bytes,
2996		// so we have to allocate 4 extra bytes to avoid buffer overruns.
2997		return allocateZero(size(width2, height2, depth, format) + 4);
2998	}
2999
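	// Fills a buffer with a repeating 32-bit pattern, e.g. a packed color or
	// the bits of a float depth value (as used by clearDepthBuffer). Leading
	// stores ramp up to 4-byte (and, with SSE, 16-byte) alignment, the bulk
	// is written with non-temporal 64-byte stores, and the tail is written
	// with progressively smaller stores.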
3000	void Surface::memfill4(void *buffer, int pattern, int bytes)
3001	{
3002		while((size_t)buffer & 0x1 && bytes >= 1)
3003		{
3004			*(char*)buffer = (char)pattern;
3005			(char*&)buffer += 1;
3006			bytes -= 1;
3007		}
3008
3009		while((size_t)buffer & 0x3 && bytes >= 2)
3010		{
3011			*(short*)buffer = (short)pattern;
3012			(short*&)buffer += 1;
3013			bytes -= 2;
3014		}
3015
3016		if(CPUID::supportsSSE())
3017		{
3018			while((size_t)buffer & 0xF && bytes >= 4)
3019			{
3020				*(int*)buffer = pattern;
3021				(int*&)buffer += 1;
3022				bytes -= 4;
3023			}
3024
3025			__m128 quad = _mm_set_ps1((float&)pattern);
3026
3027			float *pointer = (float*)buffer;
3028			int qxwords = bytes / 64;
3029			bytes -= qxwords * 64;
3030
3031			while(qxwords--)
3032			{
3033				_mm_stream_ps(pointer + 0, quad);
3034				_mm_stream_ps(pointer + 4, quad);
3035				_mm_stream_ps(pointer + 8, quad);
3036				_mm_stream_ps(pointer + 12, quad);
3037
3038				pointer += 16;
3039			}
3040
3041			buffer = pointer;
3042		}
3043
3044		while(bytes >= 4)
3045		{
3046			*(int*)buffer = (int)pattern;
3047			(int*&)buffer += 1;
3048			bytes -= 4;
3049		}
3050
3051		while(bytes >= 2)
3052		{
3053			*(short*)buffer = (short)pattern;
3054			(short*&)buffer += 1;
3055			bytes -= 2;
3056		}
3057
3058		while(bytes >= 1)
3059		{
3060			*(char*)buffer = (char)pattern;
3061			(char*&)buffer += 1;
3062			bytes -= 1;
3063		}
3064	}
3065
3066	bool Surface::isEntire(const SliceRect& rect) const
3067	{
3068		return (rect.x0 == 0 && rect.y0 == 0 && rect.x1 == internal.width && rect.y1 == internal.height && internal.depth == 1);
3069	}
3070
3071	bool Surface::getClearRect(int x0, int y0, int width, int height, SliceRect& rect) const
3072	{
3073		// Not overlapping
3074		if(x0 > internal.width) return false;
3075		if(y0 > internal.height) return false;
3076		if(x0 + width < 0) return false;
3077		if(y0 + height < 0) return false;
3078
3079		// Clip against dimensions
3080		if(x0 < 0) { width += x0; x0 = 0; }
3081		if(x0 + width > internal.width) width = internal.width - x0;
3082		if(y0 < 0) { height += y0; y0 = 0; }
3083		if(y0 + height > internal.height) height = internal.height - y0;
3084
3085		rect.x0 = x0;
3086		rect.x1 = x0 + width;
3087		rect.y0 = y0;
3088		rect.y1 = y0 + height;
3089
3090		return true;
3091	}
3092
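	// Clears a sub-rectangle of the depth buffer. Lockable float formats are
	// stored linearly and filled row by row; other formats use the 2x2 quad
	// layout, where even/odd row pairs are interleaved and filled together
	// when possible. In the quad-layout path the value is stored complemented
	// (1 - depth) when a complementary depth buffer is in use.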
3093	void Surface::clearDepthBuffer(float depth, int x0, int y0, int width, int height)
3094	{
3095		// Not overlapping
3096		if(x0 > internal.width) return;
3097		if(y0 > internal.height) return;
3098		if(x0 + width < 0) return;
3099		if(y0 + height < 0) return;
3100
3101		// Clip against dimensions
3102		if(x0 < 0) {width += x0; x0 = 0;}
3103		if(x0 + width > internal.width) width = internal.width - x0;
3104		if(y0 < 0) {height += y0; y0 = 0;}
3105		if(y0 + height > internal.height) height = internal.height - y0;
3106
3107		const bool entire = x0 == 0 && y0 == 0 && width == internal.width && height == internal.height;
3108		const Lock lock = entire ? LOCK_DISCARD : LOCK_WRITEONLY;
3109
3110		int width2 = (internal.width + 1) & ~1;
3111
3112		int x1 = x0 + width;
3113		int y1 = y0 + height;
3114
3115		if(internal.format == FORMAT_D32F_LOCKABLE ||
3116		   internal.format == FORMAT_D32FS8_TEXTURE ||
3117		   internal.format == FORMAT_D32FS8_SHADOW)
3118		{
3119			float *target = (float*)lockInternal(0, 0, 0, lock, PUBLIC) + x0 + width2 * y0;
3120
3121			for(int z = 0; z < internal.depth; z++)
3122			{
3123				for(int y = y0; y < y1; y++)
3124				{
3125					memfill4(target, (int&)depth, 4 * width);
3126					target += width2;
3127				}
3128			}
3129
3130			unlockInternal();
3131		}
3132		else   // Quad layout
3133		{
3134			if(complementaryDepthBuffer)
3135			{
3136				depth = 1 - depth;
3137			}
3138
3139			float *buffer = (float*)lockInternal(0, 0, 0, lock, PUBLIC);
3140
3141			for(int z = 0; z < internal.depth; z++)
3142			{
3143				for(int y = y0; y < y1; y++)
3144				{
3145					float *target = buffer + (y & ~1) * width2 + (y & 1) * 2;
3146
3147					if((y & 1) == 0 && y + 1 < y1)   // Fill quad line at once
3148					{
3149						if((x0 & 1) != 0)
3150						{
3151							target[(x0 & ~1) * 2 + 1] = depth;
3152							target[(x0 & ~1) * 2 + 3] = depth;
3153						}
3154
3155					//	for(int x2 = ((x0 + 1) & ~1) * 2; x2 < x1 * 2; x2 += 4)
3156					//	{
3157					//		target[x2 + 0] = depth;
3158					//		target[x2 + 1] = depth;
3159					//		target[x2 + 2] = depth;
3160					//		target[x2 + 3] = depth;
3161					//	}
3162
3163					//	__asm
3164					//	{
3165					//		movss xmm0, depth
3166					//		shufps xmm0, xmm0, 0x00
3167					//
3168					//		mov eax, x0
3169					//		add eax, 1
3170					//		and eax, 0xFFFFFFFE
3171					//		cmp eax, x1
3172					//		jge qEnd
3173					//
3174					//		mov edi, target
3175					//
3176					//	qLoop:
3177					//		movntps [edi+8*eax], xmm0
3178					//
3179					//		add eax, 2
3180					//		cmp eax, x1
3181					//		jl qLoop
3182					//	qEnd:
3183					//	}
3184
3185						memfill4(&target[((x0 + 1) & ~1) * 2], (int&)depth, 8 * ((x1 & ~1) - ((x0 + 1) & ~1)));
3186
3187						if((x1 & 1) != 0)
3188						{
3189							target[(x1 & ~1) * 2 + 0] = depth;
3190							target[(x1 & ~1) * 2 + 2] = depth;
3191						}
3192
3193						y++;
3194					}
3195					else
3196					{
3197						for(int x = x0; x < x1; x++)
3198						{
3199							target[(x & ~1) * 2 + (x & 1)] = depth;
3200						}
3201					}
3202				}
3203
3204				buffer += internal.sliceP;
3205			}
3206
3207			unlockInternal();
3208		}
3209	}
3210
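	// Clears a sub-rectangle of the stencil buffer. The stencil value is
	// masked and replicated into all four bytes of a 32-bit pattern so whole
	// quad rows can be filled with memfill4.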
3211	void Surface::clearStencilBuffer(unsigned char s, unsigned char mask, int x0, int y0, int width, int height)
3212	{
3213		// Not overlapping
3214		if(x0 > internal.width) return;
3215		if(y0 > internal.height) return;
3216		if(x0 + width < 0) return;
3217		if(y0 + height < 0) return;
3218
3219		// Clip against dimensions
3220		if(x0 < 0) {width += x0; x0 = 0;}
3221		if(x0 + width > internal.width) width = internal.width - x0;
3222		if(y0 < 0) {height += y0; y0 = 0;}
3223		if(y0 + height > internal.height) height = internal.height - y0;
3224
3225		int width2 = (internal.width + 1) & ~1;
3226
3227		int x1 = x0 + width;
3228		int y1 = y0 + height;
3229
3230		unsigned char maskedS = s & mask;
3231		unsigned char invMask = ~mask;
3232		unsigned int fill = maskedS;
3233		fill = fill | (fill << 8) | (fill << 16) | (fill << 24);
3234
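		// Linear layout path, currently disabled in favor of the quad layout below.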
3235		if(false)
3236		{
3237			char *target = (char*)lockStencil(0, PUBLIC) + x0 + width2 * y0;
3238
3239			for(int z = 0; z < stencil.depth; z++)
3240			{
3241				for(int y = y0; y < y0 + height; y++)
3242				{
3243					if(mask == 0xFF)
3244					{
3245						memfill4(target, fill, width);
3246					}
3247					else
3248					{
3249						for(int x = 0; x < width; x++)
3250						{
3251							target[x] = maskedS | (target[x] & invMask);
3252						}
3253					}
3254
3255					target += width2;
3256				}
3257			}
3258
3259			unlockStencil();
3260		}
3261		else   // Quad layout
3262		{
3263			char *buffer = (char*)lockStencil(0, PUBLIC);
3264
3265			if(mask == 0xFF)
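			// Note: with a partial write mask (mask != 0xFF) this quad-layout
			// path currently writes nothing, since the entire loop is guarded
			// by the test below.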
3266			{
3267				for(int z = 0; z < stencil.depth; z++)
3268				{
3269					for(int y = y0; y < y1; y++)
3270					{
3271						char *target = buffer + (y & ~1) * width2 + (y & 1) * 2;
3272
3273						if((y & 1) == 0 && y + 1 < y1 && mask == 0xFF)   // Fill quad line at once
3274						{
3275							if((x0 & 1) != 0)
3276							{
3277								target[(x0 & ~1) * 2 + 1] = fill;
3278								target[(x0 & ~1) * 2 + 3] = fill;
3279							}
3280
3281							memfill4(&target[((x0 + 1) & ~1) * 2], fill, ((x1 + 1) & ~1) * 2 - ((x0 + 1) & ~1) * 2);
3282
3283							if((x1 & 1) != 0)
3284							{
3285								target[(x1 & ~1) * 2 + 0] = fill;
3286								target[(x1 & ~1) * 2 + 2] = fill;
3287							}
3288
3289							y++;
3290						}
3291						else
3292						{
3293							for(int x = x0; x < x1; x++)
3294							{
3295								target[(x & ~1) * 2 + (x & 1)] = maskedS | (target[(x & ~1) * 2 + (x & 1)] & invMask);
3296							}
3297						}
3298					}
3299
3300					buffer += stencil.sliceP;
3301				}
3302			}
3303
3304			unlockStencil();
3305		}
3306	}
3307
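	// Fills a rectangle with a constant color, writing to whichever buffer
	// (internal or external) currently holds up-to-date data. Formats of up
	// to four bytes per pixel replicate the packed color into a 32-bit
	// pattern for memfill4; wider formats are written texel by texel.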
3308	void Surface::fill(const Color<float> &color, int x0, int y0, int width, int height)
3309	{
3310		unsigned char *row;
3311		Buffer *buffer;
3312
3313		if(internal.dirty)
3314		{
3315			row = (unsigned char*)lockInternal(x0, y0, 0, LOCK_WRITEONLY, PUBLIC);
3316			buffer = &internal;
3317		}
3318		else
3319		{
3320			row = (unsigned char*)lockExternal(x0, y0, 0, LOCK_WRITEONLY, PUBLIC);
3321			buffer = &external;
3322		}
3323
3324		if(buffer->bytes <= 4)
3325		{
3326			int c = 0;   // Zero-initialize so unwritten high bytes don't leak into the replicated fill pattern
3327			buffer->write(&c, color);
3328
3329			if(buffer->bytes <= 1) c = (c << 8)  | c;
3330			if(buffer->bytes <= 2) c = (c << 16) | c;
3331
3332			for(int y = 0; y < height; y++)
3333			{
3334				memfill4(row, c, width * buffer->bytes);
3335
3336				row += buffer->pitchB;
3337			}
3338		}
3339		else   // Generic
3340		{
3341			for(int y = 0; y < height; y++)
3342			{
3343				unsigned char *element = row;
3344
3345				for(int x = 0; x < width; x++)
3346				{
3347					buffer->write(element, color);
3348
3349					element += buffer->bytes;
3350				}
3351
3352				row += buffer->pitchB;
3353			}
3354		}
3355
3356		if(buffer == &internal)
3357		{
3358			unlockInternal();
3359		}
3360		else
3361		{
3362			unlockExternal();
3363		}
3364	}
3365
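	// Copies a single texel from another surface's internal buffer, using
	// either a nearest read or bilinear sampling at a fractional coordinate.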
3366	void Surface::copyInternal(const Surface* source, int x, int y, float srcX, float srcY, bool filter)
3367	{
3368		ASSERT(internal.lock != LOCK_UNLOCKED && source && source->internal.lock != LOCK_UNLOCKED);
3369
3370		sw::Color<float> color;
3371
3372		if(!filter)
3373		{
3374			color = source->internal.read((int)srcX, (int)srcY);
3375		}
3376		else   // Bilinear filtering
3377		{
3378			color = source->internal.sample(srcX, srcY);
3379		}
3380
3381		internal.write(x, y, color);
3382	}
3383
3384	void Surface::copyInternal(const Surface* source, int x, int y, int z, float srcX, float srcY, float srcZ, bool filter)
3385	{
3386		ASSERT(internal.lock != LOCK_UNLOCKED && source && source->internal.lock != LOCK_UNLOCKED);
3387
3388		sw::Color<float> color;
3389
3390		if(!filter)
3391		{
3392			color = source->internal.read((int)srcX, (int)srcY, (int)srcZ);
3393		}
3394		else   // Bilinear filtering
3395		{
3396			color = source->internal.sample(srcX, srcY, srcZ);
3397		}
3398
3399		internal.write(x, y, z, color);
3400	}
3401
3402	bool Surface::hasStencil() const
3403	{
3404		return isStencil(external.format);
3405	}
3406
3407	bool Surface::hasDepth() const
3408	{
3409		return isDepth(external.format);
3410	}
3411
3412	bool Surface::hasPalette() const
3413	{
3414		return isPalette(external.format);
3415	}
3416
3417	bool Surface::isRenderTarget() const
3418	{
3419		return renderTarget;
3420	}
3421
3422	bool Surface::hasDirtyMipmaps() const
3423	{
3424		return dirtyMipmaps;
3425	}
3426
3427	void Surface::cleanMipmaps()
3428	{
3429		dirtyMipmaps = false;
3430	}
3431
3432	Resource *Surface::getResource()
3433	{
3434		return resource;
3435	}
3436
3437	bool Surface::identicalFormats() const
3438	{
3439		return external.format == internal.format &&
3440		       external.width  == internal.width &&
3441		       external.height == internal.height &&
3442		       external.depth  == internal.depth &&
3443		       external.pitchB == internal.pitchB &&
3444		       external.sliceB == internal.sliceB;
3445	}
3446
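	// Maps the externally visible format to the format used for internal storage and
	// rendering. Formats without a direct internal representation are widened, e.g.
	// palettized and low-bit-depth color formats to 8 bits per channel, and half-float
	// formats to 32-bit float.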
3447	Format Surface::selectInternalFormat(Format format) const
3448	{
3449		switch(format)
3450		{
3451		case FORMAT_NULL:
3452			return FORMAT_NULL;
3453		case FORMAT_P8:
3454		case FORMAT_A8P8:
3455		case FORMAT_A4R4G4B4:
3456		case FORMAT_A1R5G5B5:
3457		case FORMAT_A8R3G3B2:
3458			return FORMAT_A8R8G8B8;
3459		case FORMAT_A8:
3460			return FORMAT_A8;
3461		case FORMAT_R8I:
3462			return FORMAT_R8I;
3463		case FORMAT_R8UI:
3464			return FORMAT_R8UI;
3465		case FORMAT_R8I_SNORM:
3466			return FORMAT_R8I_SNORM;
3467		case FORMAT_R8:
3468			return FORMAT_R8;
3469		case FORMAT_R16I:
3470			return FORMAT_R16I;
3471		case FORMAT_R16UI:
3472			return FORMAT_R16UI;
3473		case FORMAT_R32I:
3474			return FORMAT_R32I;
3475		case FORMAT_R32UI:
3476			return FORMAT_R32UI;
3477		case FORMAT_A2R10G10B10:
3478		case FORMAT_A2B10G10R10:
3479		case FORMAT_X16B16G16R16I:
3480		case FORMAT_A16B16G16R16I:
3481			return FORMAT_A16B16G16R16I;
3482		case FORMAT_X16B16G16R16UI:
3483		case FORMAT_A16B16G16R16UI:
3484			return FORMAT_A16B16G16R16UI;
3485		case FORMAT_A16B16G16R16:
3486			return FORMAT_A16B16G16R16;
3487		case FORMAT_X32B32G32R32I:
3488		case FORMAT_A32B32G32R32I:
3489			return FORMAT_A32B32G32R32I;
3490		case FORMAT_X32B32G32R32UI:
3491		case FORMAT_A32B32G32R32UI:
3492			return FORMAT_A32B32G32R32UI;
3493		case FORMAT_G8R8I:
3494			return FORMAT_G8R8I;
3495		case FORMAT_G8R8UI:
3496			return FORMAT_G8R8UI;
3497		case FORMAT_G8R8I_SNORM:
3498			return FORMAT_G8R8I_SNORM;
3499		case FORMAT_G8R8:
3500			return FORMAT_G8R8;
3501		case FORMAT_G16R16I:
3502			return FORMAT_G16R16I;
3503		case FORMAT_G16R16UI:
3504			return FORMAT_G16R16UI;
3505		case FORMAT_G16R16:
3506			return FORMAT_G16R16;
3507		case FORMAT_G32R32I:
3508			return FORMAT_G32R32I;
3509		case FORMAT_G32R32UI:
3510			return FORMAT_G32R32UI;
3511		case FORMAT_A8R8G8B8:
3512			if(lockable || !quadLayoutEnabled)
3513			{
3514				return FORMAT_A8R8G8B8;
3515			}
3516			else
3517			{
3518				return FORMAT_A8G8R8B8Q;
3519			}
3520		case FORMAT_A8B8G8R8I:
3521			return FORMAT_A8B8G8R8I;
3522		case FORMAT_A8B8G8R8UI:
3523			return FORMAT_A8B8G8R8UI;
3524		case FORMAT_A8B8G8R8I_SNORM:
3525			return FORMAT_A8B8G8R8I_SNORM;
3526		case FORMAT_R5G5B5A1:
3527		case FORMAT_R4G4B4A4:
3528		case FORMAT_A8B8G8R8:
3529			return FORMAT_A8B8G8R8;
3530		case FORMAT_R5G6B5:
3531			return FORMAT_R5G6B5;
3532		case FORMAT_R3G3B2:
3533		case FORMAT_R8G8B8:
3534		case FORMAT_X4R4G4B4:
3535		case FORMAT_X1R5G5B5:
3536		case FORMAT_X8R8G8B8:
3537			if(lockable || !quadLayoutEnabled)
3538			{
3539				return FORMAT_X8R8G8B8;
3540			}
3541			else
3542			{
3543				return FORMAT_X8G8R8B8Q;
3544			}
3545		case FORMAT_X8B8G8R8I:
3546			return FORMAT_X8B8G8R8I;
3547		case FORMAT_X8B8G8R8UI:
3548			return FORMAT_X8B8G8R8UI;
3549		case FORMAT_X8B8G8R8I_SNORM:
3550			return FORMAT_X8B8G8R8I_SNORM;
3551		case FORMAT_B8G8R8:
3552		case FORMAT_X8B8G8R8:
3553			return FORMAT_X8B8G8R8;
3554		// Compressed formats
3555		#if S3TC_SUPPORT
3556		case FORMAT_DXT1:
3557		case FORMAT_DXT3:
3558		case FORMAT_DXT5:
3559		#endif
3560		case FORMAT_RGB8_PUNCHTHROUGH_ALPHA1_ETC2:
3561		case FORMAT_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2:
3562		case FORMAT_RGBA8_ETC2_EAC:
3563		case FORMAT_SRGB8_ALPHA8_ETC2_EAC:
3564		case FORMAT_SRGB8_ALPHA8_ASTC_4x4_KHR:
3565		case FORMAT_SRGB8_ALPHA8_ASTC_5x4_KHR:
3566		case FORMAT_SRGB8_ALPHA8_ASTC_5x5_KHR:
3567		case FORMAT_SRGB8_ALPHA8_ASTC_6x5_KHR:
3568		case FORMAT_SRGB8_ALPHA8_ASTC_6x6_KHR:
3569		case FORMAT_SRGB8_ALPHA8_ASTC_8x5_KHR:
3570		case FORMAT_SRGB8_ALPHA8_ASTC_8x6_KHR:
3571		case FORMAT_SRGB8_ALPHA8_ASTC_8x8_KHR:
3572		case FORMAT_SRGB8_ALPHA8_ASTC_10x5_KHR:
3573		case FORMAT_SRGB8_ALPHA8_ASTC_10x6_KHR:
3574		case FORMAT_SRGB8_ALPHA8_ASTC_10x8_KHR:
3575		case FORMAT_SRGB8_ALPHA8_ASTC_10x10_KHR:
3576		case FORMAT_SRGB8_ALPHA8_ASTC_12x10_KHR:
3577		case FORMAT_SRGB8_ALPHA8_ASTC_12x12_KHR:
3578			return FORMAT_A8R8G8B8;
3579		case FORMAT_RGBA_ASTC_4x4_KHR:
3580		case FORMAT_RGBA_ASTC_5x4_KHR:
3581		case FORMAT_RGBA_ASTC_5x5_KHR:
3582		case FORMAT_RGBA_ASTC_6x5_KHR:
3583		case FORMAT_RGBA_ASTC_6x6_KHR:
3584		case FORMAT_RGBA_ASTC_8x5_KHR:
3585		case FORMAT_RGBA_ASTC_8x6_KHR:
3586		case FORMAT_RGBA_ASTC_8x8_KHR:
3587		case FORMAT_RGBA_ASTC_10x5_KHR:
3588		case FORMAT_RGBA_ASTC_10x6_KHR:
3589		case FORMAT_RGBA_ASTC_10x8_KHR:
3590		case FORMAT_RGBA_ASTC_10x10_KHR:
3591		case FORMAT_RGBA_ASTC_12x10_KHR:
3592		case FORMAT_RGBA_ASTC_12x12_KHR:
3593			// ASTC supports HDR, so a floating point format is required to represent it properly
3594			return FORMAT_A32B32G32R32F; // FIXME: 16FP is probably sufficient, but it's currently unsupported
3595		case FORMAT_ATI1:
3596		case FORMAT_R11_EAC:
3597			return FORMAT_R8;
3598		case FORMAT_SIGNED_R11_EAC:
3599			return FORMAT_R32F; // FIXME: A signed 8-bit format would be sufficient
3600		case FORMAT_ATI2:
3601		case FORMAT_RG11_EAC:
3602			return FORMAT_G8R8;
3603		case FORMAT_SIGNED_RG11_EAC:
3604			return FORMAT_G32R32F; // FIXME: A signed 8-bit format would be sufficient
3605		case FORMAT_ETC1:
3606		case FORMAT_RGB8_ETC2:
3607		case FORMAT_SRGB8_ETC2:
3608			return FORMAT_X8R8G8B8;
3609		// Bumpmap formats
3610		case FORMAT_V8U8:			return FORMAT_V8U8;
3611		case FORMAT_L6V5U5:			return FORMAT_X8L8V8U8;
3612		case FORMAT_Q8W8V8U8:		return FORMAT_Q8W8V8U8;
3613		case FORMAT_X8L8V8U8:		return FORMAT_X8L8V8U8;
3614		case FORMAT_V16U16:			return FORMAT_V16U16;
3615		case FORMAT_A2W10V10U10:	return FORMAT_A16W16V16U16;
3616		case FORMAT_Q16W16V16U16:	return FORMAT_Q16W16V16U16;
3617		// Floating-point formats
3618		case FORMAT_A16F:			return FORMAT_A32B32G32R32F;
3619		case FORMAT_R16F:			return FORMAT_R32F;
3620		case FORMAT_G16R16F:		return FORMAT_G32R32F;
3621		case FORMAT_B16G16R16F:     return FORMAT_A32B32G32R32F;
3622		case FORMAT_A16B16G16R16F:	return FORMAT_A32B32G32R32F;
3623		case FORMAT_A32F:			return FORMAT_A32B32G32R32F;
3624		case FORMAT_R32F:			return FORMAT_R32F;
3625		case FORMAT_G32R32F:		return FORMAT_G32R32F;
3626		case FORMAT_B32G32R32F:     return FORMAT_A32B32G32R32F;
3627		case FORMAT_A32B32G32R32F:	return FORMAT_A32B32G32R32F;
3628		// Luminance formats
3629		case FORMAT_L8:				return FORMAT_L8;
3630		case FORMAT_A4L4:			return FORMAT_A8L8;
3631		case FORMAT_L16:			return FORMAT_L16;
3632		case FORMAT_A8L8:			return FORMAT_A8L8;
3633		case FORMAT_L16F:           return FORMAT_A32B32G32R32F;
3634		case FORMAT_A16L16F:        return FORMAT_A32B32G32R32F;
3635		case FORMAT_L32F:           return FORMAT_A32B32G32R32F;
3636		case FORMAT_A32L32F:        return FORMAT_A32B32G32R32F;
3637		// Depth/stencil formats
3638		case FORMAT_D16:
3639		case FORMAT_D32:
3640		case FORMAT_D24X8:
3641		case FORMAT_D24S8:
3642		case FORMAT_D24FS8:
3643			if(hasParent)   // Texture
3644			{
3645				return FORMAT_D32FS8_SHADOW;
3646			}
3647			else if(complementaryDepthBuffer)
3648			{
3649				return FORMAT_D32F_COMPLEMENTARY;
3650			}
3651			else
3652			{
3653				return FORMAT_D32F;
3654			}
3655		case FORMAT_D32F:           return FORMAT_D32F;
3656		case FORMAT_D32F_LOCKABLE:  return FORMAT_D32F_LOCKABLE;
3657		case FORMAT_D32FS8_TEXTURE: return FORMAT_D32FS8_TEXTURE;
3658		case FORMAT_INTZ:           return FORMAT_D32FS8_TEXTURE;
3659		case FORMAT_DF24S8:         return FORMAT_D32FS8_SHADOW;
3660		case FORMAT_DF16S8:         return FORMAT_D32FS8_SHADOW;
3661		case FORMAT_YV12_BT601:     return FORMAT_YV12_BT601;
3662		case FORMAT_YV12_BT709:     return FORMAT_YV12_BT709;
3663		case FORMAT_YV12_JFIF:      return FORMAT_YV12_JFIF;
3664		default:
3665			ASSERT(false);
3666		}
3667
3668		return FORMAT_NULL;
3669	}
3670
3671	void Surface::setTexturePalette(unsigned int *palette)
3672	{
3673		Surface::palette = palette;
3674		Surface::paletteID++;
3675	}
3676
3677	void Surface::resolve()
3678	{
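		// Nothing to resolve for single-sampled surfaces, surfaces that presumably have not
		// been rendered to (not dirty), non-render-targets, or NULL formats.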
3679		if(internal.depth <= 1 || !internal.dirty || !renderTarget || internal.format == FORMAT_NULL)
3680		{
3681			return;
3682		}
3683
3684		void *source = internal.lockRect(0, 0, 0, LOCK_READWRITE);
3685
3686		int quality = internal.depth;
3687		int width = internal.width;
3688		int height = internal.height;
3689		int pitch = internal.pitchB;
3690		int slice = internal.sliceB;
3691
3692		unsigned char *source0 = (unsigned char*)source;
3693		unsigned char *source1 = source0 + slice;
3694		unsigned char *source2 = source1 + slice;
3695		unsigned char *source3 = source2 + slice;
3696		unsigned char *source4 = source3 + slice;
3697		unsigned char *source5 = source4 + slice;
3698		unsigned char *source6 = source5 + slice;
3699		unsigned char *source7 = source6 + slice;
3700		unsigned char *source8 = source7 + slice;
3701		unsigned char *source9 = source8 + slice;
3702		unsigned char *sourceA = source9 + slice;
3703		unsigned char *sourceB = sourceA + slice;
3704		unsigned char *sourceC = sourceB + slice;
3705		unsigned char *sourceD = sourceC + slice;
3706		unsigned char *sourceE = sourceD + slice;
3707		unsigned char *sourceF = sourceE + slice;
3708
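		// Multisample resolve: the samples of each pixel live in consecutive slices
		// (source0..sourceF) and the result is written back in place into slice 0.
		// Integer formats are reduced with a balanced tree of pairwise rounding averages
		// (SSE _mm_avg_* when available, otherwise an equivalent bit trick below), while
		// floating-point formats are summed and scaled by 1/N for an exact mean.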
3709		if(internal.format == FORMAT_X8R8G8B8 || internal.format == FORMAT_A8R8G8B8 || internal.format == FORMAT_X8B8G8R8 || internal.format == FORMAT_A8B8G8R8)
3710		{
3711			if(CPUID::supportsSSE2() && (width % 4) == 0)
3712			{
3713				if(internal.depth == 2)
3714				{
3715					for(int y = 0; y < height; y++)
3716					{
3717						for(int x = 0; x < width; x += 4)
3718						{
3719							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
3720							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
3721
3722							c0 = _mm_avg_epu8(c0, c1);
3723
3724							_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
3725						}
3726
3727						source0 += pitch;
3728						source1 += pitch;
3729					}
3730				}
3731				else if(internal.depth == 4)
3732				{
3733					for(int y = 0; y < height; y++)
3734					{
3735						for(int x = 0; x < width; x += 4)
3736						{
3737							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
3738							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
3739							__m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x));
3740							__m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x));
3741
3742							c0 = _mm_avg_epu8(c0, c1);
3743							c2 = _mm_avg_epu8(c2, c3);
3744							c0 = _mm_avg_epu8(c0, c2);
3745
3746							_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
3747						}
3748
3749						source0 += pitch;
3750						source1 += pitch;
3751						source2 += pitch;
3752						source3 += pitch;
3753					}
3754				}
3755				else if(internal.depth == 8)
3756				{
3757					for(int y = 0; y < height; y++)
3758					{
3759						for(int x = 0; x < width; x += 4)
3760						{
3761							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
3762							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
3763							__m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x));
3764							__m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x));
3765							__m128i c4 = _mm_load_si128((__m128i*)(source4 + 4 * x));
3766							__m128i c5 = _mm_load_si128((__m128i*)(source5 + 4 * x));
3767							__m128i c6 = _mm_load_si128((__m128i*)(source6 + 4 * x));
3768							__m128i c7 = _mm_load_si128((__m128i*)(source7 + 4 * x));
3769
3770							c0 = _mm_avg_epu8(c0, c1);
3771							c2 = _mm_avg_epu8(c2, c3);
3772							c4 = _mm_avg_epu8(c4, c5);
3773							c6 = _mm_avg_epu8(c6, c7);
3774							c0 = _mm_avg_epu8(c0, c2);
3775							c4 = _mm_avg_epu8(c4, c6);
3776							c0 = _mm_avg_epu8(c0, c4);
3777
3778							_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
3779						}
3780
3781						source0 += pitch;
3782						source1 += pitch;
3783						source2 += pitch;
3784						source3 += pitch;
3785						source4 += pitch;
3786						source5 += pitch;
3787						source6 += pitch;
3788						source7 += pitch;
3789					}
3790				}
3791				else if(internal.depth == 16)
3792				{
3793					for(int y = 0; y < height; y++)
3794					{
3795						for(int x = 0; x < width; x += 4)
3796						{
3797							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
3798							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
3799							__m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x));
3800							__m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x));
3801							__m128i c4 = _mm_load_si128((__m128i*)(source4 + 4 * x));
3802							__m128i c5 = _mm_load_si128((__m128i*)(source5 + 4 * x));
3803							__m128i c6 = _mm_load_si128((__m128i*)(source6 + 4 * x));
3804							__m128i c7 = _mm_load_si128((__m128i*)(source7 + 4 * x));
3805							__m128i c8 = _mm_load_si128((__m128i*)(source8 + 4 * x));
3806							__m128i c9 = _mm_load_si128((__m128i*)(source9 + 4 * x));
3807							__m128i cA = _mm_load_si128((__m128i*)(sourceA + 4 * x));
3808							__m128i cB = _mm_load_si128((__m128i*)(sourceB + 4 * x));
3809							__m128i cC = _mm_load_si128((__m128i*)(sourceC + 4 * x));
3810							__m128i cD = _mm_load_si128((__m128i*)(sourceD + 4 * x));
3811							__m128i cE = _mm_load_si128((__m128i*)(sourceE + 4 * x));
3812							__m128i cF = _mm_load_si128((__m128i*)(sourceF + 4 * x));
3813
3814							c0 = _mm_avg_epu8(c0, c1);
3815							c2 = _mm_avg_epu8(c2, c3);
3816							c4 = _mm_avg_epu8(c4, c5);
3817							c6 = _mm_avg_epu8(c6, c7);
3818							c8 = _mm_avg_epu8(c8, c9);
3819							cA = _mm_avg_epu8(cA, cB);
3820							cC = _mm_avg_epu8(cC, cD);
3821							cE = _mm_avg_epu8(cE, cF);
3822							c0 = _mm_avg_epu8(c0, c2);
3823							c4 = _mm_avg_epu8(c4, c6);
3824							c8 = _mm_avg_epu8(c8, cA);
3825							cC = _mm_avg_epu8(cC, cE);
3826							c0 = _mm_avg_epu8(c0, c4);
3827							c8 = _mm_avg_epu8(c8, cC);
3828							c0 = _mm_avg_epu8(c0, c8);
3829
3830							_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
3831						}
3832
3833						source0 += pitch;
3834						source1 += pitch;
3835						source2 += pitch;
3836						source3 += pitch;
3837						source4 += pitch;
3838						source5 += pitch;
3839						source6 += pitch;
3840						source7 += pitch;
3841						source8 += pitch;
3842						source9 += pitch;
3843						sourceA += pitch;
3844						sourceB += pitch;
3845						sourceC += pitch;
3846						sourceD += pitch;
3847						sourceE += pitch;
3848						sourceF += pitch;
3849					}
3850				}
3851				else ASSERT(false);
3852			}
3853			else
3854			{
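				// Per-byte rounding average without SSE2: x + y == 2 * (x & y) + (x ^ y), so the
				// average is (x & y) + ((x ^ y) >> 1). The 0x7F7F7F7F mask discards the bit shifted
				// in from the neighboring byte, and adding ((x ^ y) & 0x01010101) rounds halves up,
				// matching _mm_avg_epu8.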
3855				#define AVERAGE(x, y) (((x) & (y)) + ((((x) ^ (y)) >> 1) & 0x7F7F7F7F) + (((x) ^ (y)) & 0x01010101))
3856
3857				if(internal.depth == 2)
3858				{
3859					for(int y = 0; y < height; y++)
3860					{
3861						for(int x = 0; x < width; x++)
3862						{
3863							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
3864							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
3865
3866							c0 = AVERAGE(c0, c1);
3867
3868							*(unsigned int*)(source0 + 4 * x) = c0;
3869						}
3870
3871						source0 += pitch;
3872						source1 += pitch;
3873					}
3874				}
3875				else if(internal.depth == 4)
3876				{
3877					for(int y = 0; y < height; y++)
3878					{
3879						for(int x = 0; x < width; x++)
3880						{
3881							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
3882							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
3883							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
3884							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
3885
3886							c0 = AVERAGE(c0, c1);
3887							c2 = AVERAGE(c2, c3);
3888							c0 = AVERAGE(c0, c2);
3889
3890							*(unsigned int*)(source0 + 4 * x) = c0;
3891						}
3892
3893						source0 += pitch;
3894						source1 += pitch;
3895						source2 += pitch;
3896						source3 += pitch;
3897					}
3898				}
3899				else if(internal.depth == 8)
3900				{
3901					for(int y = 0; y < height; y++)
3902					{
3903						for(int x = 0; x < width; x++)
3904						{
3905							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
3906							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
3907							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
3908							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
3909							unsigned int c4 = *(unsigned int*)(source4 + 4 * x);
3910							unsigned int c5 = *(unsigned int*)(source5 + 4 * x);
3911							unsigned int c6 = *(unsigned int*)(source6 + 4 * x);
3912							unsigned int c7 = *(unsigned int*)(source7 + 4 * x);
3913
3914							c0 = AVERAGE(c0, c1);
3915							c2 = AVERAGE(c2, c3);
3916							c4 = AVERAGE(c4, c5);
3917							c6 = AVERAGE(c6, c7);
3918							c0 = AVERAGE(c0, c2);
3919							c4 = AVERAGE(c4, c6);
3920							c0 = AVERAGE(c0, c4);
3921
3922							*(unsigned int*)(source0 + 4 * x) = c0;
3923						}
3924
3925						source0 += pitch;
3926						source1 += pitch;
3927						source2 += pitch;
3928						source3 += pitch;
3929						source4 += pitch;
3930						source5 += pitch;
3931						source6 += pitch;
3932						source7 += pitch;
3933					}
3934				}
3935				else if(internal.depth == 16)
3936				{
3937					for(int y = 0; y < height; y++)
3938					{
3939						for(int x = 0; x < width; x++)
3940						{
3941							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
3942							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
3943							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
3944							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
3945							unsigned int c4 = *(unsigned int*)(source4 + 4 * x);
3946							unsigned int c5 = *(unsigned int*)(source5 + 4 * x);
3947							unsigned int c6 = *(unsigned int*)(source6 + 4 * x);
3948							unsigned int c7 = *(unsigned int*)(source7 + 4 * x);
3949							unsigned int c8 = *(unsigned int*)(source8 + 4 * x);
3950							unsigned int c9 = *(unsigned int*)(source9 + 4 * x);
3951							unsigned int cA = *(unsigned int*)(sourceA + 4 * x);
3952							unsigned int cB = *(unsigned int*)(sourceB + 4 * x);
3953							unsigned int cC = *(unsigned int*)(sourceC + 4 * x);
3954							unsigned int cD = *(unsigned int*)(sourceD + 4 * x);
3955							unsigned int cE = *(unsigned int*)(sourceE + 4 * x);
3956							unsigned int cF = *(unsigned int*)(sourceF + 4 * x);
3957
3958							c0 = AVERAGE(c0, c1);
3959							c2 = AVERAGE(c2, c3);
3960							c4 = AVERAGE(c4, c5);
3961							c6 = AVERAGE(c6, c7);
3962							c8 = AVERAGE(c8, c9);
3963							cA = AVERAGE(cA, cB);
3964							cC = AVERAGE(cC, cD);
3965							cE = AVERAGE(cE, cF);
3966							c0 = AVERAGE(c0, c2);
3967							c4 = AVERAGE(c4, c6);
3968							c8 = AVERAGE(c8, cA);
3969							cC = AVERAGE(cC, cE);
3970							c0 = AVERAGE(c0, c4);
3971							c8 = AVERAGE(c8, cC);
3972							c0 = AVERAGE(c0, c8);
3973
3974							*(unsigned int*)(source0 + 4 * x) = c0;
3975						}
3976
3977						source0 += pitch;
3978						source1 += pitch;
3979						source2 += pitch;
3980						source3 += pitch;
3981						source4 += pitch;
3982						source5 += pitch;
3983						source6 += pitch;
3984						source7 += pitch;
3985						source8 += pitch;
3986						source9 += pitch;
3987						sourceA += pitch;
3988						sourceB += pitch;
3989						sourceC += pitch;
3990						sourceD += pitch;
3991						sourceE += pitch;
3992						sourceF += pitch;
3993					}
3994				}
3995				else ASSERT(false);
3996
3997				#undef AVERAGE
3998			}
3999		}
4000		else if(internal.format == FORMAT_G16R16)
4001		{
4002			if(CPUID::supportsSSE2() && (width % 4) == 0)
4003			{
4004				if(internal.depth == 2)
4005				{
4006					for(int y = 0; y < height; y++)
4007					{
4008						for(int x = 0; x < width; x += 4)
4009						{
4010							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
4011							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
4012
4013							c0 = _mm_avg_epu16(c0, c1);
4014
4015							_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
4016						}
4017
4018						source0 += pitch;
4019						source1 += pitch;
4020					}
4021				}
4022				else if(internal.depth == 4)
4023				{
4024					for(int y = 0; y < height; y++)
4025					{
4026						for(int x = 0; x < width; x += 4)
4027						{
4028							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
4029							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
4030							__m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x));
4031							__m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x));
4032
4033							c0 = _mm_avg_epu16(c0, c1);
4034							c2 = _mm_avg_epu16(c2, c3);
4035							c0 = _mm_avg_epu16(c0, c2);
4036
4037							_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
4038						}
4039
4040						source0 += pitch;
4041						source1 += pitch;
4042						source2 += pitch;
4043						source3 += pitch;
4044					}
4045				}
4046				else if(internal.depth == 8)
4047				{
4048					for(int y = 0; y < height; y++)
4049					{
4050						for(int x = 0; x < width; x += 4)
4051						{
4052							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
4053							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
4054							__m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x));
4055							__m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x));
4056							__m128i c4 = _mm_load_si128((__m128i*)(source4 + 4 * x));
4057							__m128i c5 = _mm_load_si128((__m128i*)(source5 + 4 * x));
4058							__m128i c6 = _mm_load_si128((__m128i*)(source6 + 4 * x));
4059							__m128i c7 = _mm_load_si128((__m128i*)(source7 + 4 * x));
4060
4061							c0 = _mm_avg_epu16(c0, c1);
4062							c2 = _mm_avg_epu16(c2, c3);
4063							c4 = _mm_avg_epu16(c4, c5);
4064							c6 = _mm_avg_epu16(c6, c7);
4065							c0 = _mm_avg_epu16(c0, c2);
4066							c4 = _mm_avg_epu16(c4, c6);
4067							c0 = _mm_avg_epu16(c0, c4);
4068
4069							_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
4070						}
4071
4072						source0 += pitch;
4073						source1 += pitch;
4074						source2 += pitch;
4075						source3 += pitch;
4076						source4 += pitch;
4077						source5 += pitch;
4078						source6 += pitch;
4079						source7 += pitch;
4080					}
4081				}
4082				else if(internal.depth == 16)
4083				{
4084					for(int y = 0; y < height; y++)
4085					{
4086						for(int x = 0; x < width; x += 4)
4087						{
4088							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
4089							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
4090							__m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x));
4091							__m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x));
4092							__m128i c4 = _mm_load_si128((__m128i*)(source4 + 4 * x));
4093							__m128i c5 = _mm_load_si128((__m128i*)(source5 + 4 * x));
4094							__m128i c6 = _mm_load_si128((__m128i*)(source6 + 4 * x));
4095							__m128i c7 = _mm_load_si128((__m128i*)(source7 + 4 * x));
4096							__m128i c8 = _mm_load_si128((__m128i*)(source8 + 4 * x));
4097							__m128i c9 = _mm_load_si128((__m128i*)(source9 + 4 * x));
4098							__m128i cA = _mm_load_si128((__m128i*)(sourceA + 4 * x));
4099							__m128i cB = _mm_load_si128((__m128i*)(sourceB + 4 * x));
4100							__m128i cC = _mm_load_si128((__m128i*)(sourceC + 4 * x));
4101							__m128i cD = _mm_load_si128((__m128i*)(sourceD + 4 * x));
4102							__m128i cE = _mm_load_si128((__m128i*)(sourceE + 4 * x));
4103							__m128i cF = _mm_load_si128((__m128i*)(sourceF + 4 * x));
4104
4105							c0 = _mm_avg_epu16(c0, c1);
4106							c2 = _mm_avg_epu16(c2, c3);
4107							c4 = _mm_avg_epu16(c4, c5);
4108							c6 = _mm_avg_epu16(c6, c7);
4109							c8 = _mm_avg_epu16(c8, c9);
4110							cA = _mm_avg_epu16(cA, cB);
4111							cC = _mm_avg_epu16(cC, cD);
4112							cE = _mm_avg_epu16(cE, cF);
4113							c0 = _mm_avg_epu16(c0, c2);
4114							c4 = _mm_avg_epu16(c4, c6);
4115							c8 = _mm_avg_epu16(c8, cA);
4116							cC = _mm_avg_epu16(cC, cE);
4117							c0 = _mm_avg_epu16(c0, c4);
4118							c8 = _mm_avg_epu16(c8, cC);
4119							c0 = _mm_avg_epu16(c0, c8);
4120
4121							_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
4122						}
4123
4124						source0 += pitch;
4125						source1 += pitch;
4126						source2 += pitch;
4127						source3 += pitch;
4128						source4 += pitch;
4129						source5 += pitch;
4130						source6 += pitch;
4131						source7 += pitch;
4132						source8 += pitch;
4133						source9 += pitch;
4134						sourceA += pitch;
4135						sourceB += pitch;
4136						sourceC += pitch;
4137						sourceD += pitch;
4138						sourceE += pitch;
4139						sourceF += pitch;
4140					}
4141				}
4142				else ASSERT(false);
4143			}
4144			else
4145			{
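				// Same rounding-average trick as for the 8-bit case, applied per 16-bit lane:
				// 0x7FFF7FFF masks the bit shifted across the lane boundary and 0x00010001
				// rounds halves up, matching _mm_avg_epu16.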
4146				#define AVERAGE(x, y) (((x) & (y)) + ((((x) ^ (y)) >> 1) & 0x7FFF7FFF) + (((x) ^ (y)) & 0x00010001))
4147
4148				if(internal.depth == 2)
4149				{
4150					for(int y = 0; y < height; y++)
4151					{
4152						for(int x = 0; x < width; x++)
4153						{
4154							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
4155							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
4156
4157							c0 = AVERAGE(c0, c1);
4158
4159							*(unsigned int*)(source0 + 4 * x) = c0;
4160						}
4161
4162						source0 += pitch;
4163						source1 += pitch;
4164					}
4165				}
4166				else if(internal.depth == 4)
4167				{
4168					for(int y = 0; y < height; y++)
4169					{
4170						for(int x = 0; x < width; x++)
4171						{
4172							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
4173							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
4174							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
4175							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
4176
4177							c0 = AVERAGE(c0, c1);
4178							c2 = AVERAGE(c2, c3);
4179							c0 = AVERAGE(c0, c2);
4180
4181							*(unsigned int*)(source0 + 4 * x) = c0;
4182						}
4183
4184						source0 += pitch;
4185						source1 += pitch;
4186						source2 += pitch;
4187						source3 += pitch;
4188					}
4189				}
4190				else if(internal.depth == 8)
4191				{
4192					for(int y = 0; y < height; y++)
4193					{
4194						for(int x = 0; x < width; x++)
4195						{
4196							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
4197							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
4198							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
4199							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
4200							unsigned int c4 = *(unsigned int*)(source4 + 4 * x);
4201							unsigned int c5 = *(unsigned int*)(source5 + 4 * x);
4202							unsigned int c6 = *(unsigned int*)(source6 + 4 * x);
4203							unsigned int c7 = *(unsigned int*)(source7 + 4 * x);
4204
4205							c0 = AVERAGE(c0, c1);
4206							c2 = AVERAGE(c2, c3);
4207							c4 = AVERAGE(c4, c5);
4208							c6 = AVERAGE(c6, c7);
4209							c0 = AVERAGE(c0, c2);
4210							c4 = AVERAGE(c4, c6);
4211							c0 = AVERAGE(c0, c4);
4212
4213							*(unsigned int*)(source0 + 4 * x) = c0;
4214						}
4215
4216						source0 += pitch;
4217						source1 += pitch;
4218						source2 += pitch;
4219						source3 += pitch;
4220						source4 += pitch;
4221						source5 += pitch;
4222						source6 += pitch;
4223						source7 += pitch;
4224					}
4225				}
4226				else if(internal.depth == 16)
4227				{
4228					for(int y = 0; y < height; y++)
4229					{
4230						for(int x = 0; x < width; x++)
4231						{
4232							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
4233							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
4234							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
4235							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
4236							unsigned int c4 = *(unsigned int*)(source4 + 4 * x);
4237							unsigned int c5 = *(unsigned int*)(source5 + 4 * x);
4238							unsigned int c6 = *(unsigned int*)(source6 + 4 * x);
4239							unsigned int c7 = *(unsigned int*)(source7 + 4 * x);
4240							unsigned int c8 = *(unsigned int*)(source8 + 4 * x);
4241							unsigned int c9 = *(unsigned int*)(source9 + 4 * x);
4242							unsigned int cA = *(unsigned int*)(sourceA + 4 * x);
4243							unsigned int cB = *(unsigned int*)(sourceB + 4 * x);
4244							unsigned int cC = *(unsigned int*)(sourceC + 4 * x);
4245							unsigned int cD = *(unsigned int*)(sourceD + 4 * x);
4246							unsigned int cE = *(unsigned int*)(sourceE + 4 * x);
4247							unsigned int cF = *(unsigned int*)(sourceF + 4 * x);
4248
4249							c0 = AVERAGE(c0, c1);
4250							c2 = AVERAGE(c2, c3);
4251							c4 = AVERAGE(c4, c5);
4252							c6 = AVERAGE(c6, c7);
4253							c8 = AVERAGE(c8, c9);
4254							cA = AVERAGE(cA, cB);
4255							cC = AVERAGE(cC, cD);
4256							cE = AVERAGE(cE, cF);
4257							c0 = AVERAGE(c0, c2);
4258							c4 = AVERAGE(c4, c6);
4259							c8 = AVERAGE(c8, cA);
4260							cC = AVERAGE(cC, cE);
4261							c0 = AVERAGE(c0, c4);
4262							c8 = AVERAGE(c8, cC);
4263							c0 = AVERAGE(c0, c8);
4264
4265							*(unsigned int*)(source0 + 4 * x) = c0;
4266						}
4267
4268						source0 += pitch;
4269						source1 += pitch;
4270						source2 += pitch;
4271						source3 += pitch;
4272						source4 += pitch;
4273						source5 += pitch;
4274						source6 += pitch;
4275						source7 += pitch;
4276						source8 += pitch;
4277						source9 += pitch;
4278						sourceA += pitch;
4279						sourceB += pitch;
4280						sourceC += pitch;
4281						sourceD += pitch;
4282						sourceE += pitch;
4283						sourceF += pitch;
4284					}
4285				}
4286				else ASSERT(false);
4287
4288				#undef AVERAGE
4289			}
4290		}
4291		else if(internal.format == FORMAT_A16B16G16R16)
4292		{
4293			if(CPUID::supportsSSE2() && (width % 2) == 0)
4294			{
4295				if(internal.depth == 2)
4296				{
4297					for(int y = 0; y < height; y++)
4298					{
4299						for(int x = 0; x < width; x += 2)
4300						{
4301							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 8 * x));
4302							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 8 * x));
4303
4304							c0 = _mm_avg_epu16(c0, c1);
4305
4306							_mm_store_si128((__m128i*)(source0 + 8 * x), c0);
4307						}
4308
4309						source0 += pitch;
4310						source1 += pitch;
4311					}
4312				}
4313				else if(internal.depth == 4)
4314				{
4315					for(int y = 0; y < height; y++)
4316					{
4317						for(int x = 0; x < width; x += 2)
4318						{
4319							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 8 * x));
4320							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 8 * x));
4321							__m128i c2 = _mm_load_si128((__m128i*)(source2 + 8 * x));
4322							__m128i c3 = _mm_load_si128((__m128i*)(source3 + 8 * x));
4323
4324							c0 = _mm_avg_epu16(c0, c1);
4325							c2 = _mm_avg_epu16(c2, c3);
4326							c0 = _mm_avg_epu16(c0, c2);
4327
4328							_mm_store_si128((__m128i*)(source0 + 8 * x), c0);
4329						}
4330
4331						source0 += pitch;
4332						source1 += pitch;
4333						source2 += pitch;
4334						source3 += pitch;
4335					}
4336				}
4337				else if(internal.depth == 8)
4338				{
4339					for(int y = 0; y < height; y++)
4340					{
4341						for(int x = 0; x < width; x += 2)
4342						{
4343							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 8 * x));
4344							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 8 * x));
4345							__m128i c2 = _mm_load_si128((__m128i*)(source2 + 8 * x));
4346							__m128i c3 = _mm_load_si128((__m128i*)(source3 + 8 * x));
4347							__m128i c4 = _mm_load_si128((__m128i*)(source4 + 8 * x));
4348							__m128i c5 = _mm_load_si128((__m128i*)(source5 + 8 * x));
4349							__m128i c6 = _mm_load_si128((__m128i*)(source6 + 8 * x));
4350							__m128i c7 = _mm_load_si128((__m128i*)(source7 + 8 * x));
4351
4352							c0 = _mm_avg_epu16(c0, c1);
4353							c2 = _mm_avg_epu16(c2, c3);
4354							c4 = _mm_avg_epu16(c4, c5);
4355							c6 = _mm_avg_epu16(c6, c7);
4356							c0 = _mm_avg_epu16(c0, c2);
4357							c4 = _mm_avg_epu16(c4, c6);
4358							c0 = _mm_avg_epu16(c0, c4);
4359
4360							_mm_store_si128((__m128i*)(source0 + 8 * x), c0);
4361						}
4362
4363						source0 += pitch;
4364						source1 += pitch;
4365						source2 += pitch;
4366						source3 += pitch;
4367						source4 += pitch;
4368						source5 += pitch;
4369						source6 += pitch;
4370						source7 += pitch;
4371					}
4372				}
4373				else if(internal.depth == 16)
4374				{
4375					for(int y = 0; y < height; y++)
4376					{
4377						for(int x = 0; x < width; x += 2)
4378						{
4379							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 8 * x));
4380							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 8 * x));
4381							__m128i c2 = _mm_load_si128((__m128i*)(source2 + 8 * x));
4382							__m128i c3 = _mm_load_si128((__m128i*)(source3 + 8 * x));
4383							__m128i c4 = _mm_load_si128((__m128i*)(source4 + 8 * x));
4384							__m128i c5 = _mm_load_si128((__m128i*)(source5 + 8 * x));
4385							__m128i c6 = _mm_load_si128((__m128i*)(source6 + 8 * x));
4386							__m128i c7 = _mm_load_si128((__m128i*)(source7 + 8 * x));
4387							__m128i c8 = _mm_load_si128((__m128i*)(source8 + 8 * x));
4388							__m128i c9 = _mm_load_si128((__m128i*)(source9 + 8 * x));
4389							__m128i cA = _mm_load_si128((__m128i*)(sourceA + 8 * x));
4390							__m128i cB = _mm_load_si128((__m128i*)(sourceB + 8 * x));
4391							__m128i cC = _mm_load_si128((__m128i*)(sourceC + 8 * x));
4392							__m128i cD = _mm_load_si128((__m128i*)(sourceD + 8 * x));
4393							__m128i cE = _mm_load_si128((__m128i*)(sourceE + 8 * x));
4394							__m128i cF = _mm_load_si128((__m128i*)(sourceF + 8 * x));
4395
4396							c0 = _mm_avg_epu16(c0, c1);
4397							c2 = _mm_avg_epu16(c2, c3);
4398							c4 = _mm_avg_epu16(c4, c5);
4399							c6 = _mm_avg_epu16(c6, c7);
4400							c8 = _mm_avg_epu16(c8, c9);
4401							cA = _mm_avg_epu16(cA, cB);
4402							cC = _mm_avg_epu16(cC, cD);
4403							cE = _mm_avg_epu16(cE, cF);
4404							c0 = _mm_avg_epu16(c0, c2);
4405							c4 = _mm_avg_epu16(c4, c6);
4406							c8 = _mm_avg_epu16(c8, cA);
4407							cC = _mm_avg_epu16(cC, cE);
4408							c0 = _mm_avg_epu16(c0, c4);
4409							c8 = _mm_avg_epu16(c8, cC);
4410							c0 = _mm_avg_epu16(c0, c8);
4411
4412							_mm_store_si128((__m128i*)(source0 + 8 * x), c0);
4413						}
4414
4415						source0 += pitch;
4416						source1 += pitch;
4417						source2 += pitch;
4418						source3 += pitch;
4419						source4 += pitch;
4420						source5 += pitch;
4421						source6 += pitch;
4422						source7 += pitch;
4423						source8 += pitch;
4424						source9 += pitch;
4425						sourceA += pitch;
4426						sourceB += pitch;
4427						sourceC += pitch;
4428						sourceD += pitch;
4429						sourceE += pitch;
4430						sourceF += pitch;
4431					}
4432				}
4433				else ASSERT(false);
4434			}
4435			else
4436			{
4437				#define AVERAGE(x, y) (((x) & (y)) + ((((x) ^ (y)) >> 1) & 0x7FFF7FFF) + (((x) ^ (y)) & 0x00010001))
4438
4439				if(internal.depth == 2)
4440				{
4441					for(int y = 0; y < height; y++)
4442					{
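						// Each 64-bit A16B16G16R16 pixel is processed as two 32-bit words here,
						// hence the 2 * width iteration count.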
4443						for(int x = 0; x < 2 * width; x++)
4444						{
4445							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
4446							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
4447
4448							c0 = AVERAGE(c0, c1);
4449
4450							*(unsigned int*)(source0 + 4 * x) = c0;
4451						}
4452
4453						source0 += pitch;
4454						source1 += pitch;
4455					}
4456				}
4457				else if(internal.depth == 4)
4458				{
4459					for(int y = 0; y < height; y++)
4460					{
4461						for(int x = 0; x < 2 * width; x++)
4462						{
4463							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
4464							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
4465							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
4466							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
4467
4468							c0 = AVERAGE(c0, c1);
4469							c2 = AVERAGE(c2, c3);
4470							c0 = AVERAGE(c0, c2);
4471
4472							*(unsigned int*)(source0 + 4 * x) = c0;
4473						}
4474
4475						source0 += pitch;
4476						source1 += pitch;
4477						source2 += pitch;
4478						source3 += pitch;
4479					}
4480				}
4481				else if(internal.depth == 8)
4482				{
4483					for(int y = 0; y < height; y++)
4484					{
4485						for(int x = 0; x < 2 * width; x++)
4486						{
4487							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
4488							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
4489							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
4490							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
4491							unsigned int c4 = *(unsigned int*)(source4 + 4 * x);
4492							unsigned int c5 = *(unsigned int*)(source5 + 4 * x);
4493							unsigned int c6 = *(unsigned int*)(source6 + 4 * x);
4494							unsigned int c7 = *(unsigned int*)(source7 + 4 * x);
4495
4496							c0 = AVERAGE(c0, c1);
4497							c2 = AVERAGE(c2, c3);
4498							c4 = AVERAGE(c4, c5);
4499							c6 = AVERAGE(c6, c7);
4500							c0 = AVERAGE(c0, c2);
4501							c4 = AVERAGE(c4, c6);
4502							c0 = AVERAGE(c0, c4);
4503
4504							*(unsigned int*)(source0 + 4 * x) = c0;
4505						}
4506
4507						source0 += pitch;
4508						source1 += pitch;
4509						source2 += pitch;
4510						source3 += pitch;
4511						source4 += pitch;
4512						source5 += pitch;
4513						source6 += pitch;
4514						source7 += pitch;
4515					}
4516				}
4517				else if(internal.depth == 16)
4518				{
4519					for(int y = 0; y < height; y++)
4520					{
4521						for(int x = 0; x < 2 * width; x++)
4522						{
4523							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
4524							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
4525							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
4526							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
4527							unsigned int c4 = *(unsigned int*)(source4 + 4 * x);
4528							unsigned int c5 = *(unsigned int*)(source5 + 4 * x);
4529							unsigned int c6 = *(unsigned int*)(source6 + 4 * x);
4530							unsigned int c7 = *(unsigned int*)(source7 + 4 * x);
4531							unsigned int c8 = *(unsigned int*)(source8 + 4 * x);
4532							unsigned int c9 = *(unsigned int*)(source9 + 4 * x);
4533							unsigned int cA = *(unsigned int*)(sourceA + 4 * x);
4534							unsigned int cB = *(unsigned int*)(sourceB + 4 * x);
4535							unsigned int cC = *(unsigned int*)(sourceC + 4 * x);
4536							unsigned int cD = *(unsigned int*)(sourceD + 4 * x);
4537							unsigned int cE = *(unsigned int*)(sourceE + 4 * x);
4538							unsigned int cF = *(unsigned int*)(sourceF + 4 * x);
4539
4540							c0 = AVERAGE(c0, c1);
4541							c2 = AVERAGE(c2, c3);
4542							c4 = AVERAGE(c4, c5);
4543							c6 = AVERAGE(c6, c7);
4544							c8 = AVERAGE(c8, c9);
4545							cA = AVERAGE(cA, cB);
4546							cC = AVERAGE(cC, cD);
4547							cE = AVERAGE(cE, cF);
4548							c0 = AVERAGE(c0, c2);
4549							c4 = AVERAGE(c4, c6);
4550							c8 = AVERAGE(c8, cA);
4551							cC = AVERAGE(cC, cE);
4552							c0 = AVERAGE(c0, c4);
4553							c8 = AVERAGE(c8, cC);
4554							c0 = AVERAGE(c0, c8);
4555
4556							*(unsigned int*)(source0 + 4 * x) = c0;
4557						}
4558
4559						source0 += pitch;
4560						source1 += pitch;
4561						source2 += pitch;
4562						source3 += pitch;
4563						source4 += pitch;
4564						source5 += pitch;
4565						source6 += pitch;
4566						source7 += pitch;
4567						source8 += pitch;
4568						source9 += pitch;
4569						sourceA += pitch;
4570						sourceB += pitch;
4571						sourceC += pitch;
4572						sourceD += pitch;
4573						sourceE += pitch;
4574						sourceF += pitch;
4575					}
4576				}
4577				else ASSERT(false);
4578
4579				#undef AVERAGE
4580			}
4581		}
4582		else if(internal.format == FORMAT_R32F)
4583		{
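		// Floating-point samples are summed and scaled by 1/N (an exact mean) instead of
		// being reduced with pairwise rounding averages as in the integer paths above.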
4584			if(CPUID::supportsSSE() && (width % 4) == 0)
4585			{
4586				if(internal.depth == 2)
4587				{
4588					for(int y = 0; y < height; y++)
4589					{
4590						for(int x = 0; x < width; x += 4)
4591						{
4592							__m128 c0 = _mm_load_ps((float*)(source0 + 4 * x));
4593							__m128 c1 = _mm_load_ps((float*)(source1 + 4 * x));
4594
4595							c0 = _mm_add_ps(c0, c1);
4596							c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 2.0f));
4597
4598							_mm_store_ps((float*)(source0 + 4 * x), c0);
4599						}
4600
4601						source0 += pitch;
4602						source1 += pitch;
4603					}
4604				}
4605				else if(internal.depth == 4)
4606				{
4607					for(int y = 0; y < height; y++)
4608					{
4609						for(int x = 0; x < width; x += 4)
4610						{
4611							__m128 c0 = _mm_load_ps((float*)(source0 + 4 * x));
4612							__m128 c1 = _mm_load_ps((float*)(source1 + 4 * x));
4613							__m128 c2 = _mm_load_ps((float*)(source2 + 4 * x));
4614							__m128 c3 = _mm_load_ps((float*)(source3 + 4 * x));
4615
4616							c0 = _mm_add_ps(c0, c1);
4617							c2 = _mm_add_ps(c2, c3);
4618							c0 = _mm_add_ps(c0, c2);
4619							c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 4.0f));
4620
4621							_mm_store_ps((float*)(source0 + 4 * x), c0);
4622						}
4623
4624						source0 += pitch;
4625						source1 += pitch;
4626						source2 += pitch;
4627						source3 += pitch;
4628					}
4629				}
4630				else if(internal.depth == 8)
4631				{
4632					for(int y = 0; y < height; y++)
4633					{
4634						for(int x = 0; x < width; x += 4)
4635						{
4636							__m128 c0 = _mm_load_ps((float*)(source0 + 4 * x));
4637							__m128 c1 = _mm_load_ps((float*)(source1 + 4 * x));
4638							__m128 c2 = _mm_load_ps((float*)(source2 + 4 * x));
4639							__m128 c3 = _mm_load_ps((float*)(source3 + 4 * x));
4640							__m128 c4 = _mm_load_ps((float*)(source4 + 4 * x));
4641							__m128 c5 = _mm_load_ps((float*)(source5 + 4 * x));
4642							__m128 c6 = _mm_load_ps((float*)(source6 + 4 * x));
4643							__m128 c7 = _mm_load_ps((float*)(source7 + 4 * x));
4644
4645							c0 = _mm_add_ps(c0, c1);
4646							c2 = _mm_add_ps(c2, c3);
4647							c4 = _mm_add_ps(c4, c5);
4648							c6 = _mm_add_ps(c6, c7);
4649							c0 = _mm_add_ps(c0, c2);
4650							c4 = _mm_add_ps(c4, c6);
4651							c0 = _mm_add_ps(c0, c4);
4652							c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 8.0f));
4653
4654							_mm_store_ps((float*)(source0 + 4 * x), c0);
4655						}
4656
4657						source0 += pitch;
4658						source1 += pitch;
4659						source2 += pitch;
4660						source3 += pitch;
4661						source4 += pitch;
4662						source5 += pitch;
4663						source6 += pitch;
4664						source7 += pitch;
4665					}
4666				}
4667				else if(internal.depth == 16)
4668				{
4669					for(int y = 0; y < height; y++)
4670					{
4671						for(int x = 0; x < width; x += 4)
4672						{
4673							__m128 c0 = _mm_load_ps((float*)(source0 + 4 * x));
4674							__m128 c1 = _mm_load_ps((float*)(source1 + 4 * x));
4675							__m128 c2 = _mm_load_ps((float*)(source2 + 4 * x));
4676							__m128 c3 = _mm_load_ps((float*)(source3 + 4 * x));
4677							__m128 c4 = _mm_load_ps((float*)(source4 + 4 * x));
4678							__m128 c5 = _mm_load_ps((float*)(source5 + 4 * x));
4679							__m128 c6 = _mm_load_ps((float*)(source6 + 4 * x));
4680							__m128 c7 = _mm_load_ps((float*)(source7 + 4 * x));
4681							__m128 c8 = _mm_load_ps((float*)(source8 + 4 * x));
4682							__m128 c9 = _mm_load_ps((float*)(source9 + 4 * x));
4683							__m128 cA = _mm_load_ps((float*)(sourceA + 4 * x));
4684							__m128 cB = _mm_load_ps((float*)(sourceB + 4 * x));
4685							__m128 cC = _mm_load_ps((float*)(sourceC + 4 * x));
4686							__m128 cD = _mm_load_ps((float*)(sourceD + 4 * x));
4687							__m128 cE = _mm_load_ps((float*)(sourceE + 4 * x));
4688							__m128 cF = _mm_load_ps((float*)(sourceF + 4 * x));
4689
4690							c0 = _mm_add_ps(c0, c1);
4691							c2 = _mm_add_ps(c2, c3);
4692							c4 = _mm_add_ps(c4, c5);
4693							c6 = _mm_add_ps(c6, c7);
4694							c8 = _mm_add_ps(c8, c9);
4695							cA = _mm_add_ps(cA, cB);
4696							cC = _mm_add_ps(cC, cD);
4697							cE = _mm_add_ps(cE, cF);
4698							c0 = _mm_add_ps(c0, c2);
4699							c4 = _mm_add_ps(c4, c6);
4700							c8 = _mm_add_ps(c8, cA);
4701							cC = _mm_add_ps(cC, cE);
4702							c0 = _mm_add_ps(c0, c4);
4703							c8 = _mm_add_ps(c8, cC);
4704							c0 = _mm_add_ps(c0, c8);
4705							c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 16.0f));
4706
4707							_mm_store_ps((float*)(source0 + 4 * x), c0);
4708						}
4709
4710						source0 += pitch;
4711						source1 += pitch;
4712						source2 += pitch;
4713						source3 += pitch;
4714						source4 += pitch;
4715						source5 += pitch;
4716						source6 += pitch;
4717						source7 += pitch;
4718						source8 += pitch;
4719						source9 += pitch;
4720						sourceA += pitch;
4721						sourceB += pitch;
4722						sourceC += pitch;
4723						sourceD += pitch;
4724						sourceE += pitch;
4725						sourceF += pitch;
4726					}
4727				}
4728				else ASSERT(false);
4729			}
4730			else
4731			{
4732				if(internal.depth == 2)
4733				{
4734					for(int y = 0; y < height; y++)
4735					{
4736						for(int x = 0; x < width; x++)
4737						{
4738							float c0 = *(float*)(source0 + 4 * x);
4739							float c1 = *(float*)(source1 + 4 * x);
4740
4741							c0 = c0 + c1;
4742							c0 *= 1.0f / 2.0f;
4743
4744							*(float*)(source0 + 4 * x) = c0;
4745						}
4746
4747						source0 += pitch;
4748						source1 += pitch;
4749					}
4750				}
4751				else if(internal.depth == 4)
4752				{
4753					for(int y = 0; y < height; y++)
4754					{
4755						for(int x = 0; x < width; x++)
4756						{
4757							float c0 = *(float*)(source0 + 4 * x);
4758							float c1 = *(float*)(source1 + 4 * x);
4759							float c2 = *(float*)(source2 + 4 * x);
4760							float c3 = *(float*)(source3 + 4 * x);
4761
4762							c0 = c0 + c1;
4763							c2 = c2 + c3;
4764							c0 = c0 + c2;
4765							c0 *= 1.0f / 4.0f;
4766
4767							*(float*)(source0 + 4 * x) = c0;
4768						}
4769
4770						source0 += pitch;
4771						source1 += pitch;
4772						source2 += pitch;
4773						source3 += pitch;
4774					}
4775				}
4776				else if(internal.depth == 8)
4777				{
4778					for(int y = 0; y < height; y++)
4779					{
4780						for(int x = 0; x < width; x++)
4781						{
4782							float c0 = *(float*)(source0 + 4 * x);
4783							float c1 = *(float*)(source1 + 4 * x);
4784							float c2 = *(float*)(source2 + 4 * x);
4785							float c3 = *(float*)(source3 + 4 * x);
4786							float c4 = *(float*)(source4 + 4 * x);
4787							float c5 = *(float*)(source5 + 4 * x);
4788							float c6 = *(float*)(source6 + 4 * x);
4789							float c7 = *(float*)(source7 + 4 * x);
4790
4791							c0 = c0 + c1;
4792							c2 = c2 + c3;
4793							c4 = c4 + c5;
4794							c6 = c6 + c7;
4795							c0 = c0 + c2;
4796							c4 = c4 + c6;
4797							c0 = c0 + c4;
4798							c0 *= 1.0f / 8.0f;
4799
4800							*(float*)(source0 + 4 * x) = c0;
4801						}
4802
4803						source0 += pitch;
4804						source1 += pitch;
4805						source2 += pitch;
4806						source3 += pitch;
4807						source4 += pitch;
4808						source5 += pitch;
4809						source6 += pitch;
4810						source7 += pitch;
4811					}
4812				}
4813				else if(internal.depth == 16)
4814				{
4815					for(int y = 0; y < height; y++)
4816					{
4817						for(int x = 0; x < width; x++)
4818						{
4819							float c0 = *(float*)(source0 + 4 * x);
4820							float c1 = *(float*)(source1 + 4 * x);
4821							float c2 = *(float*)(source2 + 4 * x);
4822							float c3 = *(float*)(source3 + 4 * x);
4823							float c4 = *(float*)(source4 + 4 * x);
4824							float c5 = *(float*)(source5 + 4 * x);
4825							float c6 = *(float*)(source6 + 4 * x);
4826							float c7 = *(float*)(source7 + 4 * x);
4827							float c8 = *(float*)(source8 + 4 * x);
4828							float c9 = *(float*)(source9 + 4 * x);
4829							float cA = *(float*)(sourceA + 4 * x);
4830							float cB = *(float*)(sourceB + 4 * x);
4831							float cC = *(float*)(sourceC + 4 * x);
4832							float cD = *(float*)(sourceD + 4 * x);
4833							float cE = *(float*)(sourceE + 4 * x);
4834							float cF = *(float*)(sourceF + 4 * x);
4835
4836							c0 = c0 + c1;
4837							c2 = c2 + c3;
4838							c4 = c4 + c5;
4839							c6 = c6 + c7;
4840							c8 = c8 + c9;
4841							cA = cA + cB;
4842							cC = cC + cD;
4843							cE = cE + cF;
4844							c0 = c0 + c2;
4845							c4 = c4 + c6;
4846							c8 = c8 + cA;
4847							cC = cC + cE;
4848							c0 = c0 + c4;
4849							c8 = c8 + cC;
4850							c0 = c0 + c8;
4851							c0 *= 1.0f / 16.0f;
4852
4853							*(float*)(source0 + 4 * x) = c0;
4854						}
4855
4856						source0 += pitch;
4857						source1 += pitch;
4858						source2 += pitch;
4859						source3 += pitch;
4860						source4 += pitch;
4861						source5 += pitch;
4862						source6 += pitch;
4863						source7 += pitch;
4864						source8 += pitch;
4865						source9 += pitch;
4866						sourceA += pitch;
4867						sourceB += pitch;
4868						sourceC += pitch;
4869						sourceD += pitch;
4870						sourceE += pitch;
4871						sourceF += pitch;
4872					}
4873				}
4874				else ASSERT(false);
4875			}
4876		}
4877		else if(internal.format == FORMAT_G32R32F)
4878		{
4879			if(CPUID::supportsSSE() && (width % 2) == 0)
4880			{
4881				if(internal.depth == 2)
4882				{
4883					for(int y = 0; y < height; y++)
4884					{
4885						for(int x = 0; x < width; x += 2)
4886						{
4887							__m128 c0 = _mm_load_ps((float*)(source0 + 8 * x));
4888							__m128 c1 = _mm_load_ps((float*)(source1 + 8 * x));
4889
4890							c0 = _mm_add_ps(c0, c1);
4891							c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 2.0f));
4892
4893							_mm_store_ps((float*)(source0 + 8 * x), c0);
4894						}
4895
4896						source0 += pitch;
4897						source1 += pitch;
4898					}
4899				}
4900				else if(internal.depth == 4)
4901				{
4902					for(int y = 0; y < height; y++)
4903					{
4904						for(int x = 0; x < width; x += 2)
4905						{
4906							__m128 c0 = _mm_load_ps((float*)(source0 + 8 * x));
4907							__m128 c1 = _mm_load_ps((float*)(source1 + 8 * x));
4908							__m128 c2 = _mm_load_ps((float*)(source2 + 8 * x));
4909							__m128 c3 = _mm_load_ps((float*)(source3 + 8 * x));
4910
4911							c0 = _mm_add_ps(c0, c1);
4912							c2 = _mm_add_ps(c2, c3);
4913							c0 = _mm_add_ps(c0, c2);
4914							c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 4.0f));
4915
4916							_mm_store_ps((float*)(source0 + 8 * x), c0);
4917						}
4918
4919						source0 += pitch;
4920						source1 += pitch;
4921						source2 += pitch;
4922						source3 += pitch;
4923					}
4924				}
4925				else if(internal.depth == 8)
4926				{
4927					for(int y = 0; y < height; y++)
4928					{
4929						for(int x = 0; x < width; x += 2)
4930						{
4931							__m128 c0 = _mm_load_ps((float*)(source0 + 8 * x));
4932							__m128 c1 = _mm_load_ps((float*)(source1 + 8 * x));
4933							__m128 c2 = _mm_load_ps((float*)(source2 + 8 * x));
4934							__m128 c3 = _mm_load_ps((float*)(source3 + 8 * x));
4935							__m128 c4 = _mm_load_ps((float*)(source4 + 8 * x));
4936							__m128 c5 = _mm_load_ps((float*)(source5 + 8 * x));
4937							__m128 c6 = _mm_load_ps((float*)(source6 + 8 * x));
4938							__m128 c7 = _mm_load_ps((float*)(source7 + 8 * x));
4939
4940							c0 = _mm_add_ps(c0, c1);
4941							c2 = _mm_add_ps(c2, c3);
4942							c4 = _mm_add_ps(c4, c5);
4943							c6 = _mm_add_ps(c6, c7);
4944							c0 = _mm_add_ps(c0, c2);
4945							c4 = _mm_add_ps(c4, c6);
4946							c0 = _mm_add_ps(c0, c4);
4947							c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 8.0f));
4948
4949							_mm_store_ps((float*)(source0 + 8 * x), c0);
4950						}
4951
4952						source0 += pitch;
4953						source1 += pitch;
4954						source2 += pitch;
4955						source3 += pitch;
4956						source4 += pitch;
4957						source5 += pitch;
4958						source6 += pitch;
4959						source7 += pitch;
4960					}
4961				}
4962				else if(internal.depth == 16)
4963				{
4964					for(int y = 0; y < height; y++)
4965					{
4966						for(int x = 0; x < width; x += 2)
4967						{
4968							__m128 c0 = _mm_load_ps((float*)(source0 + 8 * x));
4969							__m128 c1 = _mm_load_ps((float*)(source1 + 8 * x));
4970							__m128 c2 = _mm_load_ps((float*)(source2 + 8 * x));
4971							__m128 c3 = _mm_load_ps((float*)(source3 + 8 * x));
4972							__m128 c4 = _mm_load_ps((float*)(source4 + 8 * x));
4973							__m128 c5 = _mm_load_ps((float*)(source5 + 8 * x));
4974							__m128 c6 = _mm_load_ps((float*)(source6 + 8 * x));
4975							__m128 c7 = _mm_load_ps((float*)(source7 + 8 * x));
4976							__m128 c8 = _mm_load_ps((float*)(source8 + 8 * x));
4977							__m128 c9 = _mm_load_ps((float*)(source9 + 8 * x));
4978							__m128 cA = _mm_load_ps((float*)(sourceA + 8 * x));
4979							__m128 cB = _mm_load_ps((float*)(sourceB + 8 * x));
4980							__m128 cC = _mm_load_ps((float*)(sourceC + 8 * x));
4981							__m128 cD = _mm_load_ps((float*)(sourceD + 8 * x));
4982							__m128 cE = _mm_load_ps((float*)(sourceE + 8 * x));
4983							__m128 cF = _mm_load_ps((float*)(sourceF + 8 * x));
4984
4985							c0 = _mm_add_ps(c0, c1);
4986							c2 = _mm_add_ps(c2, c3);
4987							c4 = _mm_add_ps(c4, c5);
4988							c6 = _mm_add_ps(c6, c7);
4989							c8 = _mm_add_ps(c8, c9);
4990							cA = _mm_add_ps(cA, cB);
4991							cC = _mm_add_ps(cC, cD);
4992							cE = _mm_add_ps(cE, cF);
4993							c0 = _mm_add_ps(c0, c2);
4994							c4 = _mm_add_ps(c4, c6);
4995							c8 = _mm_add_ps(c8, cA);
4996							cC = _mm_add_ps(cC, cE);
4997							c0 = _mm_add_ps(c0, c4);
4998							c8 = _mm_add_ps(c8, cC);
4999							c0 = _mm_add_ps(c0, c8);
5000							c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 16.0f));
5001
5002							_mm_store_ps((float*)(source0 + 8 * x), c0);
5003						}
5004
5005						source0 += pitch;
5006						source1 += pitch;
5007						source2 += pitch;
5008						source3 += pitch;
5009						source4 += pitch;
5010						source5 += pitch;
5011						source6 += pitch;
5012						source7 += pitch;
5013						source8 += pitch;
5014						source9 += pitch;
5015						sourceA += pitch;
5016						sourceB += pitch;
5017						sourceC += pitch;
5018						sourceD += pitch;
5019						sourceE += pitch;
5020						sourceF += pitch;
5021					}
5022				}
5023				else ASSERT(false);
5024			}
5025			else
5026			{
5027				if(internal.depth == 2)
5028				{
5029					for(int y = 0; y < height; y++)
5030					{
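						// Each G32R32F pixel is two floats, so the scalar fallback walks
						// 2 * width 32-bit elements per row.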
5031						for(int x = 0; x < 2 * width; x++)
5032						{
5033							float c0 = *(float*)(source0 + 4 * x);
5034							float c1 = *(float*)(source1 + 4 * x);
5035
5036							c0 = c0 + c1;
5037							c0 *= 1.0f / 2.0f;
5038
5039							*(float*)(source0 + 4 * x) = c0;
5040						}
5041
5042						source0 += pitch;
5043						source1 += pitch;
5044					}
5045				}
5046				else if(internal.depth == 4)
5047				{
5048					for(int y = 0; y < height; y++)
5049					{
5050						for(int x = 0; x < 2 * width; x++)
5051						{
5052							float c0 = *(float*)(source0 + 4 * x);
5053							float c1 = *(float*)(source1 + 4 * x);
5054							float c2 = *(float*)(source2 + 4 * x);
5055							float c3 = *(float*)(source3 + 4 * x);
5056
5057							c0 = c0 + c1;
5058							c2 = c2 + c3;
5059							c0 = c0 + c2;
5060							c0 *= 1.0f / 4.0f;
5061
5062							*(float*)(source0 + 4 * x) = c0;
5063						}
5064
5065						source0 += pitch;
5066						source1 += pitch;
5067						source2 += pitch;
5068						source3 += pitch;
5069					}
5070				}
5071				else if(internal.depth == 8)
5072				{
5073					for(int y = 0; y < height; y++)
5074					{
5075						for(int x = 0; x < 2 * width; x++)
5076						{
5077							float c0 = *(float*)(source0 + 4 * x);
5078							float c1 = *(float*)(source1 + 4 * x);
5079							float c2 = *(float*)(source2 + 4 * x);
5080							float c3 = *(float*)(source3 + 4 * x);
5081							float c4 = *(float*)(source4 + 4 * x);
5082							float c5 = *(float*)(source5 + 4 * x);
5083							float c6 = *(float*)(source6 + 4 * x);
5084							float c7 = *(float*)(source7 + 4 * x);
5085
5086							c0 = c0 + c1;
5087							c2 = c2 + c3;
5088							c4 = c4 + c5;
5089							c6 = c6 + c7;
5090							c0 = c0 + c2;
5091							c4 = c4 + c6;
5092							c0 = c0 + c4;
5093							c0 *= 1.0f / 8.0f;
5094
5095							*(float*)(source0 + 4 * x) = c0;
5096						}
5097
5098						source0 += pitch;
5099						source1 += pitch;
5100						source2 += pitch;
5101						source3 += pitch;
5102						source4 += pitch;
5103						source5 += pitch;
5104						source6 += pitch;
5105						source7 += pitch;
5106					}
5107				}
5108				else if(internal.depth == 16)
5109				{
5110					for(int y = 0; y < height; y++)
5111					{
5112						for(int x = 0; x < 2 * width; x++)
5113						{
5114							float c0 = *(float*)(source0 + 4 * x);
5115							float c1 = *(float*)(source1 + 4 * x);
5116							float c2 = *(float*)(source2 + 4 * x);
5117							float c3 = *(float*)(source3 + 4 * x);
5118							float c4 = *(float*)(source4 + 4 * x);
5119							float c5 = *(float*)(source5 + 4 * x);
5120							float c6 = *(float*)(source6 + 4 * x);
5121							float c7 = *(float*)(source7 + 4 * x);
5122							float c8 = *(float*)(source8 + 4 * x);
5123							float c9 = *(float*)(source9 + 4 * x);
5124							float cA = *(float*)(sourceA + 4 * x);
5125							float cB = *(float*)(sourceB + 4 * x);
5126							float cC = *(float*)(sourceC + 4 * x);
5127							float cD = *(float*)(sourceD + 4 * x);
5128							float cE = *(float*)(sourceE + 4 * x);
5129							float cF = *(float*)(sourceF + 4 * x);
5130
5131							c0 = c0 + c1;
5132							c2 = c2 + c3;
5133							c4 = c4 + c5;
5134							c6 = c6 + c7;
5135							c8 = c8 + c9;
5136							cA = cA + cB;
5137							cC = cC + cD;
5138							cE = cE + cF;
5139							c0 = c0 + c2;
5140							c4 = c4 + c6;
5141							c8 = c8 + cA;
5142							cC = cC + cE;
5143							c0 = c0 + c4;
5144							c8 = c8 + cC;
5145							c0 = c0 + c8;
5146							c0 *= 1.0f / 16.0f;
5147
5148							*(float*)(source0 + 4 * x) = c0;
5149						}
5150
5151						source0 += pitch;
5152						source1 += pitch;
5153						source2 += pitch;
5154						source3 += pitch;
5155						source4 += pitch;
5156						source5 += pitch;
5157						source6 += pitch;
5158						source7 += pitch;
5159						source8 += pitch;
5160						source9 += pitch;
5161						sourceA += pitch;
5162						sourceB += pitch;
5163						sourceC += pitch;
5164						sourceD += pitch;
5165						sourceE += pitch;
5166						sourceF += pitch;
5167					}
5168				}
5169				else ASSERT(false);
5170			}
5171		}
5172		else if(internal.format == FORMAT_A32B32G32R32F)
5173		{
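			// Resolve for FP32 RGBA: a pixel is four floats (16 bytes), so the SSE path
			// averages one whole pixel per __m128, summing the samples pairwise and
			// scaling by 1 / depth.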
5174			if(CPUID::supportsSSE())
5175			{
5176				if(internal.depth == 2)
5177				{
5178					for(int y = 0; y < height; y++)
5179					{
5180						for(int x = 0; x < width; x++)
5181						{
5182							__m128 c0 = _mm_load_ps((float*)(source0 + 16 * x));
5183							__m128 c1 = _mm_load_ps((float*)(source1 + 16 * x));
5184
5185							c0 = _mm_add_ps(c0, c1);
5186							c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 2.0f));
5187
5188							_mm_store_ps((float*)(source0 + 16 * x), c0);
5189						}
5190
5191						source0 += pitch;
5192						source1 += pitch;
5193					}
5194				}
5195				else if(internal.depth == 4)
5196				{
5197					for(int y = 0; y < height; y++)
5198					{
5199						for(int x = 0; x < width; x++)
5200						{
5201							__m128 c0 = _mm_load_ps((float*)(source0 + 16 * x));
5202							__m128 c1 = _mm_load_ps((float*)(source1 + 16 * x));
5203							__m128 c2 = _mm_load_ps((float*)(source2 + 16 * x));
5204							__m128 c3 = _mm_load_ps((float*)(source3 + 16 * x));
5205
5206							c0 = _mm_add_ps(c0, c1);
5207							c2 = _mm_add_ps(c2, c3);
5208							c0 = _mm_add_ps(c0, c2);
5209							c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 4.0f));
5210
5211							_mm_store_ps((float*)(source0 + 16 * x), c0);
5212						}
5213
5214						source0 += pitch;
5215						source1 += pitch;
5216						source2 += pitch;
5217						source3 += pitch;
5218					}
5219				}
5220				else if(internal.depth == 8)
5221				{
5222					for(int y = 0; y < height; y++)
5223					{
5224						for(int x = 0; x < width; x++)
5225						{
5226							__m128 c0 = _mm_load_ps((float*)(source0 + 16 * x));
5227							__m128 c1 = _mm_load_ps((float*)(source1 + 16 * x));
5228							__m128 c2 = _mm_load_ps((float*)(source2 + 16 * x));
5229							__m128 c3 = _mm_load_ps((float*)(source3 + 16 * x));
5230							__m128 c4 = _mm_load_ps((float*)(source4 + 16 * x));
5231							__m128 c5 = _mm_load_ps((float*)(source5 + 16 * x));
5232							__m128 c6 = _mm_load_ps((float*)(source6 + 16 * x));
5233							__m128 c7 = _mm_load_ps((float*)(source7 + 16 * x));
5234
5235							c0 = _mm_add_ps(c0, c1);
5236							c2 = _mm_add_ps(c2, c3);
5237							c4 = _mm_add_ps(c4, c5);
5238							c6 = _mm_add_ps(c6, c7);
5239							c0 = _mm_add_ps(c0, c2);
5240							c4 = _mm_add_ps(c4, c6);
5241							c0 = _mm_add_ps(c0, c4);
5242							c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 8.0f));
5243
5244							_mm_store_ps((float*)(source0 + 16 * x), c0);
5245						}
5246
5247						source0 += pitch;
5248						source1 += pitch;
5249						source2 += pitch;
5250						source3 += pitch;
5251						source4 += pitch;
5252						source5 += pitch;
5253						source6 += pitch;
5254						source7 += pitch;
5255					}
5256				}
5257				else if(internal.depth == 16)
5258				{
5259					for(int y = 0; y < height; y++)
5260					{
5261						for(int x = 0; x < width; x++)
5262						{
5263							__m128 c0 = _mm_load_ps((float*)(source0 + 16 * x));
5264							__m128 c1 = _mm_load_ps((float*)(source1 + 16 * x));
5265							__m128 c2 = _mm_load_ps((float*)(source2 + 16 * x));
5266							__m128 c3 = _mm_load_ps((float*)(source3 + 16 * x));
5267							__m128 c4 = _mm_load_ps((float*)(source4 + 16 * x));
5268							__m128 c5 = _mm_load_ps((float*)(source5 + 16 * x));
5269							__m128 c6 = _mm_load_ps((float*)(source6 + 16 * x));
5270							__m128 c7 = _mm_load_ps((float*)(source7 + 16 * x));
5271							__m128 c8 = _mm_load_ps((float*)(source8 + 16 * x));
5272							__m128 c9 = _mm_load_ps((float*)(source9 + 16 * x));
5273							__m128 cA = _mm_load_ps((float*)(sourceA + 16 * x));
5274							__m128 cB = _mm_load_ps((float*)(sourceB + 16 * x));
5275							__m128 cC = _mm_load_ps((float*)(sourceC + 16 * x));
5276							__m128 cD = _mm_load_ps((float*)(sourceD + 16 * x));
5277							__m128 cE = _mm_load_ps((float*)(sourceE + 16 * x));
5278							__m128 cF = _mm_load_ps((float*)(sourceF + 16 * x));
5279
5280							c0 = _mm_add_ps(c0, c1);
5281							c2 = _mm_add_ps(c2, c3);
5282							c4 = _mm_add_ps(c4, c5);
5283							c6 = _mm_add_ps(c6, c7);
5284							c8 = _mm_add_ps(c8, c9);
5285							cA = _mm_add_ps(cA, cB);
5286							cC = _mm_add_ps(cC, cD);
5287							cE = _mm_add_ps(cE, cF);
5288							c0 = _mm_add_ps(c0, c2);
5289							c4 = _mm_add_ps(c4, c6);
5290							c8 = _mm_add_ps(c8, cA);
5291							cC = _mm_add_ps(cC, cE);
5292							c0 = _mm_add_ps(c0, c4);
5293							c8 = _mm_add_ps(c8, cC);
5294							c0 = _mm_add_ps(c0, c8);
5295							c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 16.0f));
5296
5297							_mm_store_ps((float*)(source0 + 16 * x), c0);
5298						}
5299
5300						source0 += pitch;
5301						source1 += pitch;
5302						source2 += pitch;
5303						source3 += pitch;
5304						source4 += pitch;
5305						source5 += pitch;
5306						source6 += pitch;
5307						source7 += pitch;
5308						source8 += pitch;
5309						source9 += pitch;
5310						sourceA += pitch;
5311						sourceB += pitch;
5312						sourceC += pitch;
5313						sourceD += pitch;
5314						sourceE += pitch;
5315						sourceF += pitch;
5316					}
5317				}
5318				else ASSERT(false);
5319			}
5320			else
5321			{
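				// Scalar fallback: treat each row as 4 * width float components and
				// average the matching component from every sample slice.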
5322				if(internal.depth == 2)
5323				{
5324					for(int y = 0; y < height; y++)
5325					{
5326						for(int x = 0; x < 4 * width; x++)
5327						{
5328							float c0 = *(float*)(source0 + 4 * x);
5329							float c1 = *(float*)(source1 + 4 * x);
5330
5331							c0 = c0 + c1;
5332							c0 *= 1.0f / 2.0f;
5333
5334							*(float*)(source0 + 4 * x) = c0;
5335						}
5336
5337						source0 += pitch;
5338						source1 += pitch;
5339					}
5340				}
5341				else if(internal.depth == 4)
5342				{
5343					for(int y = 0; y < height; y++)
5344					{
5345						for(int x = 0; x < 4 * width; x++)
5346						{
5347							float c0 = *(float*)(source0 + 4 * x);
5348							float c1 = *(float*)(source1 + 4 * x);
5349							float c2 = *(float*)(source2 + 4 * x);
5350							float c3 = *(float*)(source3 + 4 * x);
5351
5352							c0 = c0 + c1;
5353							c2 = c2 + c3;
5354							c0 = c0 + c2;
5355							c0 *= 1.0f / 4.0f;
5356
5357							*(float*)(source0 + 4 * x) = c0;
5358						}
5359
5360						source0 += pitch;
5361						source1 += pitch;
5362						source2 += pitch;
5363						source3 += pitch;
5364					}
5365				}
5366				else if(internal.depth == 8)
5367				{
5368					for(int y = 0; y < height; y++)
5369					{
5370						for(int x = 0; x < 4 * width; x++)
5371						{
5372							float c0 = *(float*)(source0 + 4 * x);
5373							float c1 = *(float*)(source1 + 4 * x);
5374							float c2 = *(float*)(source2 + 4 * x);
5375							float c3 = *(float*)(source3 + 4 * x);
5376							float c4 = *(float*)(source4 + 4 * x);
5377							float c5 = *(float*)(source5 + 4 * x);
5378							float c6 = *(float*)(source6 + 4 * x);
5379							float c7 = *(float*)(source7 + 4 * x);
5380
5381							c0 = c0 + c1;
5382							c2 = c2 + c3;
5383							c4 = c4 + c5;
5384							c6 = c6 + c7;
5385							c0 = c0 + c2;
5386							c4 = c4 + c6;
5387							c0 = c0 + c4;
5388							c0 *= 1.0f / 8.0f;
5389
5390							*(float*)(source0 + 4 * x) = c0;
5391						}
5392
5393						source0 += pitch;
5394						source1 += pitch;
5395						source2 += pitch;
5396						source3 += pitch;
5397						source4 += pitch;
5398						source5 += pitch;
5399						source6 += pitch;
5400						source7 += pitch;
5401					}
5402				}
5403				else if(internal.depth == 16)
5404				{
5405					for(int y = 0; y < height; y++)
5406					{
5407						for(int x = 0; x < 4 * width; x++)
5408						{
5409							float c0 = *(float*)(source0 + 4 * x);
5410							float c1 = *(float*)(source1 + 4 * x);
5411							float c2 = *(float*)(source2 + 4 * x);
5412							float c3 = *(float*)(source3 + 4 * x);
5413							float c4 = *(float*)(source4 + 4 * x);
5414							float c5 = *(float*)(source5 + 4 * x);
5415							float c6 = *(float*)(source6 + 4 * x);
5416							float c7 = *(float*)(source7 + 4 * x);
5417							float c8 = *(float*)(source8 + 4 * x);
5418							float c9 = *(float*)(source9 + 4 * x);
5419							float cA = *(float*)(sourceA + 4 * x);
5420							float cB = *(float*)(sourceB + 4 * x);
5421							float cC = *(float*)(sourceC + 4 * x);
5422							float cD = *(float*)(sourceD + 4 * x);
5423							float cE = *(float*)(sourceE + 4 * x);
5424							float cF = *(float*)(sourceF + 4 * x);
5425
5426							c0 = c0 + c1;
5427							c2 = c2 + c3;
5428							c4 = c4 + c5;
5429							c6 = c6 + c7;
5430							c8 = c8 + c9;
5431							cA = cA + cB;
5432							cC = cC + cD;
5433							cE = cE + cF;
5434							c0 = c0 + c2;
5435							c4 = c4 + c6;
5436							c8 = c8 + cA;
5437							cC = cC + cE;
5438							c0 = c0 + c4;
5439							c8 = c8 + cC;
5440							c0 = c0 + c8;
5441							c0 *= 1.0f / 16.0f;
5442
5443							*(float*)(source0 + 4 * x) = c0;
5444						}
5445
5446						source0 += pitch;
5447						source1 += pitch;
5448						source2 += pitch;
5449						source3 += pitch;
5450						source4 += pitch;
5451						source5 += pitch;
5452						source6 += pitch;
5453						source7 += pitch;
5454						source8 += pitch;
5455						source9 += pitch;
5456						sourceA += pitch;
5457						sourceB += pitch;
5458						sourceC += pitch;
5459						sourceD += pitch;
5460						sourceE += pitch;
5461						sourceF += pitch;
5462					}
5463				}
5464				else ASSERT(false);
5465			}
5466		}
5467		else if(internal.format == FORMAT_R5G6B5)
5468		{
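			// R5G6B5 resolve: split each 16-bit pixel into its red|blue (0xF81F) and
			// green (0x07E0) fields so the fields can be averaged without carries
			// crossing into a neighbour. The SSE2 path handles 8 pixels per iteration
			// with pavgb/pavgw and recombines the fields afterwards.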
5469			if(CPUID::supportsSSE2() && (width % 8) == 0)
5470			{
5471				if(internal.depth == 2)
5472				{
5473					for(int y = 0; y < height; y++)
5474					{
5475						for(int x = 0; x < width; x += 8)
5476						{
5477							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 2 * x));
5478							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 2 * x));
5479
5480							static const ushort8 r_b = {0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F};
5481							static const ushort8 _g_ = {0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0};
5482							__m128i c0_r_b = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
5483							__m128i c0__g_ = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(_g_));
5484							__m128i c1_r_b = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(r_b));
5485							__m128i c1__g_ = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
5486
5487							c0 = _mm_avg_epu8(c0_r_b, c1_r_b);
5488							c0 = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
5489							c1 = _mm_avg_epu16(c0__g_, c1__g_);
5490							c1 = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
5491							c0 = _mm_or_si128(c0, c1);
5492
5493							_mm_store_si128((__m128i*)(source0 + 2 * x), c0);
5494						}
5495
5496						source0 += pitch;
5497						source1 += pitch;
5498					}
5499				}
5500				else if(internal.depth == 4)
5501				{
5502					for(int y = 0; y < height; y++)
5503					{
5504						for(int x = 0; x < width; x += 8)
5505						{
5506							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 2 * x));
5507							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 2 * x));
5508							__m128i c2 = _mm_load_si128((__m128i*)(source2 + 2 * x));
5509							__m128i c3 = _mm_load_si128((__m128i*)(source3 + 2 * x));
5510
5511							static const ushort8 r_b = {0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F};
5512							static const ushort8 _g_ = {0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0};
5513							__m128i c0_r_b = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
5514							__m128i c0__g_ = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(_g_));
5515							__m128i c1_r_b = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(r_b));
5516							__m128i c1__g_ = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
5517							__m128i c2_r_b = _mm_and_si128(c2, reinterpret_cast<const __m128i&>(r_b));
5518							__m128i c2__g_ = _mm_and_si128(c2, reinterpret_cast<const __m128i&>(_g_));
5519							__m128i c3_r_b = _mm_and_si128(c3, reinterpret_cast<const __m128i&>(r_b));
5520							__m128i c3__g_ = _mm_and_si128(c3, reinterpret_cast<const __m128i&>(_g_));
5521
5522							c0 = _mm_avg_epu8(c0_r_b, c1_r_b);
5523							c2 = _mm_avg_epu8(c2_r_b, c3_r_b);
5524							c0 = _mm_avg_epu8(c0, c2);
5525							c0 = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
5526							c1 = _mm_avg_epu16(c0__g_, c1__g_);
5527							c3 = _mm_avg_epu16(c2__g_, c3__g_);
5528							c1 = _mm_avg_epu16(c1, c3);
5529							c1 = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
5530							c0 = _mm_or_si128(c0, c1);
5531
5532							_mm_store_si128((__m128i*)(source0 + 2 * x), c0);
5533						}
5534
5535						source0 += pitch;
5536						source1 += pitch;
5537						source2 += pitch;
5538						source3 += pitch;
5539					}
5540				}
5541				else if(internal.depth == 8)
5542				{
5543					for(int y = 0; y < height; y++)
5544					{
5545						for(int x = 0; x < width; x += 8)
5546						{
5547							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 2 * x));
5548							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 2 * x));
5549							__m128i c2 = _mm_load_si128((__m128i*)(source2 + 2 * x));
5550							__m128i c3 = _mm_load_si128((__m128i*)(source3 + 2 * x));
5551							__m128i c4 = _mm_load_si128((__m128i*)(source4 + 2 * x));
5552							__m128i c5 = _mm_load_si128((__m128i*)(source5 + 2 * x));
5553							__m128i c6 = _mm_load_si128((__m128i*)(source6 + 2 * x));
5554							__m128i c7 = _mm_load_si128((__m128i*)(source7 + 2 * x));
5555
5556							static const ushort8 r_b = {0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F};
5557							static const ushort8 _g_ = {0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0};
5558							__m128i c0_r_b = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
5559							__m128i c0__g_ = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(_g_));
5560							__m128i c1_r_b = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(r_b));
5561							__m128i c1__g_ = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
5562							__m128i c2_r_b = _mm_and_si128(c2, reinterpret_cast<const __m128i&>(r_b));
5563							__m128i c2__g_ = _mm_and_si128(c2, reinterpret_cast<const __m128i&>(_g_));
5564							__m128i c3_r_b = _mm_and_si128(c3, reinterpret_cast<const __m128i&>(r_b));
5565							__m128i c3__g_ = _mm_and_si128(c3, reinterpret_cast<const __m128i&>(_g_));
5566							__m128i c4_r_b = _mm_and_si128(c4, reinterpret_cast<const __m128i&>(r_b));
5567							__m128i c4__g_ = _mm_and_si128(c4, reinterpret_cast<const __m128i&>(_g_));
5568							__m128i c5_r_b = _mm_and_si128(c5, reinterpret_cast<const __m128i&>(r_b));
5569							__m128i c5__g_ = _mm_and_si128(c5, reinterpret_cast<const __m128i&>(_g_));
5570							__m128i c6_r_b = _mm_and_si128(c6, reinterpret_cast<const __m128i&>(r_b));
5571							__m128i c6__g_ = _mm_and_si128(c6, reinterpret_cast<const __m128i&>(_g_));
5572							__m128i c7_r_b = _mm_and_si128(c7, reinterpret_cast<const __m128i&>(r_b));
5573							__m128i c7__g_ = _mm_and_si128(c7, reinterpret_cast<const __m128i&>(_g_));
5574
5575							c0 = _mm_avg_epu8(c0_r_b, c1_r_b);
5576							c2 = _mm_avg_epu8(c2_r_b, c3_r_b);
5577							c4 = _mm_avg_epu8(c4_r_b, c5_r_b);
5578							c6 = _mm_avg_epu8(c6_r_b, c7_r_b);
5579							c0 = _mm_avg_epu8(c0, c2);
5580							c4 = _mm_avg_epu8(c4, c6);
5581							c0 = _mm_avg_epu8(c0, c4);
5582							c0 = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
5583							c1 = _mm_avg_epu16(c0__g_, c1__g_);
5584							c3 = _mm_avg_epu16(c2__g_, c3__g_);
5585							c5 = _mm_avg_epu16(c4__g_, c5__g_);
5586							c7 = _mm_avg_epu16(c6__g_, c7__g_);
5587							c1 = _mm_avg_epu16(c1, c3);
5588							c5 = _mm_avg_epu16(c5, c7);
5589							c1 = _mm_avg_epu16(c1, c5);
5590							c1 = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
5591							c0 = _mm_or_si128(c0, c1);
5592
5593							_mm_store_si128((__m128i*)(source0 + 2 * x), c0);
5594						}
5595
5596						source0 += pitch;
5597						source1 += pitch;
5598						source2 += pitch;
5599						source3 += pitch;
5600						source4 += pitch;
5601						source5 += pitch;
5602						source6 += pitch;
5603						source7 += pitch;
5604					}
5605				}
5606				else if(internal.depth == 16)
5607				{
5608					for(int y = 0; y < height; y++)
5609					{
5610						for(int x = 0; x < width; x += 8)
5611						{
5612							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 2 * x));
5613							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 2 * x));
5614							__m128i c2 = _mm_load_si128((__m128i*)(source2 + 2 * x));
5615							__m128i c3 = _mm_load_si128((__m128i*)(source3 + 2 * x));
5616							__m128i c4 = _mm_load_si128((__m128i*)(source4 + 2 * x));
5617							__m128i c5 = _mm_load_si128((__m128i*)(source5 + 2 * x));
5618							__m128i c6 = _mm_load_si128((__m128i*)(source6 + 2 * x));
5619							__m128i c7 = _mm_load_si128((__m128i*)(source7 + 2 * x));
5620							__m128i c8 = _mm_load_si128((__m128i*)(source8 + 2 * x));
5621							__m128i c9 = _mm_load_si128((__m128i*)(source9 + 2 * x));
5622							__m128i cA = _mm_load_si128((__m128i*)(sourceA + 2 * x));
5623							__m128i cB = _mm_load_si128((__m128i*)(sourceB + 2 * x));
5624							__m128i cC = _mm_load_si128((__m128i*)(sourceC + 2 * x));
5625							__m128i cD = _mm_load_si128((__m128i*)(sourceD + 2 * x));
5626							__m128i cE = _mm_load_si128((__m128i*)(sourceE + 2 * x));
5627							__m128i cF = _mm_load_si128((__m128i*)(sourceF + 2 * x));
5628
5629							static const ushort8 r_b = {0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F};
5630							static const ushort8 _g_ = {0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0};
5631							__m128i c0_r_b = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
5632							__m128i c0__g_ = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(_g_));
5633							__m128i c1_r_b = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(r_b));
5634							__m128i c1__g_ = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
5635							__m128i c2_r_b = _mm_and_si128(c2, reinterpret_cast<const __m128i&>(r_b));
5636							__m128i c2__g_ = _mm_and_si128(c2, reinterpret_cast<const __m128i&>(_g_));
5637							__m128i c3_r_b = _mm_and_si128(c3, reinterpret_cast<const __m128i&>(r_b));
5638							__m128i c3__g_ = _mm_and_si128(c3, reinterpret_cast<const __m128i&>(_g_));
5639							__m128i c4_r_b = _mm_and_si128(c4, reinterpret_cast<const __m128i&>(r_b));
5640							__m128i c4__g_ = _mm_and_si128(c4, reinterpret_cast<const __m128i&>(_g_));
5641							__m128i c5_r_b = _mm_and_si128(c5, reinterpret_cast<const __m128i&>(r_b));
5642							__m128i c5__g_ = _mm_and_si128(c5, reinterpret_cast<const __m128i&>(_g_));
5643							__m128i c6_r_b = _mm_and_si128(c6, reinterpret_cast<const __m128i&>(r_b));
5644							__m128i c6__g_ = _mm_and_si128(c6, reinterpret_cast<const __m128i&>(_g_));
5645							__m128i c7_r_b = _mm_and_si128(c7, reinterpret_cast<const __m128i&>(r_b));
5646							__m128i c7__g_ = _mm_and_si128(c7, reinterpret_cast<const __m128i&>(_g_));
5647							__m128i c8_r_b = _mm_and_si128(c8, reinterpret_cast<const __m128i&>(r_b));
5648							__m128i c8__g_ = _mm_and_si128(c8, reinterpret_cast<const __m128i&>(_g_));
5649							__m128i c9_r_b = _mm_and_si128(c9, reinterpret_cast<const __m128i&>(r_b));
5650							__m128i c9__g_ = _mm_and_si128(c9, reinterpret_cast<const __m128i&>(_g_));
5651							__m128i cA_r_b = _mm_and_si128(cA, reinterpret_cast<const __m128i&>(r_b));
5652							__m128i cA__g_ = _mm_and_si128(cA, reinterpret_cast<const __m128i&>(_g_));
5653							__m128i cB_r_b = _mm_and_si128(cB, reinterpret_cast<const __m128i&>(r_b));
5654							__m128i cB__g_ = _mm_and_si128(cB, reinterpret_cast<const __m128i&>(_g_));
5655							__m128i cC_r_b = _mm_and_si128(cC, reinterpret_cast<const __m128i&>(r_b));
5656							__m128i cC__g_ = _mm_and_si128(cC, reinterpret_cast<const __m128i&>(_g_));
5657							__m128i cD_r_b = _mm_and_si128(cD, reinterpret_cast<const __m128i&>(r_b));
5658							__m128i cD__g_ = _mm_and_si128(cD, reinterpret_cast<const __m128i&>(_g_));
5659							__m128i cE_r_b = _mm_and_si128(cE, reinterpret_cast<const __m128i&>(r_b));
5660							__m128i cE__g_ = _mm_and_si128(cE, reinterpret_cast<const __m128i&>(_g_));
5661							__m128i cF_r_b = _mm_and_si128(cF, reinterpret_cast<const __m128i&>(r_b));
5662							__m128i cF__g_ = _mm_and_si128(cF, reinterpret_cast<const __m128i&>(_g_));
5663
5664							c0 = _mm_avg_epu8(c0_r_b, c1_r_b);
5665							c2 = _mm_avg_epu8(c2_r_b, c3_r_b);
5666							c4 = _mm_avg_epu8(c4_r_b, c5_r_b);
5667							c6 = _mm_avg_epu8(c6_r_b, c7_r_b);
5668							c8 = _mm_avg_epu8(c8_r_b, c9_r_b);
5669							cA = _mm_avg_epu8(cA_r_b, cB_r_b);
5670							cC = _mm_avg_epu8(cC_r_b, cD_r_b);
5671							cE = _mm_avg_epu8(cE_r_b, cF_r_b);
5672							c0 = _mm_avg_epu8(c0, c2);
5673							c4 = _mm_avg_epu8(c4, c6);
5674							c8 = _mm_avg_epu8(c8, cA);
5675							cC = _mm_avg_epu8(cC, cE);
5676							c0 = _mm_avg_epu8(c0, c4);
5677							c8 = _mm_avg_epu8(c8, cC);
5678							c0 = _mm_avg_epu8(c0, c8);
5679							c0 = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
5680							c1 = _mm_avg_epu16(c0__g_, c1__g_);
5681							c3 = _mm_avg_epu16(c2__g_, c3__g_);
5682							c5 = _mm_avg_epu16(c4__g_, c5__g_);
5683							c7 = _mm_avg_epu16(c6__g_, c7__g_);
5684							c9 = _mm_avg_epu16(c8__g_, c9__g_);
5685							cB = _mm_avg_epu16(cA__g_, cB__g_);
5686							cD = _mm_avg_epu16(cC__g_, cD__g_);
5687							cF = _mm_avg_epu16(cE__g_, cF__g_);
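							// Green occupies bits 10..5 and straddles the byte boundary,
							// so the remaining reductions must also average in 16-bit lanes.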
5688							c1 = _mm_avg_epu16(c1, c3);
5689							c5 = _mm_avg_epu16(c5, c7);
5690							c9 = _mm_avg_epu16(c9, cB);
5691							cD = _mm_avg_epu16(cD, cF);
5692							c1 = _mm_avg_epu16(c1, c5);
5693							c9 = _mm_avg_epu16(c9, cD);
5694							c1 = _mm_avg_epu16(c1, c9);
5695							c1 = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
5696							c0 = _mm_or_si128(c0, c1);
5697
5698							_mm_store_si128((__m128i*)(source0 + 2 * x), c0);
5699						}
5700
5701						source0 += pitch;
5702						source1 += pitch;
5703						source2 += pitch;
5704						source3 += pitch;
5705						source4 += pitch;
5706						source5 += pitch;
5707						source6 += pitch;
5708						source7 += pitch;
5709						source8 += pitch;
5710						source9 += pitch;
5711						sourceA += pitch;
5712						sourceB += pitch;
5713						sourceC += pitch;
5714						sourceD += pitch;
5715						sourceE += pitch;
5716						sourceF += pitch;
5717					}
5718				}
5719				else ASSERT(false);
5720			}
5721			else
5722			{
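				// Rounding average of two 5:6:5 pixels without unpacking the fields:
				// (x & y) + ((x ^ y) >> 1) is the per-field floor average, masking with
				// 0x7BEF discards the bits shifted across field boundaries, and adding
				// (x ^ y) & 0x0821 rounds each field up, matching the pavgb/pavgw
				// rounding used in the SSE2 path.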
5723				#define AVERAGE(x, y) (((x) & (y)) + ((((x) ^ (y)) >> 1) & 0x7BEF) + (((x) ^ (y)) & 0x0821))
5724
5725				if(internal.depth == 2)
5726				{
5727					for(int y = 0; y < height; y++)
5728					{
5729						for(int x = 0; x < width; x++)
5730						{
5731							unsigned short c0 = *(unsigned short*)(source0 + 2 * x);
5732							unsigned short c1 = *(unsigned short*)(source1 + 2 * x);
5733
5734							c0 = AVERAGE(c0, c1);
5735
5736							*(unsigned short*)(source0 + 2 * x) = c0;
5737						}
5738
5739						source0 += pitch;
5740						source1 += pitch;
5741					}
5742				}
5743				else if(internal.depth == 4)
5744				{
5745					for(int y = 0; y < height; y++)
5746					{
5747						for(int x = 0; x < width; x++)
5748						{
5749							unsigned short c0 = *(unsigned short*)(source0 + 2 * x);
5750							unsigned short c1 = *(unsigned short*)(source1 + 2 * x);
5751							unsigned short c2 = *(unsigned short*)(source2 + 2 * x);
5752							unsigned short c3 = *(unsigned short*)(source3 + 2 * x);
5753
5754							c0 = AVERAGE(c0, c1);
5755							c2 = AVERAGE(c2, c3);
5756							c0 = AVERAGE(c0, c2);
5757
5758							*(unsigned short*)(source0 + 2 * x) = c0;
5759						}
5760
5761						source0 += pitch;
5762						source1 += pitch;
5763						source2 += pitch;
5764						source3 += pitch;
5765					}
5766				}
5767				else if(internal.depth == 8)
5768				{
5769					for(int y = 0; y < height; y++)
5770					{
5771						for(int x = 0; x < width; x++)
5772						{
5773							unsigned short c0 = *(unsigned short*)(source0 + 2 * x);
5774							unsigned short c1 = *(unsigned short*)(source1 + 2 * x);
5775							unsigned short c2 = *(unsigned short*)(source2 + 2 * x);
5776							unsigned short c3 = *(unsigned short*)(source3 + 2 * x);
5777							unsigned short c4 = *(unsigned short*)(source4 + 2 * x);
5778							unsigned short c5 = *(unsigned short*)(source5 + 2 * x);
5779							unsigned short c6 = *(unsigned short*)(source6 + 2 * x);
5780							unsigned short c7 = *(unsigned short*)(source7 + 2 * x);
5781
5782							c0 = AVERAGE(c0, c1);
5783							c2 = AVERAGE(c2, c3);
5784							c4 = AVERAGE(c4, c5);
5785							c6 = AVERAGE(c6, c7);
5786							c0 = AVERAGE(c0, c2);
5787							c4 = AVERAGE(c4, c6);
5788							c0 = AVERAGE(c0, c4);
5789
5790							*(unsigned short*)(source0 + 2 * x) = c0;
5791						}
5792
5793						source0 += pitch;
5794						source1 += pitch;
5795						source2 += pitch;
5796						source3 += pitch;
5797						source4 += pitch;
5798						source5 += pitch;
5799						source6 += pitch;
5800						source7 += pitch;
5801					}
5802				}
5803				else if(internal.depth == 16)
5804				{
5805					for(int y = 0; y < height; y++)
5806					{
5807						for(int x = 0; x < width; x++)
5808						{
5809							unsigned short c0 = *(unsigned short*)(source0 + 2 * x);
5810							unsigned short c1 = *(unsigned short*)(source1 + 2 * x);
5811							unsigned short c2 = *(unsigned short*)(source2 + 2 * x);
5812							unsigned short c3 = *(unsigned short*)(source3 + 2 * x);
5813							unsigned short c4 = *(unsigned short*)(source4 + 2 * x);
5814							unsigned short c5 = *(unsigned short*)(source5 + 2 * x);
5815							unsigned short c6 = *(unsigned short*)(source6 + 2 * x);
5816							unsigned short c7 = *(unsigned short*)(source7 + 2 * x);
5817							unsigned short c8 = *(unsigned short*)(source8 + 2 * x);
5818							unsigned short c9 = *(unsigned short*)(source9 + 2 * x);
5819							unsigned short cA = *(unsigned short*)(sourceA + 2 * x);
5820							unsigned short cB = *(unsigned short*)(sourceB + 2 * x);
5821							unsigned short cC = *(unsigned short*)(sourceC + 2 * x);
5822							unsigned short cD = *(unsigned short*)(sourceD + 2 * x);
5823							unsigned short cE = *(unsigned short*)(sourceE + 2 * x);
5824							unsigned short cF = *(unsigned short*)(sourceF + 2 * x);
5825
5826							c0 = AVERAGE(c0, c1);
5827							c2 = AVERAGE(c2, c3);
5828							c4 = AVERAGE(c4, c5);
5829							c6 = AVERAGE(c6, c7);
5830							c8 = AVERAGE(c8, c9);
5831							cA = AVERAGE(cA, cB);
5832							cC = AVERAGE(cC, cD);
5833							cE = AVERAGE(cE, cF);
5834							c0 = AVERAGE(c0, c2);
5835							c4 = AVERAGE(c4, c6);
5836							c8 = AVERAGE(c8, cA);
5837							cC = AVERAGE(cC, cE);
5838							c0 = AVERAGE(c0, c4);
5839							c8 = AVERAGE(c8, cC);
5840							c0 = AVERAGE(c0, c8);
5841
5842							*(unsigned short*)(source0 + 2 * x) = c0;
5843						}
5844
5845						source0 += pitch;
5846						source1 += pitch;
5847						source2 += pitch;
5848						source3 += pitch;
5849						source4 += pitch;
5850						source5 += pitch;
5851						source6 += pitch;
5852						source7 += pitch;
5853						source8 += pitch;
5854						source9 += pitch;
5855						sourceA += pitch;
5856						sourceB += pitch;
5857						sourceC += pitch;
5858						sourceD += pitch;
5859						sourceE += pitch;
5860						sourceF += pitch;
5861					}
5862				}
5863				else ASSERT(false);
5864
5865				#undef AVERAGE
5866			}
5867		}
5868		else
5869		{
5870		//	UNIMPLEMENTED();
5871		}
5872	}
5873}
5874