Surface.cpp revision dbd1a8e6cb781672840a5f7d53d162247a4dbe98
1// SwiftShader Software Renderer
2//
3// Copyright(c) 2005-2013 TransGaming Inc.
4//
5// All rights reserved. No part of this software may be copied, distributed, transmitted,
6// transcribed, stored in a retrieval system, translated into any human or computer
7// language by any means, or disclosed to third parties without the explicit written
8// agreement of TransGaming Inc. Without such an agreement, no rights or licenses, express
9// or implied, including but not limited to any patent rights, are granted to you.
10//
11
12#include "Surface.hpp"
13
14#include "Color.hpp"
15#include "Context.hpp"
16#include "ETC_Decoder.hpp"
17#include "Renderer.hpp"
18#include "Common/Half.hpp"
19#include "Common/Memory.hpp"
20#include "Common/CPUID.hpp"
21#include "Common/Resource.hpp"
22#include "Common/Debug.hpp"
23#include "Reactor/Reactor.hpp"
24
25#include <xmmintrin.h>
26#include <emmintrin.h>
27
28#undef min
29#undef max
30
31namespace sw
32{
33	extern bool quadLayoutEnabled;
34	extern bool complementaryDepthBuffer;
35	extern TranscendentalPrecision logPrecision;
36
37	unsigned int *Surface::palette = 0;
38	unsigned int Surface::paletteID = 0;
39
40	void Rect::clip(int minX, int minY, int maxX, int maxY)
41	{
42		x0 = clamp(x0, minX, maxX);
43		y0 = clamp(y0, minY, maxY);
44		x1 = clamp(x1, minX, maxX);
45		y1 = clamp(y1, minY, maxY);
46	}
47
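	// Writes a single texel. The element address is computed from the texel
	// coordinates using the per-element byte count, row pitch and slice pitch.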
48	void Surface::Buffer::write(int x, int y, int z, const Color<float> &color)
49	{
50		void *element = (unsigned char*)buffer + x * bytes + y * pitchB + z * sliceB;
51
52		write(element, color);
53	}
54
55	void Surface::Buffer::write(int x, int y, const Color<float> &color)
56	{
57		void *element = (unsigned char*)buffer + x * bytes + y * pitchB;
58
59		write(element, color);
60	}
61
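	// Encodes 'color' into this buffer's format at the given element address.
	// Normalized channels are converted with unorm<n>/snorm<n>, integer
	// channels with ucast<n>/scast<n>, and floating-point channels are stored
	// directly as half or float values.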
62	inline void Surface::Buffer::write(void *element, const Color<float> &color)
63	{
64		switch(format)
65		{
66		case FORMAT_A8:
67			*(unsigned char*)element = unorm<8>(color.a);
68			break;
69		case FORMAT_R8I_SNORM:
70			*(char*)element = snorm<8>(color.r);
71			break;
72		case FORMAT_R8:
73			*(unsigned char*)element = unorm<8>(color.r);
74			break;
75		case FORMAT_R8I:
76			*(char*)element = scast<8>(color.r);
77			break;
78		case FORMAT_R8UI:
79			*(unsigned char*)element = ucast<8>(color.r);
80			break;
81		case FORMAT_R16I:
82			*(short*)element = scast<16>(color.r);
83			break;
84		case FORMAT_R16UI:
85			*(unsigned short*)element = ucast<16>(color.r);
86			break;
87		case FORMAT_R32I:
88			*(int*)element = static_cast<int>(color.r);
89			break;
90		case FORMAT_R32UI:
91			*(unsigned int*)element = static_cast<unsigned int>(color.r);
92			break;
93		case FORMAT_R3G3B2:
94			*(unsigned char*)element = (unorm<3>(color.r) << 5) | (unorm<3>(color.g) << 2) | (unorm<2>(color.b) << 0);
95			break;
96		case FORMAT_A8R3G3B2:
97			*(unsigned short*)element = (unorm<8>(color.a) << 8) | (unorm<3>(color.r) << 5) | (unorm<3>(color.g) << 2) | (unorm<2>(color.b) << 0);
98			break;
99		case FORMAT_X4R4G4B4:
100			*(unsigned short*)element = 0xF000 | (unorm<4>(color.r) << 8) | (unorm<4>(color.g) << 4) | (unorm<4>(color.b) << 0);
101			break;
102		case FORMAT_A4R4G4B4:
103			*(unsigned short*)element = (unorm<4>(color.a) << 12) | (unorm<4>(color.r) << 8) | (unorm<4>(color.g) << 4) | (unorm<4>(color.b) << 0);
104			break;
105		case FORMAT_R4G4B4A4:
106			*(unsigned short*)element = (unorm<4>(color.r) << 12) | (unorm<4>(color.g) << 8) | (unorm<4>(color.b) << 4) | (unorm<4>(color.a) << 0);
107			break;
108		case FORMAT_R5G6B5:
109			*(unsigned short*)element = (unorm<5>(color.r) << 11) | (unorm<6>(color.g) << 5) | (unorm<5>(color.b) << 0);
110			break;
111		case FORMAT_A1R5G5B5:
112			*(unsigned short*)element = (unorm<1>(color.a) << 15) | (unorm<5>(color.r) << 10) | (unorm<5>(color.g) << 5) | (unorm<5>(color.b) << 0);
113			break;
114		case FORMAT_R5G5B5A1:
115			*(unsigned short*)element = (unorm<5>(color.r) << 11) | (unorm<5>(color.g) << 6) | (unorm<5>(color.b) << 1) | (unorm<1>(color.a) << 0);
116			break;
117		case FORMAT_X1R5G5B5:
118			*(unsigned short*)element = 0x8000 | (unorm<5>(color.r) << 10) | (unorm<5>(color.g) << 5) | (unorm<5>(color.b) << 0);
119			break;
120		case FORMAT_A8R8G8B8:
121			*(unsigned int*)element = (unorm<8>(color.a) << 24) | (unorm<8>(color.r) << 16) | (unorm<8>(color.g) << 8) | (unorm<8>(color.b) << 0);
122			break;
123		case FORMAT_X8R8G8B8:
124			*(unsigned int*)element = 0xFF000000 | (unorm<8>(color.r) << 16) | (unorm<8>(color.g) << 8) | (unorm<8>(color.b) << 0);
125			break;
126		case FORMAT_A8B8G8R8I_SNORM:
127			*(unsigned int*)element = (static_cast<unsigned int>(snorm<8>(color.a)) << 24) |
128			                          (static_cast<unsigned int>(snorm<8>(color.b)) << 16) |
129			                          (static_cast<unsigned int>(snorm<8>(color.g)) << 8) |
130			                          (static_cast<unsigned int>(snorm<8>(color.r)) << 0);
131			break;
132		case FORMAT_A8B8G8R8:
133			*(unsigned int*)element = (unorm<8>(color.a) << 24) | (unorm<8>(color.b) << 16) | (unorm<8>(color.g) << 8) | (unorm<8>(color.r) << 0);
134			break;
135		case FORMAT_A8B8G8R8I:
136			*(unsigned int*)element = (static_cast<unsigned int>(scast<8>(color.a)) << 24) |
137			                          (static_cast<unsigned int>(scast<8>(color.b)) << 16) |
138			                          (static_cast<unsigned int>(scast<8>(color.g)) << 8) |
139			                          (static_cast<unsigned int>(scast<8>(color.r)) << 0);
140			break;
141		case FORMAT_A8B8G8R8UI:
142			*(unsigned int*)element = (ucast<8>(color.a) << 24) | (ucast<8>(color.b) << 16) | (ucast<8>(color.g) << 8) | (ucast<8>(color.r) << 0);
143			break;
144		case FORMAT_X8B8G8R8I_SNORM:
145			*(unsigned int*)element = 0x7F000000 |
146			                          (static_cast<unsigned int>(snorm<8>(color.b)) << 16) |
147			                          (static_cast<unsigned int>(snorm<8>(color.g)) << 8) |
148			                          (static_cast<unsigned int>(snorm<8>(color.r)) << 0);
149			break;
150		case FORMAT_X8B8G8R8:
151			*(unsigned int*)element = 0xFF000000 | (unorm<8>(color.b) << 16) | (unorm<8>(color.g) << 8) | (unorm<8>(color.r) << 0);
152			break;
153		case FORMAT_X8B8G8R8I:
154			*(unsigned int*)element = 0x7F000000 |
155			                          (static_cast<unsigned int>(scast<8>(color.b)) << 16) |
156			                          (static_cast<unsigned int>(scast<8>(color.g)) << 8) |
157			                          (static_cast<unsigned int>(scast<8>(color.r)) << 0);
			break;
158		case FORMAT_X8B8G8R8UI:
159			*(unsigned int*)element = 0xFF000000 | (ucast<8>(color.b) << 16) | (ucast<8>(color.g) << 8) | (ucast<8>(color.r) << 0);
160			break;
161		case FORMAT_A2R10G10B10:
162			*(unsigned int*)element = (unorm<2>(color.a) << 30) | (unorm<10>(color.r) << 20) | (unorm<10>(color.g) << 10) | (unorm<10>(color.b) << 0);
163			break;
164		case FORMAT_A2B10G10R10:
165			*(unsigned int*)element = (unorm<2>(color.a) << 30) | (unorm<10>(color.b) << 20) | (unorm<10>(color.g) << 10) | (unorm<10>(color.r) << 0);
166			break;
167		case FORMAT_G8R8I_SNORM:
168			*(unsigned short*)element = (static_cast<unsigned short>(snorm<8>(color.g)) << 8) |
169			                            (static_cast<unsigned short>(snorm<8>(color.r)) << 0);
170			break;
171		case FORMAT_G8R8:
172			*(unsigned short*)element = (unorm<8>(color.g) << 8) | (unorm<8>(color.r) << 0);
173			break;
174		case FORMAT_G8R8I:
175			*(unsigned short*)element = (static_cast<unsigned short>(scast<8>(color.g)) << 8) |
176			                            (static_cast<unsigned short>(scast<8>(color.r)) << 0);
177			break;
178		case FORMAT_G8R8UI:
179			*(unsigned short*)element = (ucast<8>(color.g) << 8) | (ucast<8>(color.r) << 0);
180			break;
181		case FORMAT_G16R16:
182			*(unsigned int*)element = (unorm<16>(color.g) << 16) | (unorm<16>(color.r) << 0);
183			break;
184		case FORMAT_G16R16I:
185			*(unsigned int*)element = (static_cast<unsigned int>(scast<16>(color.g)) << 16) |
186			                          (static_cast<unsigned int>(scast<16>(color.r)) << 0);
187			break;
188		case FORMAT_G16R16UI:
189			*(unsigned int*)element = (ucast<16>(color.g) << 16) | (ucast<16>(color.r) << 0);
190			break;
191		case FORMAT_G32R32I:
192		case FORMAT_G32R32UI:
193			((unsigned int*)element)[0] = static_cast<unsigned int>(color.r);
194			((unsigned int*)element)[1] = static_cast<unsigned int>(color.g);
195			break;
196		case FORMAT_A16B16G16R16:
197			((unsigned short*)element)[0] = unorm<16>(color.r);
198			((unsigned short*)element)[1] = unorm<16>(color.g);
199			((unsigned short*)element)[2] = unorm<16>(color.b);
200			((unsigned short*)element)[3] = unorm<16>(color.a);
201			break;
202		case FORMAT_A16B16G16R16I:
203			((unsigned short*)element)[0] = static_cast<unsigned short>(scast<16>(color.r));
204			((unsigned short*)element)[1] = static_cast<unsigned short>(scast<16>(color.g));
205			((unsigned short*)element)[2] = static_cast<unsigned short>(scast<16>(color.b));
206			((unsigned short*)element)[3] = static_cast<unsigned short>(scast<16>(color.a));
207			break;
208		case FORMAT_A16B16G16R16UI:
209			((unsigned short*)element)[0] = static_cast<unsigned short>(ucast<16>(color.r));
210			((unsigned short*)element)[1] = static_cast<unsigned short>(ucast<16>(color.g));
211			((unsigned short*)element)[2] = static_cast<unsigned short>(ucast<16>(color.b));
212			((unsigned short*)element)[3] = static_cast<unsigned short>(ucast<16>(color.a));
213			break;
214		case FORMAT_X16B16G16R16I:
215			((unsigned short*)element)[0] = static_cast<unsigned short>(scast<16>(color.r));
216			((unsigned short*)element)[1] = static_cast<unsigned short>(scast<16>(color.g));
217			((unsigned short*)element)[2] = static_cast<unsigned short>(scast<16>(color.b));
218			break;
219		case FORMAT_X16B16G16R16UI:
220			((unsigned short*)element)[0] = static_cast<unsigned short>(ucast<16>(color.r));
221			((unsigned short*)element)[1] = static_cast<unsigned short>(ucast<16>(color.g));
222			((unsigned short*)element)[2] = static_cast<unsigned short>(ucast<16>(color.b));
223			break;
224		case FORMAT_A32B32G32R32I:
225		case FORMAT_A32B32G32R32UI:
226			((unsigned int*)element)[0] = static_cast<unsigned int>(color.r);
227			((unsigned int*)element)[1] = static_cast<unsigned int>(color.g);
228			((unsigned int*)element)[2] = static_cast<unsigned int>(color.b);
229			((unsigned int*)element)[3] = static_cast<unsigned int>(color.a);
230			break;
231		case FORMAT_X32B32G32R32I:
232		case FORMAT_X32B32G32R32UI:
233			((unsigned int*)element)[0] = static_cast<unsigned int>(color.r);
234			((unsigned int*)element)[1] = static_cast<unsigned int>(color.g);
235			((unsigned int*)element)[2] = static_cast<unsigned int>(color.b);
236			break;
237		case FORMAT_V8U8:
238			*(unsigned short*)element = (snorm<8>(color.g) << 8) | (snorm<8>(color.r) << 0);
239			break;
240		case FORMAT_L6V5U5:
241			*(unsigned short*)element = (unorm<6>(color.b) << 10) | (snorm<5>(color.g) << 5) | (snorm<5>(color.r) << 0);
242			break;
243		case FORMAT_Q8W8V8U8:
244			*(unsigned int*)element = (snorm<8>(color.a) << 24) | (snorm<8>(color.b) << 16) | (snorm<8>(color.g) << 8) | (snorm<8>(color.r) << 0);
245			break;
246		case FORMAT_X8L8V8U8:
247			*(unsigned int*)element = 0xFF000000 | (unorm<8>(color.b) << 16) | (snorm<8>(color.g) << 8) | (snorm<8>(color.r) << 0);
248			break;
249		case FORMAT_V16U16:
250			*(unsigned int*)element = (snorm<16>(color.g) << 16) | (snorm<16>(color.r) << 0);
251			break;
252		case FORMAT_A2W10V10U10:
253			*(unsigned int*)element = (unorm<2>(color.a) << 30) | (snorm<10>(color.b) << 20) | (snorm<10>(color.g) << 10) | (snorm<10>(color.r) << 0);
254			break;
255		case FORMAT_A16W16V16U16:
256			((unsigned short*)element)[0] = snorm<16>(color.r);
257			((unsigned short*)element)[1] = snorm<16>(color.g);
258			((unsigned short*)element)[2] = snorm<16>(color.b);
259			((unsigned short*)element)[3] = unorm<16>(color.a);
260			break;
261		case FORMAT_Q16W16V16U16:
262			((unsigned short*)element)[0] = snorm<16>(color.r);
263			((unsigned short*)element)[1] = snorm<16>(color.g);
264			((unsigned short*)element)[2] = snorm<16>(color.b);
265			((unsigned short*)element)[3] = snorm<16>(color.a);
266			break;
267		case FORMAT_R8G8B8:
268			((unsigned char*)element)[0] = unorm<8>(color.b);
269			((unsigned char*)element)[1] = unorm<8>(color.g);
270			((unsigned char*)element)[2] = unorm<8>(color.r);
271			break;
272		case FORMAT_B8G8R8:
273			((unsigned char*)element)[0] = unorm<8>(color.r);
274			((unsigned char*)element)[1] = unorm<8>(color.g);
275			((unsigned char*)element)[2] = unorm<8>(color.b);
276			break;
277		case FORMAT_R16F:
278			*(half*)element = (half)color.r;
279			break;
280		case FORMAT_A16F:
281			*(half*)element = (half)color.a;
282			break;
283		case FORMAT_G16R16F:
284			((half*)element)[0] = (half)color.r;
285			((half*)element)[1] = (half)color.g;
286			break;
287		case FORMAT_B16G16R16F:
288			((half*)element)[0] = (half)color.r;
289			((half*)element)[1] = (half)color.g;
290			((half*)element)[2] = (half)color.b;
291			break;
292		case FORMAT_A16B16G16R16F:
293			((half*)element)[0] = (half)color.r;
294			((half*)element)[1] = (half)color.g;
295			((half*)element)[2] = (half)color.b;
296			((half*)element)[3] = (half)color.a;
297			break;
298		case FORMAT_A32F:
299			*(float*)element = color.a;
300			break;
301		case FORMAT_R32F:
302			*(float*)element = color.r;
303			break;
304		case FORMAT_G32R32F:
305			((float*)element)[0] = color.r;
306			((float*)element)[1] = color.g;
307			break;
308		case FORMAT_X32B32G32R32F:
309			((float*)element)[3] = 1.0f;
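			// Fall through: the RGB components are written by the B32G32R32F case below.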
310		case FORMAT_B32G32R32F:
311			((float*)element)[0] = color.r;
312			((float*)element)[1] = color.g;
313			((float*)element)[2] = color.b;
314			break;
315		case FORMAT_A32B32G32R32F:
316			((float*)element)[0] = color.r;
317			((float*)element)[1] = color.g;
318			((float*)element)[2] = color.b;
319			((float*)element)[3] = color.a;
320			break;
321		case FORMAT_D32F:
322		case FORMAT_D32F_LOCKABLE:
323		case FORMAT_D32FS8_TEXTURE:
324		case FORMAT_D32FS8_SHADOW:
325			*((float*)element) = color.r;
326			break;
327		case FORMAT_D32F_COMPLEMENTARY:
328			*((float*)element) = 1 - color.r;
329			break;
330		case FORMAT_S8:
331			*((unsigned char*)element) = unorm<8>(color.r);
332			break;
333		case FORMAT_L8:
334			*(unsigned char*)element = unorm<8>(color.r);
335			break;
336		case FORMAT_A4L4:
337			*(unsigned char*)element = (unorm<4>(color.a) << 4) | (unorm<4>(color.r) << 0);
338			break;
339		case FORMAT_L16:
340			*(unsigned short*)element = unorm<16>(color.r);
341			break;
342		case FORMAT_A8L8:
343			*(unsigned short*)element = (unorm<8>(color.a) << 8) | (unorm<8>(color.r) << 0);
344			break;
345		case FORMAT_L16F:
346			*(half*)element = (half)color.r;
347			break;
348		case FORMAT_A16L16F:
349			((half*)element)[0] = (half)color.r;
350			((half*)element)[1] = (half)color.a;
351			break;
352		case FORMAT_L32F:
353			*(float*)element = color.r;
354			break;
355		case FORMAT_A32L32F:
356			((float*)element)[0] = color.r;
357			((float*)element)[1] = color.a;
358			break;
359		default:
360			ASSERT(false);
361		}
362	}
363
364	Color<float> Surface::Buffer::read(int x, int y, int z) const
365	{
366		void *element = (unsigned char*)buffer + x * bytes + y * pitchB + z * sliceB;
367
368		return read(element);
369	}
370
371	Color<float> Surface::Buffer::read(int x, int y) const
372	{
373		void *element = (unsigned char*)buffer + x * bytes + y * pitchB;
374
375		return read(element);
376	}
377
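	// Decodes the element at the given address into a floating-point color.
	// Channels not present in the format keep their defaults (r = g = b = 0,
	// a = 1); P8 and A8P8 look up the shared Surface::palette.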
378	inline Color<float> Surface::Buffer::read(void *element) const
379	{
380		float r = 0.0f;
381		float g = 0.0f;
382		float b = 0.0f;
383		float a = 1.0f;
384
385		switch(format)
386		{
387		case FORMAT_P8:
388			{
389				ASSERT(palette);
390
391				unsigned int abgr = palette[*(unsigned char*)element];
392
393				r = (abgr & 0x000000FF) * (1.0f / 0x000000FF);
394				g = (abgr & 0x0000FF00) * (1.0f / 0x0000FF00);
395				b = (abgr & 0x00FF0000) * (1.0f / 0x00FF0000);
396				a = (abgr & 0xFF000000) * (1.0f / 0xFF000000);
397			}
398			break;
399		case FORMAT_A8P8:
400			{
401				ASSERT(palette);
402
403				unsigned int bgr = palette[((unsigned char*)element)[0]];
404
405				r = (bgr & 0x000000FF) * (1.0f / 0x000000FF);
406				g = (bgr & 0x0000FF00) * (1.0f / 0x0000FF00);
407				b = (bgr & 0x00FF0000) * (1.0f / 0x00FF0000);
408				a = ((unsigned char*)element)[1] * (1.0f / 0xFF);
409			}
410			break;
411		case FORMAT_A8:
412			r = 0;
413			g = 0;
414			b = 0;
415			a = *(unsigned char*)element * (1.0f / 0xFF);
416			break;
417		case FORMAT_R8I_SNORM:
418			r = max((*(signed char*)element) * (1.0f / 0x7F), -1.0f);
419			break;
420		case FORMAT_R8:
421			r = *(unsigned char*)element * (1.0f / 0xFF);
422			break;
423		case FORMAT_R8I:
424			r = *(signed char*)element;
425			break;
426		case FORMAT_R8UI:
427			r = *(unsigned char*)element;
428			break;
429		case FORMAT_R3G3B2:
430			{
431				unsigned char rgb = *(unsigned char*)element;
432
433				r = (rgb & 0xE0) * (1.0f / 0xE0);
434				g = (rgb & 0x1C) * (1.0f / 0x1C);
435				b = (rgb & 0x03) * (1.0f / 0x03);
436			}
437			break;
438		case FORMAT_A8R3G3B2:
439			{
440				unsigned short argb = *(unsigned short*)element;
441
442				a = (argb & 0xFF00) * (1.0f / 0xFF00);
443				r = (argb & 0x00E0) * (1.0f / 0x00E0);
444				g = (argb & 0x001C) * (1.0f / 0x001C);
445				b = (argb & 0x0003) * (1.0f / 0x0003);
446			}
447			break;
448		case FORMAT_X4R4G4B4:
449			{
450				unsigned short rgb = *(unsigned short*)element;
451
452				r = (rgb & 0x0F00) * (1.0f / 0x0F00);
453				g = (rgb & 0x00F0) * (1.0f / 0x00F0);
454				b = (rgb & 0x000F) * (1.0f / 0x000F);
455			}
456			break;
457		case FORMAT_A4R4G4B4:
458			{
459				unsigned short argb = *(unsigned short*)element;
460
461				a = (argb & 0xF000) * (1.0f / 0xF000);
462				r = (argb & 0x0F00) * (1.0f / 0x0F00);
463				g = (argb & 0x00F0) * (1.0f / 0x00F0);
464				b = (argb & 0x000F) * (1.0f / 0x000F);
465			}
466			break;
467		case FORMAT_R4G4B4A4:
468			{
469				unsigned short rgba = *(unsigned short*)element;
470
471				r = (rgba & 0xF000) * (1.0f / 0xF000);
472				g = (rgba & 0x0F00) * (1.0f / 0x0F00);
473				b = (rgba & 0x00F0) * (1.0f / 0x00F0);
474				a = (rgba & 0x000F) * (1.0f / 0x000F);
475			}
476			break;
477		case FORMAT_R5G6B5:
478			{
479				unsigned short rgb = *(unsigned short*)element;
480
481				r = (rgb & 0xF800) * (1.0f / 0xF800);
482				g = (rgb & 0x07E0) * (1.0f / 0x07E0);
483				b = (rgb & 0x001F) * (1.0f / 0x001F);
484			}
485			break;
486		case FORMAT_A1R5G5B5:
487			{
488				unsigned short argb = *(unsigned short*)element;
489
490				a = (argb & 0x8000) * (1.0f / 0x8000);
491				r = (argb & 0x7C00) * (1.0f / 0x7C00);
492				g = (argb & 0x03E0) * (1.0f / 0x03E0);
493				b = (argb & 0x001F) * (1.0f / 0x001F);
494			}
495			break;
496		case FORMAT_R5G5B5A1:
497			{
498				unsigned short rgba = *(unsigned short*)element;
499
500				r = (rgba & 0xF800) * (1.0f / 0xF800);
501				g = (rgba & 0x07C0) * (1.0f / 0x07C0);
502				b = (rgba & 0x003E) * (1.0f / 0x003E);
503				a = (rgba & 0x0001) * (1.0f / 0x0001);
504			}
505			break;
506		case FORMAT_X1R5G5B5:
507			{
508				unsigned short xrgb = *(unsigned short*)element;
509
510				r = (xrgb & 0x7C00) * (1.0f / 0x7C00);
511				g = (xrgb & 0x03E0) * (1.0f / 0x03E0);
512				b = (xrgb & 0x001F) * (1.0f / 0x001F);
513			}
514			break;
515		case FORMAT_A8R8G8B8:
516			{
517				unsigned int argb = *(unsigned int*)element;
518
519				a = (argb & 0xFF000000) * (1.0f / 0xFF000000);
520				r = (argb & 0x00FF0000) * (1.0f / 0x00FF0000);
521				g = (argb & 0x0000FF00) * (1.0f / 0x0000FF00);
522				b = (argb & 0x000000FF) * (1.0f / 0x000000FF);
523			}
524			break;
525		case FORMAT_X8R8G8B8:
526			{
527				unsigned int xrgb = *(unsigned int*)element;
528
529				r = (xrgb & 0x00FF0000) * (1.0f / 0x00FF0000);
530				g = (xrgb & 0x0000FF00) * (1.0f / 0x0000FF00);
531				b = (xrgb & 0x000000FF) * (1.0f / 0x000000FF);
532			}
533			break;
534		case FORMAT_A8B8G8R8I_SNORM:
535			{
536				signed char* abgr = (signed char*)element;
537
538				r = max(abgr[0] * (1.0f / 0x7F), -1.0f);
539				g = max(abgr[1] * (1.0f / 0x7F), -1.0f);
540				b = max(abgr[2] * (1.0f / 0x7F), -1.0f);
541				a = max(abgr[3] * (1.0f / 0x7F), -1.0f);
542			}
543			break;
544		case FORMAT_A8B8G8R8:
545			{
546				unsigned int abgr = *(unsigned int*)element;
547
548				a = (abgr & 0xFF000000) * (1.0f / 0xFF000000);
549				b = (abgr & 0x00FF0000) * (1.0f / 0x00FF0000);
550				g = (abgr & 0x0000FF00) * (1.0f / 0x0000FF00);
551				r = (abgr & 0x000000FF) * (1.0f / 0x000000FF);
552			}
553			break;
554		case FORMAT_A8B8G8R8I:
555			{
556				signed char* abgr = (signed char*)element;
557
558				r = abgr[0];
559				g = abgr[1];
560				b = abgr[2];
561				a = abgr[3];
562			}
563			break;
564		case FORMAT_A8B8G8R8UI:
565			{
566				unsigned char* abgr = (unsigned char*)element;
567
568				r = abgr[0];
569				g = abgr[1];
570				b = abgr[2];
571				a = abgr[3];
572			}
573			break;
574		case FORMAT_X8B8G8R8I_SNORM:
575			{
576				signed char* bgr = (signed char*)element;
577
578				r = max(bgr[0] * (1.0f / 0x7F), -1.0f);
579				g = max(bgr[1] * (1.0f / 0x7F), -1.0f);
580				b = max(bgr[2] * (1.0f / 0x7F), -1.0f);
581			}
582			break;
583		case FORMAT_X8B8G8R8:
584			{
585				unsigned int xbgr = *(unsigned int*)element;
586
587				b = (xbgr & 0x00FF0000) * (1.0f / 0x00FF0000);
588				g = (xbgr & 0x0000FF00) * (1.0f / 0x0000FF00);
589				r = (xbgr & 0x000000FF) * (1.0f / 0x000000FF);
590			}
591			break;
592		case FORMAT_X8B8G8R8I:
593			{
594				signed char* bgr = (signed char*)element;
595
596				r = bgr[0];
597				g = bgr[1];
598				b = bgr[2];
599			}
600			break;
601		case FORMAT_X8B8G8R8UI:
602			{
603				unsigned char* bgr = (unsigned char*)element;
604
605				r = bgr[0];
606				g = bgr[1];
607				b = bgr[2];
608			}
609			break;
610		case FORMAT_G8R8I_SNORM:
611			{
612				signed char* gr = (signed char*)element;
613
614				r = (gr[0] & 0xFF00) * (1.0f / 0xFF00);
615				g = (gr[1] & 0x00FF) * (1.0f / 0x00FF);
616			}
617			break;
618		case FORMAT_G8R8:
619			{
620				unsigned short gr = *(unsigned short*)element;
621
622				g = (gr & 0xFF00) * (1.0f / 0xFF00);
623				r = (gr & 0x00FF) * (1.0f / 0x00FF);
624			}
625			break;
626		case FORMAT_G8R8I:
627			{
628				signed char* gr = (signed char*)element;
629
630				r = gr[0];
631				g = gr[1];
632			}
633			break;
634		case FORMAT_G8R8UI:
635			{
636				unsigned char* gr = (unsigned char*)element;
637
638				r = gr[0];
639				g = gr[1];
640			}
641			break;
642		case FORMAT_R16I:
643			r = *((short*)element);
644			break;
645		case FORMAT_R16UI:
646			r = *((unsigned short*)element);
647			break;
648		case FORMAT_G16R16I:
649			{
650				short* gr = (short*)element;
651
652				r = gr[0];
653				g = gr[1];
654			}
655			break;
656		case FORMAT_G16R16:
657			{
658				unsigned int gr = *(unsigned int*)element;
659
660				g = (gr & 0xFFFF0000) * (1.0f / 0xFFFF0000);
661				r = (gr & 0x0000FFFF) * (1.0f / 0x0000FFFF);
662			}
663			break;
664		case FORMAT_G16R16UI:
665			{
666				unsigned short* gr = (unsigned short*)element;
667
668				r = gr[0];
669				g = gr[1];
670			}
671			break;
672		case FORMAT_A2R10G10B10:
673			{
674				unsigned int argb = *(unsigned int*)element;
675
676				a = (argb & 0xC0000000) * (1.0f / 0xC0000000);
677				r = (argb & 0x3FF00000) * (1.0f / 0x3FF00000);
678				g = (argb & 0x000FFC00) * (1.0f / 0x000FFC00);
679				b = (argb & 0x000003FF) * (1.0f / 0x000003FF);
680			}
681			break;
682		case FORMAT_A2B10G10R10:
683			{
684				unsigned int abgr = *(unsigned int*)element;
685
686				a = (abgr & 0xC0000000) * (1.0f / 0xC0000000);
687				b = (abgr & 0x3FF00000) * (1.0f / 0x3FF00000);
688				g = (abgr & 0x000FFC00) * (1.0f / 0x000FFC00);
689				r = (abgr & 0x000003FF) * (1.0f / 0x000003FF);
690			}
691			break;
692		case FORMAT_A16B16G16R16I:
693			{
694				short* abgr = (short*)element;
695
696				r = abgr[0];
697				g = abgr[1];
698				b = abgr[2];
699				a = abgr[3];
700			}
701			break;
702		case FORMAT_A16B16G16R16:
703			r = ((unsigned short*)element)[0] * (1.0f / 0xFFFF);
704			g = ((unsigned short*)element)[1] * (1.0f / 0xFFFF);
705			b = ((unsigned short*)element)[2] * (1.0f / 0xFFFF);
706			a = ((unsigned short*)element)[3] * (1.0f / 0xFFFF);
707			break;
708		case FORMAT_A16B16G16R16UI:
709			{
710				unsigned short* abgr = (unsigned short*)element;
711
712				r = abgr[0];
713				g = abgr[1];
714				b = abgr[2];
715				a = abgr[3];
716			}
717			break;
718		case FORMAT_X16B16G16R16I:
719			{
720				short* bgr = (short*)element;
721
722				r = bgr[0];
723				g = bgr[1];
724				b = bgr[2];
725			}
726			break;
727		case FORMAT_X16B16G16R16UI:
728			{
729				unsigned short* bgr = (unsigned short*)element;
730
731				r = bgr[0];
732				g = bgr[1];
733				b = bgr[2];
734			}
735			break;
736		case FORMAT_A32B32G32R32I:
737			{
738				int* abgr = (int*)element;
739
740				r = static_cast<float>(abgr[0]);
741				g = static_cast<float>(abgr[1]);
742				b = static_cast<float>(abgr[2]);
743				a = static_cast<float>(abgr[3]);
744			}
745			break;
746		case FORMAT_A32B32G32R32UI:
747			{
748				unsigned int* abgr = (unsigned int*)element;
749
750				r = static_cast<float>(abgr[0]);
751				g = static_cast<float>(abgr[1]);
752				b = static_cast<float>(abgr[2]);
753				a = static_cast<float>(abgr[3]);
754			}
755			break;
756		case FORMAT_X32B32G32R32I:
757			{
758				int* bgr = (int*)element;
759
760				r = static_cast<float>(bgr[0]);
761				g = static_cast<float>(bgr[1]);
762				b = static_cast<float>(bgr[2]);
763			}
764			break;
765		case FORMAT_X32B32G32R32UI:
766			{
767				unsigned int* bgr = (unsigned int*)element;
768
769				r = static_cast<float>(bgr[0]);
770				g = static_cast<float>(bgr[1]);
771				b = static_cast<float>(bgr[2]);
772			}
773			break;
774		case FORMAT_G32R32I:
775			{
776				int* gr = (int*)element;
777
778				r = static_cast<float>(gr[0]);
779				g = static_cast<float>(gr[1]);
780			}
781			break;
782		case FORMAT_G32R32UI:
783			{
784				unsigned int* gr = (unsigned int*)element;
785
786				r = static_cast<float>(gr[0]);
787				g = static_cast<float>(gr[1]);
788			}
789			break;
790		case FORMAT_R32I:
791			r = static_cast<float>(*((int*)element));
792			break;
793		case FORMAT_R32UI:
794			r = static_cast<float>(*((unsigned int*)element));
795			break;
796		case FORMAT_V8U8:
797			{
798				unsigned short vu = *(unsigned short*)element;
799
800				r = ((int)(vu & 0x00FF) << 24) * (1.0f / 0x7F000000);
801				g = ((int)(vu & 0xFF00) << 16) * (1.0f / 0x7F000000);
802			}
803			break;
804		case FORMAT_L6V5U5:
805			{
806				unsigned short lvu = *(unsigned short*)element;
807
808				r = ((int)(lvu & 0x001F) << 27) * (1.0f / 0x78000000);
809				g = ((int)(lvu & 0x03E0) << 22) * (1.0f / 0x78000000);
810				b = (lvu & 0xFC00) * (1.0f / 0xFC00);
811			}
812			break;
813		case FORMAT_Q8W8V8U8:
814			{
815				unsigned int qwvu = *(unsigned int*)element;
816
817				r = ((int)(qwvu & 0x000000FF) << 24) * (1.0f / 0x7F000000);
818				g = ((int)(qwvu & 0x0000FF00) << 16) * (1.0f / 0x7F000000);
819				b = ((int)(qwvu & 0x00FF0000) << 8)  * (1.0f / 0x7F000000);
820				a = ((int)(qwvu & 0xFF000000) << 0)  * (1.0f / 0x7F000000);
821			}
822			break;
823		case FORMAT_X8L8V8U8:
824			{
825				unsigned int xlvu = *(unsigned int*)element;
826
827				r = ((int)(xlvu & 0x000000FF) << 24) * (1.0f / 0x7F000000);
828				g = ((int)(xlvu & 0x0000FF00) << 16) * (1.0f / 0x7F000000);
829				b = (xlvu & 0x00FF0000) * (1.0f / 0x00FF0000);
830			}
831			break;
832		case FORMAT_R8G8B8:
833			r = ((unsigned char*)element)[2] * (1.0f / 0xFF);
834			g = ((unsigned char*)element)[1] * (1.0f / 0xFF);
835			b = ((unsigned char*)element)[0] * (1.0f / 0xFF);
836			break;
837		case FORMAT_B8G8R8:
838			r = ((unsigned char*)element)[0] * (1.0f / 0xFF);
839			g = ((unsigned char*)element)[1] * (1.0f / 0xFF);
840			b = ((unsigned char*)element)[2] * (1.0f / 0xFF);
841			break;
842		case FORMAT_V16U16:
843			{
844				unsigned int vu = *(unsigned int*)element;
845
846				r = ((int)(vu & 0x0000FFFF) << 16) * (1.0f / 0x7FFF0000);
847				g = ((int)(vu & 0xFFFF0000) << 0)  * (1.0f / 0x7FFF0000);
848			}
849			break;
850		case FORMAT_A2W10V10U10:
851			{
852				unsigned int awvu = *(unsigned int*)element;
853
854				r = ((int)(awvu & 0x000003FF) << 22) * (1.0f / 0x7FC00000);
855				g = ((int)(awvu & 0x000FFC00) << 12) * (1.0f / 0x7FC00000);
856				b = ((int)(awvu & 0x3FF00000) << 2)  * (1.0f / 0x7FC00000);
857				a = (awvu & 0xC0000000) * (1.0f / 0xC0000000);
858			}
859			break;
860		case FORMAT_A16W16V16U16:
861			r = ((signed short*)element)[0] * (1.0f / 0x7FFF);
862			g = ((signed short*)element)[1] * (1.0f / 0x7FFF);
863			b = ((signed short*)element)[2] * (1.0f / 0x7FFF);
864			a = ((unsigned short*)element)[3] * (1.0f / 0xFFFF);
865			break;
866		case FORMAT_Q16W16V16U16:
867			r = ((signed short*)element)[0] * (1.0f / 0x7FFF);
868			g = ((signed short*)element)[1] * (1.0f / 0x7FFF);
869			b = ((signed short*)element)[2] * (1.0f / 0x7FFF);
870			a = ((signed short*)element)[3] * (1.0f / 0x7FFF);
871			break;
872		case FORMAT_L8:
873			r =
874			g =
875			b = *(unsigned char*)element * (1.0f / 0xFF);
876			break;
877		case FORMAT_A4L4:
878			{
879				unsigned char al = *(unsigned char*)element;
880
881				r =
882				g =
883				b = (al & 0x0F) * (1.0f / 0x0F);
884				a = (al & 0xF0) * (1.0f / 0xF0);
885			}
886			break;
887		case FORMAT_L16:
888			r =
889			g =
890			b = *(unsigned short*)element * (1.0f / 0xFFFF);
891			break;
892		case FORMAT_A8L8:
893			r =
894			g =
895			b = ((unsigned char*)element)[0] * (1.0f / 0xFF);
896			a = ((unsigned char*)element)[1] * (1.0f / 0xFF);
897			break;
898		case FORMAT_L16F:
899			r =
900			g =
901			b = *(half*)element;
902			break;
903		case FORMAT_A16L16F:
904			r =
905			g =
906			b = ((half*)element)[0];
907			a = ((half*)element)[1];
908			break;
909		case FORMAT_L32F:
910			r =
911			g =
912			b = *(float*)element;
913			break;
914		case FORMAT_A32L32F:
915			r =
916			g =
917			b = ((float*)element)[0];
918			a = ((float*)element)[1];
919			break;
920		case FORMAT_A16F:
921			a = *(half*)element;
922			break;
923		case FORMAT_R16F:
924			r = *(half*)element;
925			break;
926		case FORMAT_G16R16F:
927			r = ((half*)element)[0];
928			g = ((half*)element)[1];
929			break;
930		case FORMAT_B16G16R16F:
931			r = ((half*)element)[0];
932			g = ((half*)element)[1];
933			b = ((half*)element)[2];
934			break;
935		case FORMAT_A16B16G16R16F:
936			r = ((half*)element)[0];
937			g = ((half*)element)[1];
938			b = ((half*)element)[2];
939			a = ((half*)element)[3];
940			break;
941		case FORMAT_A32F:
942			a = *(float*)element;
943			break;
944		case FORMAT_R32F:
945			r = *(float*)element;
946			break;
947		case FORMAT_G32R32F:
948			r = ((float*)element)[0];
949			g = ((float*)element)[1];
950			break;
951		case FORMAT_X32B32G32R32F:
952		case FORMAT_B32G32R32F:
953			r = ((float*)element)[0];
954			g = ((float*)element)[1];
955			b = ((float*)element)[2];
956			break;
957		case FORMAT_A32B32G32R32F:
958			r = ((float*)element)[0];
959			g = ((float*)element)[1];
960			b = ((float*)element)[2];
961			a = ((float*)element)[3];
962			break;
963		case FORMAT_D32F:
964		case FORMAT_D32F_LOCKABLE:
965		case FORMAT_D32FS8_TEXTURE:
966		case FORMAT_D32FS8_SHADOW:
967			r = *(float*)element;
968			g = r;
969			b = r;
970			a = r;
971			break;
972		case FORMAT_D32F_COMPLEMENTARY:
973			r = 1.0f - *(float*)element;
974			g = r;
975			b = r;
976			a = r;
977			break;
978		case FORMAT_S8:
979			r = *(unsigned char*)element * (1.0f / 0xFF);
980			break;
981		default:
982			ASSERT(false);
983		}
984
985	//	if(sRGB)
986	//	{
987	//		r = sRGBtoLinear(r);
988	//		g = sRGBtoLinear(g);
989	//		b = sRGBtoLinear(b);
990	//	}
991
992		return Color<float>(r, g, b, a);
993	}
994
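	// Samples the buffer at a floating-point coordinate using trilinear
	// interpolation. The half-texel offset makes integer coordinates address
	// texel centers; coordinates are clamped to the buffer's dimensions.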
995	Color<float> Surface::Buffer::sample(float x, float y, float z) const
996	{
997		x -= 0.5f;
998		y -= 0.5f;
999		z -= 0.5f;
1000
1001		int x0 = clamp((int)x, 0, width - 1);
1002		int x1 = (x0 + 1 >= width) ? x0 : x0 + 1;
1003
1004		int y0 = clamp((int)y, 0, height - 1);
1005		int y1 = (y0 + 1 >= height) ? y0 : y0 + 1;
1006
1007		int z0 = clamp((int)z, 0, depth - 1);
1008		int z1 = (z0 + 1 >= depth) ? z0 : z0 + 1;
1009
1010		Color<float> c000 = read(x0, y0, z0);
1011		Color<float> c100 = read(x1, y0, z0);
1012		Color<float> c010 = read(x0, y1, z0);
1013		Color<float> c110 = read(x1, y1, z0);
1014		Color<float> c001 = read(x0, y0, z1);
1015		Color<float> c101 = read(x1, y0, z1);
1016		Color<float> c011 = read(x0, y1, z1);
1017		Color<float> c111 = read(x1, y1, z1);
1018
1019		float fx = x - x0;
1020		float fy = y - y0;
1021		float fz = z - z0;
1022
1023		c000 *= (1 - fx) * (1 - fy) * (1 - fz);
1024		c100 *= fx * (1 - fy) * (1 - fz);
1025		c010 *= (1 - fx) * fy * (1 - fz);
1026		c110 *= fx * fy * (1 - fz);
1027		c001 *= (1 - fx) * (1 - fy) * fz;
1028		c101 *= fx * (1 - fy) * fz;
1029		c011 *= (1 - fx) * fy * fz;
1030		c111 *= fx * fy * fz;
1031
1032		return c000 + c100 + c010 + c110 + c001 + c101 + c011 + c111;
1033	}
1034
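	// 2D variant: bilinear interpolation between the four nearest texels.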
1035	Color<float> Surface::Buffer::sample(float x, float y) const
1036	{
1037		x -= 0.5f;
1038		y -= 0.5f;
1039
1040		int x0 = clamp((int)x, 0, width - 1);
1041		int x1 = (x0 + 1 >= width) ? x0 : x0 + 1;
1042
1043		int y0 = clamp((int)y, 0, height - 1);
1044		int y1 = (y0 + 1 >= height) ? y0 : y0 + 1;
1045
1046		Color<float> c00 = read(x0, y0);
1047		Color<float> c10 = read(x1, y0);
1048		Color<float> c01 = read(x0, y1);
1049		Color<float> c11 = read(x1, y1);
1050
1051		float fx = x - x0;
1052		float fy = y - y0;
1053
1054		c00 *= (1 - fx) * (1 - fy);
1055		c10 *= fx * (1 - fy);
1056		c01 *= (1 - fx) * fy;
1057		c11 *= fx * fy;
1058
1059		return c00 + c10 + c01 + c11;
1060	}
1061
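	// Records the lock mode (marking the buffer dirty for writable locks) and
	// returns the address of the requested texel. For block-compressed formats
	// the returned pointer addresses the block containing (x, y).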
1062	void *Surface::Buffer::lockRect(int x, int y, int z, Lock lock)
1063	{
1064		this->lock = lock;
1065
1066		switch(lock)
1067		{
1068		case LOCK_UNLOCKED:
1069		case LOCK_READONLY:
1070			break;
1071		case LOCK_WRITEONLY:
1072		case LOCK_READWRITE:
1073		case LOCK_DISCARD:
1074			dirty = true;
1075			break;
1076		default:
1077			ASSERT(false);
1078		}
1079
1080		if(buffer)
1081		{
1082			switch(format)
1083			{
1084			#if S3TC_SUPPORT
1085			case FORMAT_DXT1:
1086			#endif
1087			case FORMAT_ATI1:
1088			case FORMAT_ETC1:
1089			case FORMAT_R11_EAC:
1090			case FORMAT_SIGNED_R11_EAC:
1091			case FORMAT_RGB8_ETC2:
1092			case FORMAT_SRGB8_ETC2:
1093			case FORMAT_RGB8_PUNCHTHROUGH_ALPHA1_ETC2:
1094			case FORMAT_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2:
1095				return (unsigned char*)buffer + 8 * (x / 4) + (y / 4) * pitchB + z * sliceB;
1096			case FORMAT_RG11_EAC:
1097			case FORMAT_SIGNED_RG11_EAC:
1098			case FORMAT_RGBA8_ETC2_EAC:
1099			case FORMAT_SRGB8_ALPHA8_ETC2_EAC:
1100			case FORMAT_RGBA_ASTC_4x4_KHR:
1101			case FORMAT_SRGB8_ALPHA8_ASTC_4x4_KHR:
1102				return (unsigned char*)buffer + 16 * (x / 4) + (y / 4) * pitchB + z * sliceB;
1103			case FORMAT_RGBA_ASTC_5x4_KHR:
1104			case FORMAT_SRGB8_ALPHA8_ASTC_5x4_KHR:
1105				return (unsigned char*)buffer + 16 * (x / 5) + (y / 4) * pitchB + z * sliceB;
1106			case FORMAT_RGBA_ASTC_5x5_KHR:
1107			case FORMAT_SRGB8_ALPHA8_ASTC_5x5_KHR:
1108				return (unsigned char*)buffer + 16 * (x / 5) + (y / 5) * pitchB + z * sliceB;
1109			case FORMAT_RGBA_ASTC_6x5_KHR:
1110			case FORMAT_SRGB8_ALPHA8_ASTC_6x5_KHR:
1111				return (unsigned char*)buffer + 16 * (x / 6) + (y / 5) * pitchB + z * sliceB;
1112			case FORMAT_RGBA_ASTC_6x6_KHR:
1113			case FORMAT_SRGB8_ALPHA8_ASTC_6x6_KHR:
1114				return (unsigned char*)buffer + 16 * (x / 6) + (y / 6) * pitchB + z * sliceB;
1115			case FORMAT_RGBA_ASTC_8x5_KHR:
1116			case FORMAT_SRGB8_ALPHA8_ASTC_8x5_KHR:
1117				return (unsigned char*)buffer + 16 * (x / 8) + (y / 5) * pitchB + z * sliceB;
1118			case FORMAT_RGBA_ASTC_8x6_KHR:
1119			case FORMAT_SRGB8_ALPHA8_ASTC_8x6_KHR:
1120				return (unsigned char*)buffer + 16 * (x / 8) + (y / 6) * pitchB + z * sliceB;
1121			case FORMAT_RGBA_ASTC_8x8_KHR:
1122			case FORMAT_SRGB8_ALPHA8_ASTC_8x8_KHR:
1123				return (unsigned char*)buffer + 16 * (x / 8) + (y / 8) * pitchB + z * sliceB;
1124			case FORMAT_RGBA_ASTC_10x5_KHR:
1125			case FORMAT_SRGB8_ALPHA8_ASTC_10x5_KHR:
1126				return (unsigned char*)buffer + 16 * (x / 10) + (y / 5) * pitchB + z * sliceB;
1127			case FORMAT_RGBA_ASTC_10x6_KHR:
1128			case FORMAT_SRGB8_ALPHA8_ASTC_10x6_KHR:
1129				return (unsigned char*)buffer + 16 * (x / 10) + (y / 6) * pitchB + z * sliceB;
1130			case FORMAT_RGBA_ASTC_10x8_KHR:
1131			case FORMAT_SRGB8_ALPHA8_ASTC_10x8_KHR:
1132				return (unsigned char*)buffer + 16 * (x / 10) + (y / 8) * pitchB + z * sliceB;
1133			case FORMAT_RGBA_ASTC_10x10_KHR:
1134			case FORMAT_SRGB8_ALPHA8_ASTC_10x10_KHR:
1135				return (unsigned char*)buffer + 16 * (x / 10) + (y / 10) * pitchB + z * sliceB;
1136			case FORMAT_RGBA_ASTC_12x10_KHR:
1137			case FORMAT_SRGB8_ALPHA8_ASTC_12x10_KHR:
1138				return (unsigned char*)buffer + 16 * (x / 12) + (y / 10) * pitchB + z * sliceB;
1139			case FORMAT_RGBA_ASTC_12x12_KHR:
1140			case FORMAT_SRGB8_ALPHA8_ASTC_12x12_KHR:
1141				return (unsigned char*)buffer + 16 * (x / 12) + (y / 12) * pitchB + z * sliceB;
1142			#if S3TC_SUPPORT
1143			case FORMAT_DXT3:
1144			case FORMAT_DXT5:
1145			#endif
1146			case FORMAT_ATI2:
1147				return (unsigned char*)buffer + 16 * (x / 4) + (y / 4) * pitchB + z * sliceB;
1148			default:
1149				return (unsigned char*)buffer + x * bytes + y * pitchB + z * sliceB;
1150			}
1151		}
1152
1153		return 0;
1154	}
1155
1156	void Surface::Buffer::unlockRect()
1157	{
1158		lock = LOCK_UNLOCKED;
1159	}
1160
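	// Constructs a surface around a client-provided pixel buffer: 'pixels',
	// 'pitch' and 'slice' describe the external buffer, which is not owned by
	// the surface; the internal and stencil buffers are allocated lazily.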
1161	Surface::Surface(int width, int height, int depth, Format format, void *pixels, int pitch, int slice) : lockable(true), renderTarget(false)
1162	{
1163		resource = new Resource(0);
1164		hasParent = false;
1165		ownExternal = false;
1166		depth = max(1, depth);
1167
1168		external.buffer = pixels;
1169		external.width = width;
1170		external.height = height;
1171		external.depth = depth;
1172		external.format = format;
1173		external.bytes = bytes(external.format);
1174		external.pitchB = pitch;
1175		external.pitchP = external.bytes ? pitch / external.bytes : 0;
1176		external.sliceB = slice;
1177		external.sliceP = external.bytes ? slice / external.bytes : 0;
1178		external.lock = LOCK_UNLOCKED;
1179		external.dirty = true;
1180
1181		internal.buffer = 0;
1182		internal.width = width;
1183		internal.height = height;
1184		internal.depth = depth;
1185		internal.format = selectInternalFormat(format);
1186		internal.bytes = bytes(internal.format);
1187		internal.pitchB = pitchB(internal.width, internal.format, false);
1188		internal.pitchP = pitchP(internal.width, internal.format, false);
1189		internal.sliceB = sliceB(internal.width, internal.height, internal.format, false);
1190		internal.sliceP = sliceP(internal.width, internal.height, internal.format, false);
1191		internal.lock = LOCK_UNLOCKED;
1192		internal.dirty = false;
1193
1194		stencil.buffer = 0;
1195		stencil.width = width;
1196		stencil.height = height;
1197		stencil.depth = depth;
1198		stencil.format = FORMAT_S8;
1199		stencil.bytes = bytes(stencil.format);
1200		stencil.pitchB = pitchB(stencil.width, stencil.format, false);
1201		stencil.pitchP = pitchP(stencil.width, stencil.format, false);
1202		stencil.sliceB = sliceB(stencil.width, stencil.height, stencil.format, false);
1203		stencil.sliceP = sliceP(stencil.width, stencil.height, stencil.format, false);
1204		stencil.lock = LOCK_UNLOCKED;
1205		stencil.dirty = false;
1206
1207		dirtyMipmaps = true;
1208		paletteUsed = 0;
1209	}
1210
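	// Constructs a surface that owns its storage. All buffers are allocated
	// lazily on first lock; a non-zero 'pitchPprovided' (in pixels) overrides
	// the computed internal pitch.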
1211	Surface::Surface(Resource *texture, int width, int height, int depth, Format format, bool lockable, bool renderTarget, int pitchPprovided) : lockable(lockable), renderTarget(renderTarget)
1212	{
1213		resource = texture ? texture : new Resource(0);
1214		hasParent = texture != 0;
1215		ownExternal = true;
1216		depth = max(1, depth);
1217
1218		external.buffer = 0;
1219		external.width = width;
1220		external.height = height;
1221		external.depth = depth;
1222		external.format = format;
1223		external.bytes = bytes(external.format);
1224		external.pitchB = pitchB(external.width, external.format, renderTarget && !texture);
1225		external.pitchP = pitchP(external.width, external.format, renderTarget && !texture);
1226		external.sliceB = sliceB(external.width, external.height, external.format, renderTarget && !texture);
1227		external.sliceP = sliceP(external.width, external.height, external.format, renderTarget && !texture);
1228		external.lock = LOCK_UNLOCKED;
1229		external.dirty = false;
1230
1231		internal.buffer = 0;
1232		internal.width = width;
1233		internal.height = height;
1234		internal.depth = depth;
1235		internal.format = selectInternalFormat(format);
1236		internal.bytes = bytes(internal.format);
1237		internal.pitchB = !pitchPprovided ? pitchB(internal.width, internal.format, renderTarget) : pitchPprovided * internal.bytes;
1238		internal.pitchP = !pitchPprovided ? pitchP(internal.width, internal.format, renderTarget) : pitchPprovided;
1239		internal.sliceB = sliceB(internal.width, internal.height, internal.format, renderTarget);
1240		internal.sliceP = sliceP(internal.width, internal.height, internal.format, renderTarget);
1241		internal.lock = LOCK_UNLOCKED;
1242		internal.dirty = false;
1243
1244		stencil.buffer = 0;
1245		stencil.width = width;
1246		stencil.height = height;
1247		stencil.depth = depth;
1248		stencil.format = FORMAT_S8;
1249		stencil.bytes = bytes(stencil.format);
1250		stencil.pitchB = pitchB(stencil.width, stencil.format, renderTarget);
1251		stencil.pitchP = pitchP(stencil.width, stencil.format, renderTarget);
1252		stencil.sliceB = sliceB(stencil.width, stencil.height, stencil.format, renderTarget);
1253		stencil.sliceP = sliceP(stencil.width, stencil.height, stencil.format, renderTarget);
1254		stencil.lock = LOCK_UNLOCKED;
1255		stencil.dirty = false;
1256
1257		dirtyMipmaps = true;
1258		paletteUsed = 0;
1259	}
1260
1261	Surface::~Surface()
1262	{
1263		// Synchronize so we can deallocate the buffers below
1264		resource->lock(DESTRUCT);
1265		resource->unlock();
1266
1267		if(!hasParent)
1268		{
1269			resource->destruct();
1270		}
1271
1272		if(ownExternal)
1273		{
1274			deallocate(external.buffer);
1275		}
1276
1277		if(internal.buffer != external.buffer)
1278		{
1279			deallocate(internal.buffer);
1280		}
1281
1282		deallocate(stencil.buffer);
1283
1284		external.buffer = 0;
1285		internal.buffer = 0;
1286		stencil.buffer = 0;
1287	}
1288
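	// Locks the external (client-format) buffer. The buffer is allocated on
	// first use (shared with the internal buffer when the formats match) and
	// refreshed from a dirty internal buffer unless the lock discards the
	// contents. Writable locks invalidate the mipmap chain.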
1289	void *Surface::lockExternal(int x, int y, int z, Lock lock, Accessor client)
1290	{
1291		resource->lock(client);
1292
1293		if(!external.buffer)
1294		{
1295			if(internal.buffer && identicalFormats())
1296			{
1297				external.buffer = internal.buffer;
1298			}
1299			else
1300			{
1301				external.buffer = allocateBuffer(external.width, external.height, external.depth, external.format);
1302			}
1303		}
1304
1305		if(internal.dirty)
1306		{
1307			if(lock != LOCK_DISCARD)
1308			{
1309				update(external, internal);
1310			}
1311
1312			internal.dirty = false;
1313		}
1314
1315		switch(lock)
1316		{
1317		case LOCK_READONLY:
1318			break;
1319		case LOCK_WRITEONLY:
1320		case LOCK_READWRITE:
1321		case LOCK_DISCARD:
1322			dirtyMipmaps = true;
1323			break;
1324		default:
1325			ASSERT(false);
1326		}
1327
1328		return external.lockRect(x, y, z, lock);
1329	}
1330
1331	void Surface::unlockExternal()
1332	{
1333		resource->unlock();
1334
1335		external.unlockRect();
1336	}
1337
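	// Locks the internal (render/sample format) buffer, allocating it on first
	// use and converting from the external buffer when the latter is dirty or
	// the palette has changed. A read-only lock by a public client first calls
	// resolve().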
1338	void *Surface::lockInternal(int x, int y, int z, Lock lock, Accessor client)
1339	{
1340		if(lock != LOCK_UNLOCKED)
1341		{
1342			resource->lock(client);
1343		}
1344
1345		if(!internal.buffer)
1346		{
1347			if(external.buffer && identicalFormats())
1348			{
1349				internal.buffer = external.buffer;
1350			}
1351			else
1352			{
1353				internal.buffer = allocateBuffer(internal.width, internal.height, internal.depth, internal.format);
1354			}
1355		}
1356
1357		// FIXME: WHQL requires conversion to lower external precision and back
1358		if(logPrecision >= WHQL)
1359		{
1360			if(internal.dirty && renderTarget && internal.format != external.format)
1361			{
1362				if(lock != LOCK_DISCARD)
1363				{
1364					switch(external.format)
1365					{
1366					case FORMAT_R3G3B2:
1367					case FORMAT_A8R3G3B2:
1368					case FORMAT_A1R5G5B5:
1369					case FORMAT_A2R10G10B10:
1370					case FORMAT_A2B10G10R10:
1371						lockExternal(0, 0, 0, LOCK_READWRITE, client);
1372						unlockExternal();
1373						break;
1374					default:
1375						// Difference passes WHQL
1376						break;
1377					}
1378				}
1379			}
1380		}
1381
1382		if(external.dirty || (isPalette(external.format) && paletteUsed != Surface::paletteID))
1383		{
1384			if(lock != LOCK_DISCARD)
1385			{
1386				update(internal, external);
1387			}
1388
1389			external.dirty = false;
1390			paletteUsed = Surface::paletteID;
1391		}
1392
1393		switch(lock)
1394		{
1395		case LOCK_UNLOCKED:
1396		case LOCK_READONLY:
1397			break;
1398		case LOCK_WRITEONLY:
1399		case LOCK_READWRITE:
1400		case LOCK_DISCARD:
1401			dirtyMipmaps = true;
1402			break;
1403		default:
1404			ASSERT(false);
1405		}
1406
1407		if(lock == LOCK_READONLY && client == PUBLIC)
1408		{
1409			resolve();
1410		}
1411
1412		return internal.lockRect(x, y, z, lock);
1413	}
1414
1415	void Surface::unlockInternal()
1416	{
1417		resource->unlock();
1418
1419		internal.unlockRect();
1420	}
1421
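	// Locks the separate S8 stencil plane, allocating it on first use.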
1422	void *Surface::lockStencil(int front, Accessor client)
1423	{
1424		resource->lock(client);
1425
1426		if(!stencil.buffer)
1427		{
1428			stencil.buffer = allocateBuffer(stencil.width, stencil.height, stencil.depth, stencil.format);
1429		}
1430
1431		return stencil.lockRect(0, 0, front, LOCK_READWRITE);   // FIXME
1432	}
1433
1434	void Surface::unlockStencil()
1435	{
1436		resource->unlock();
1437
1438		stencil.unlockRect();
1439	}
1440
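	// Returns the storage size of one element in bytes. For block-compressed
	// formats this is the size of one column of pixels within a block (see the
	// per-case comments); ASTC formats return 0 (see the FIXME).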
1441	int Surface::bytes(Format format)
1442	{
1443		switch(format)
1444		{
1445		case FORMAT_NULL:				return 0;
1446		case FORMAT_P8:					return 1;
1447		case FORMAT_A8P8:				return 2;
1448		case FORMAT_A8:					return 1;
1449		case FORMAT_R8I:				return 1;
1450		case FORMAT_R8:					return 1;
1451		case FORMAT_R3G3B2:				return 1;
1452		case FORMAT_R16I:				return 2;
1453		case FORMAT_R16UI:				return 2;
1454		case FORMAT_A8R3G3B2:			return 2;
1455		case FORMAT_R5G6B5:				return 2;
1456		case FORMAT_A1R5G5B5:			return 2;
1457		case FORMAT_X1R5G5B5:			return 2;
1458		case FORMAT_R5G5B5A1:           return 2;
1459		case FORMAT_X4R4G4B4:			return 2;
1460		case FORMAT_A4R4G4B4:			return 2;
1461		case FORMAT_R4G4B4A4:           return 2;
1462		case FORMAT_R8G8B8:				return 3;
1463		case FORMAT_B8G8R8:             return 3;
1464		case FORMAT_R32I:				return 4;
1465		case FORMAT_R32UI:				return 4;
1466		case FORMAT_X8R8G8B8:			return 4;
1467	//	case FORMAT_X8G8R8B8Q:			return 4;
1468		case FORMAT_A8R8G8B8:			return 4;
1469	//	case FORMAT_A8G8R8B8Q:			return 4;
1470		case FORMAT_X8B8G8R8I:			return 4;
1471		case FORMAT_X8B8G8R8:			return 4;
1472		case FORMAT_A8B8G8R8I:			return 4;
1473		case FORMAT_R8UI:				return 1;
1474		case FORMAT_G8R8UI:				return 2;
1475		case FORMAT_X8B8G8R8UI:			return 4;
1476		case FORMAT_A8B8G8R8UI:			return 4;
1477		case FORMAT_A8B8G8R8:			return 4;
1478		case FORMAT_R8I_SNORM:			return 1;
1479		case FORMAT_G8R8I_SNORM:		return 2;
1480		case FORMAT_X8B8G8R8I_SNORM:	return 4;
1481		case FORMAT_A8B8G8R8I_SNORM:	return 4;
1482		case FORMAT_A2R10G10B10:		return 4;
1483		case FORMAT_A2B10G10R10:		return 4;
1484		case FORMAT_G8R8I:				return 2;
1485		case FORMAT_G8R8:				return 2;
1486		case FORMAT_G16R16I:			return 4;
1487		case FORMAT_G16R16UI:			return 4;
1488		case FORMAT_G16R16:				return 4;
1489		case FORMAT_G32R32I:			return 8;
1490		case FORMAT_G32R32UI:			return 8;
1491		case FORMAT_X16B16G16R16I:		return 8;
1492		case FORMAT_X16B16G16R16UI:		return 8;
1493		case FORMAT_A16B16G16R16I:		return 8;
1494		case FORMAT_A16B16G16R16UI:		return 8;
1495		case FORMAT_A16B16G16R16:		return 8;
1496		case FORMAT_X32B32G32R32I:		return 16;
1497		case FORMAT_X32B32G32R32UI:		return 16;
1498		case FORMAT_A32B32G32R32I:		return 16;
1499		case FORMAT_A32B32G32R32UI:		return 16;
1500		// Compressed formats
1501		#if S3TC_SUPPORT
1502		case FORMAT_DXT1:				return 2;   // Column of four pixels
1503		case FORMAT_DXT3:				return 4;   // Column of four pixels
1504		case FORMAT_DXT5:				return 4;   // Column of four pixels
1505		#endif
1506		case FORMAT_ATI1:				return 2;   // Column of four pixels
1507		case FORMAT_ATI2:				return 4;   // Column of four pixels
1508		case FORMAT_ETC1:				return 2;   // Column of four pixels
1509		case FORMAT_R11_EAC:			return 2;
1510		case FORMAT_SIGNED_R11_EAC:		return 2;
1511		case FORMAT_RG11_EAC:			return 4;
1512		case FORMAT_SIGNED_RG11_EAC:	return 4;
1513		case FORMAT_RGB8_ETC2:			return 2;
1514		case FORMAT_SRGB8_ETC2:			return 2;
1515		case FORMAT_RGB8_PUNCHTHROUGH_ALPHA1_ETC2:	return 2;
1516		case FORMAT_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2:	return 2;
1517		case FORMAT_RGBA8_ETC2_EAC:			return 4;
1518		case FORMAT_SRGB8_ALPHA8_ETC2_EAC:	return 4;
1519		case FORMAT_RGBA_ASTC_4x4_KHR:
1520		case FORMAT_RGBA_ASTC_5x4_KHR:
1521		case FORMAT_RGBA_ASTC_5x5_KHR:
1522		case FORMAT_RGBA_ASTC_6x5_KHR:
1523		case FORMAT_RGBA_ASTC_6x6_KHR:
1524		case FORMAT_RGBA_ASTC_8x5_KHR:
1525		case FORMAT_RGBA_ASTC_8x6_KHR:
1526		case FORMAT_RGBA_ASTC_8x8_KHR:
1527		case FORMAT_RGBA_ASTC_10x5_KHR:
1528		case FORMAT_RGBA_ASTC_10x6_KHR:
1529		case FORMAT_RGBA_ASTC_10x8_KHR:
1530		case FORMAT_RGBA_ASTC_10x10_KHR:
1531		case FORMAT_RGBA_ASTC_12x10_KHR:
1532		case FORMAT_RGBA_ASTC_12x12_KHR:
1533		case FORMAT_SRGB8_ALPHA8_ASTC_4x4_KHR:
1534		case FORMAT_SRGB8_ALPHA8_ASTC_5x4_KHR:
1535		case FORMAT_SRGB8_ALPHA8_ASTC_5x5_KHR:
1536		case FORMAT_SRGB8_ALPHA8_ASTC_6x5_KHR:
1537		case FORMAT_SRGB8_ALPHA8_ASTC_6x6_KHR:
1538		case FORMAT_SRGB8_ALPHA8_ASTC_8x5_KHR:
1539		case FORMAT_SRGB8_ALPHA8_ASTC_8x6_KHR:
1540		case FORMAT_SRGB8_ALPHA8_ASTC_8x8_KHR:
1541		case FORMAT_SRGB8_ALPHA8_ASTC_10x5_KHR:
1542		case FORMAT_SRGB8_ALPHA8_ASTC_10x6_KHR:
1543		case FORMAT_SRGB8_ALPHA8_ASTC_10x8_KHR:
1544		case FORMAT_SRGB8_ALPHA8_ASTC_10x10_KHR:
1545		case FORMAT_SRGB8_ALPHA8_ASTC_12x10_KHR:
1546		case FORMAT_SRGB8_ALPHA8_ASTC_12x12_KHR: return 0; // FIXME
1547		// Bumpmap formats
1548		case FORMAT_V8U8:				return 2;
1549		case FORMAT_L6V5U5:				return 2;
1550		case FORMAT_Q8W8V8U8:			return 4;
1551		case FORMAT_X8L8V8U8:			return 4;
1552		case FORMAT_A2W10V10U10:		return 4;
1553		case FORMAT_V16U16:				return 4;
1554		case FORMAT_A16W16V16U16:		return 8;
1555		case FORMAT_Q16W16V16U16:		return 8;
1556		// Luminance formats
1557		case FORMAT_L8:					return 1;
1558		case FORMAT_A4L4:				return 1;
1559		case FORMAT_L16:				return 2;
1560		case FORMAT_A8L8:				return 2;
1561		case FORMAT_L16F:               return 2;
1562		case FORMAT_A16L16F:            return 4;
1563		case FORMAT_L32F:               return 4;
1564		case FORMAT_A32L32F:            return 8;
1565		// Floating-point formats
1566		case FORMAT_A16F:				return 2;
1567		case FORMAT_R16F:				return 2;
1568		case FORMAT_G16R16F:			return 4;
1569		case FORMAT_B16G16R16F:			return 6;
1570		case FORMAT_A16B16G16R16F:		return 8;
1571		case FORMAT_A32F:				return 4;
1572		case FORMAT_R32F:				return 4;
1573		case FORMAT_G32R32F:			return 8;
1574		case FORMAT_B32G32R32F:			return 12;
1575		case FORMAT_X32B32G32R32F:		return 16;
1576		case FORMAT_A32B32G32R32F:		return 16;
1577		// Depth/stencil formats
1578		case FORMAT_D16:				return 2;
1579		case FORMAT_D32:				return 4;
1580		case FORMAT_D24X8:				return 4;
1581		case FORMAT_D24S8:				return 4;
1582		case FORMAT_D24FS8:				return 4;
1583		case FORMAT_D32F:				return 4;
1584		case FORMAT_D32F_COMPLEMENTARY:	return 4;
1585		case FORMAT_D32F_LOCKABLE:		return 4;
1586		case FORMAT_D32FS8_TEXTURE:		return 4;
1587		case FORMAT_D32FS8_SHADOW:		return 4;
1588		case FORMAT_DF24S8:				return 4;
1589		case FORMAT_DF16S8:				return 2;
1590		case FORMAT_INTZ:				return 4;
1591		case FORMAT_S8:					return 1;
1592		case FORMAT_YV12_BT601:         return 1;   // Y plane only
1593		case FORMAT_YV12_BT709:         return 1;   // Y plane only
1594		case FORMAT_YV12_JFIF:          return 1;   // Y plane only
1595		default:
1596			ASSERT(false);
1597		}
1598
1599		return 0;
1600	}
1601
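	// Returns the byte pitch of one row. Render targets and depth/stencil
	// surfaces round the width up to a multiple of two; for block-compressed
	// formats the pitch covers one row of blocks, except ATI1/ATI2 where it is
	// computed per pixel row.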
1602	int Surface::pitchB(int width, Format format, bool target)
1603	{
1604		if(target || isDepth(format) || isStencil(format))
1605		{
1606			width = align(width, 2);
1607		}
1608
1609		switch(format)
1610		{
1611		#if S3TC_SUPPORT
1612		case FORMAT_DXT1:
1613		#endif
1614		case FORMAT_ETC1:
1615		case FORMAT_R11_EAC:
1616		case FORMAT_SIGNED_R11_EAC:
1617		case FORMAT_RGB8_ETC2:
1618		case FORMAT_SRGB8_ETC2:
1619		case FORMAT_RGB8_PUNCHTHROUGH_ALPHA1_ETC2:
1620		case FORMAT_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2:
1621			return 8 * ((width + 3) / 4);    // 64 bit per 4x4 block, computed per 4 rows
1622		case FORMAT_RG11_EAC:
1623		case FORMAT_SIGNED_RG11_EAC:
1624		case FORMAT_RGBA8_ETC2_EAC:
1625		case FORMAT_SRGB8_ALPHA8_ETC2_EAC:
1626		case FORMAT_RGBA_ASTC_4x4_KHR:
1627		case FORMAT_SRGB8_ALPHA8_ASTC_4x4_KHR:
1628			return 16 * ((width + 3) / 4);    // 128 bit per 4x4 block, computed per 4 rows
1629		case FORMAT_RGBA_ASTC_5x4_KHR:
1630		case FORMAT_SRGB8_ALPHA8_ASTC_5x4_KHR:
1631		case FORMAT_RGBA_ASTC_5x5_KHR:
1632		case FORMAT_SRGB8_ALPHA8_ASTC_5x5_KHR:
1633			return 16 * ((width + 4) / 5);
1634		case FORMAT_RGBA_ASTC_6x5_KHR:
1635		case FORMAT_SRGB8_ALPHA8_ASTC_6x5_KHR:
1636		case FORMAT_RGBA_ASTC_6x6_KHR:
1637		case FORMAT_SRGB8_ALPHA8_ASTC_6x6_KHR:
1638			return 16 * ((width + 5) / 6);
1639		case FORMAT_RGBA_ASTC_8x5_KHR:
1640		case FORMAT_SRGB8_ALPHA8_ASTC_8x5_KHR:
1641		case FORMAT_RGBA_ASTC_8x6_KHR:
1642		case FORMAT_SRGB8_ALPHA8_ASTC_8x6_KHR:
1643		case FORMAT_RGBA_ASTC_8x8_KHR:
1644		case FORMAT_SRGB8_ALPHA8_ASTC_8x8_KHR:
1645			return 16 * ((width + 7) / 8);
1646		case FORMAT_RGBA_ASTC_10x5_KHR:
1647		case FORMAT_SRGB8_ALPHA8_ASTC_10x5_KHR:
1648		case FORMAT_RGBA_ASTC_10x6_KHR:
1649		case FORMAT_SRGB8_ALPHA8_ASTC_10x6_KHR:
1650		case FORMAT_RGBA_ASTC_10x8_KHR:
1651		case FORMAT_SRGB8_ALPHA8_ASTC_10x8_KHR:
1652		case FORMAT_RGBA_ASTC_10x10_KHR:
1653		case FORMAT_SRGB8_ALPHA8_ASTC_10x10_KHR:
1654			return 16 * ((width + 9) / 10);
1655		case FORMAT_RGBA_ASTC_12x10_KHR:
1656		case FORMAT_SRGB8_ALPHA8_ASTC_12x10_KHR:
1657		case FORMAT_RGBA_ASTC_12x12_KHR:
1658		case FORMAT_SRGB8_ALPHA8_ASTC_12x12_KHR:
1659			return 16 * ((width + 11) / 12);
1660		#if S3TC_SUPPORT
1661		case FORMAT_DXT3:
1662		case FORMAT_DXT5:
1663			return 16 * ((width + 3) / 4);   // 128 bit per 4x4 block, computed per 4 rows
1664		#endif
1665		case FORMAT_ATI1:
1666			return 2 * ((width + 3) / 4);    // 64 bit per 4x4 block, computed per row
1667		case FORMAT_ATI2:
1668			return 4 * ((width + 3) / 4);    // 128 bit per 4x4 block, computed per row
1669		case FORMAT_YV12_BT601:
1670		case FORMAT_YV12_BT709:
1671		case FORMAT_YV12_JFIF:
1672			return align(width, 16);
1673		default:
1674			return bytes(format) * width;
1675		}
1676	}
1677
1678	int Surface::pitchP(int width, Format format, bool target)
1679	{
1680		int B = bytes(format);
1681
1682		return B > 0 ? pitchB(width, format, target) / B : 0;
1683	}
1684
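	// Returns the byte size of one slice: the row pitch multiplied by the
	// number of block rows (or pixel rows for uncompressed formats), with the
	// height rounded up to a multiple of two for render targets and
	// depth/stencil surfaces.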
1685	int Surface::sliceB(int width, int height, Format format, bool target)
1686	{
1687		if(target || isDepth(format) || isStencil(format))
1688		{
1689			height = ((height + 1) & ~1);
1690		}
1691
1692		switch(format)
1693		{
1694		#if S3TC_SUPPORT
1695		case FORMAT_DXT1:
1696		case FORMAT_DXT3:
1697		case FORMAT_DXT5:
1698		#endif
1699		case FORMAT_ETC1:
1700		case FORMAT_R11_EAC:
1701		case FORMAT_SIGNED_R11_EAC:
1702		case FORMAT_RG11_EAC:
1703		case FORMAT_SIGNED_RG11_EAC:
1704		case FORMAT_RGB8_ETC2:
1705		case FORMAT_SRGB8_ETC2:
1706		case FORMAT_RGB8_PUNCHTHROUGH_ALPHA1_ETC2:
1707		case FORMAT_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2:
1708		case FORMAT_RGBA8_ETC2_EAC:
1709		case FORMAT_SRGB8_ALPHA8_ETC2_EAC:
1710		case FORMAT_RGBA_ASTC_4x4_KHR:
1711		case FORMAT_SRGB8_ALPHA8_ASTC_4x4_KHR:
1712		case FORMAT_RGBA_ASTC_5x4_KHR:
1713		case FORMAT_SRGB8_ALPHA8_ASTC_5x4_KHR:
1714			return pitchB(width, format, target) * ((height + 3) / 4);   // Pitch computed per 4 rows
1715		case FORMAT_RGBA_ASTC_5x5_KHR:
1716		case FORMAT_SRGB8_ALPHA8_ASTC_5x5_KHR:
1717		case FORMAT_RGBA_ASTC_6x5_KHR:
1718		case FORMAT_SRGB8_ALPHA8_ASTC_6x5_KHR:
1719		case FORMAT_RGBA_ASTC_8x5_KHR:
1720		case FORMAT_SRGB8_ALPHA8_ASTC_8x5_KHR:
1721		case FORMAT_RGBA_ASTC_10x5_KHR:
1722		case FORMAT_SRGB8_ALPHA8_ASTC_10x5_KHR:
1723			return pitchB(width, format, target) * ((height + 4) / 5);   // Pitch computed per 5 rows
1724		case FORMAT_RGBA_ASTC_6x6_KHR:
1725		case FORMAT_SRGB8_ALPHA8_ASTC_6x6_KHR:
1726		case FORMAT_RGBA_ASTC_8x6_KHR:
1727		case FORMAT_SRGB8_ALPHA8_ASTC_8x6_KHR:
1728		case FORMAT_RGBA_ASTC_10x6_KHR:
1729		case FORMAT_SRGB8_ALPHA8_ASTC_10x6_KHR:
1730			return pitchB(width, format, target) * ((height + 5) / 6);   // Pitch computed per 6 rows
1731		case FORMAT_RGBA_ASTC_8x8_KHR:
1732		case FORMAT_SRGB8_ALPHA8_ASTC_8x8_KHR:
1733		case FORMAT_RGBA_ASTC_10x8_KHR:
1734		case FORMAT_SRGB8_ALPHA8_ASTC_10x8_KHR:
1735			return pitchB(width, format, target) * ((height + 7) / 8);   // Pitch computed per 8 rows
1736		case FORMAT_RGBA_ASTC_10x10_KHR:
1737		case FORMAT_SRGB8_ALPHA8_ASTC_10x10_KHR:
1738		case FORMAT_RGBA_ASTC_12x10_KHR:
1739		case FORMAT_SRGB8_ALPHA8_ASTC_12x10_KHR:
1740			return pitchB(width, format, target) * ((height + 9) / 10);   // Pitch computed per 10 rows
1741		case FORMAT_RGBA_ASTC_12x12_KHR:
1742		case FORMAT_SRGB8_ALPHA8_ASTC_12x12_KHR:
1743			return pitchB(width, format, target) * ((height + 11) / 12);   // Pitch computed per 12 rows
1744		case FORMAT_ATI1:
1745		case FORMAT_ATI2:
1746		default:
1747			return pitchB(width, format, target) * height;   // Pitch computed per row
1748		}
1749	}
1750
1751	int Surface::sliceP(int width, int height, Format format, bool target)
1752	{
1753		int B = bytes(format);
1754
1755		return B > 0 ? sliceB(width, height, format, target) / B : 0;
1756	}
1757
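	// Transfers texel data from the source buffer into the destination buffer, decoding
	// formats that have no direct internal representation. Formats without a dedicated
	// decoder fall back to genericUpdate, which converts texel by texel.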
1758	void Surface::update(Buffer &destination, Buffer &source)
1759	{
1760	//	ASSERT(source.lock != LOCK_UNLOCKED);
1761	//	ASSERT(destination.lock != LOCK_UNLOCKED);
1762
1763		if(destination.buffer != source.buffer)
1764		{
1765			ASSERT(source.dirty && !destination.dirty);
1766
1767			switch(source.format)
1768			{
1769			case FORMAT_R8G8B8:		decodeR8G8B8(destination, source);		break;   // FIXME: Check destination format
1770			case FORMAT_X1R5G5B5:	decodeX1R5G5B5(destination, source);	break;   // FIXME: Check destination format
1771			case FORMAT_A1R5G5B5:	decodeA1R5G5B5(destination, source);	break;   // FIXME: Check destination format
1772			case FORMAT_X4R4G4B4:	decodeX4R4G4B4(destination, source);	break;   // FIXME: Check destination format
1773			case FORMAT_A4R4G4B4:	decodeA4R4G4B4(destination, source);	break;   // FIXME: Check destination format
1774			case FORMAT_P8:			decodeP8(destination, source);			break;   // FIXME: Check destination format
1775			#if S3TC_SUPPORT
1776			case FORMAT_DXT1:		decodeDXT1(destination, source);		break;   // FIXME: Check destination format
1777			case FORMAT_DXT3:		decodeDXT3(destination, source);		break;   // FIXME: Check destination format
1778			case FORMAT_DXT5:		decodeDXT5(destination, source);		break;   // FIXME: Check destination format
1779			#endif
1780			case FORMAT_ATI1:		decodeATI1(destination, source);		break;   // FIXME: Check destination format
1781			case FORMAT_ATI2:		decodeATI2(destination, source);		break;   // FIXME: Check destination format
1782			case FORMAT_R11_EAC:         decodeEAC(destination, source, 1, false); break; // FIXME: Check destination format
1783			case FORMAT_SIGNED_R11_EAC:  decodeEAC(destination, source, 1, true);  break; // FIXME: Check destination format
1784			case FORMAT_RG11_EAC:        decodeEAC(destination, source, 2, false); break; // FIXME: Check destination format
1785			case FORMAT_SIGNED_RG11_EAC: decodeEAC(destination, source, 2, true);  break; // FIXME: Check destination format
1786			case FORMAT_ETC1:
1787			case FORMAT_RGB8_ETC2:                      decodeETC2(destination, source, 0, false); break; // FIXME: Check destination format
1788			case FORMAT_SRGB8_ETC2:                     decodeETC2(destination, source, 0, true);  break; // FIXME: Check destination format
1789			case FORMAT_RGB8_PUNCHTHROUGH_ALPHA1_ETC2:  decodeETC2(destination, source, 1, false); break; // FIXME: Check destination format
1790			case FORMAT_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2: decodeETC2(destination, source, 1, true);  break; // FIXME: Check destination format
1791			case FORMAT_RGBA8_ETC2_EAC:                 decodeETC2(destination, source, 8, false); break; // FIXME: Check destination format
1792			case FORMAT_SRGB8_ALPHA8_ETC2_EAC:          decodeETC2(destination, source, 8, true);  break; // FIXME: Check destination format
1793			case FORMAT_RGBA_ASTC_4x4_KHR:           decodeASTC(destination, source, 4,  4,  1, false); break; // FIXME: Check destination format
1794			case FORMAT_RGBA_ASTC_5x4_KHR:           decodeASTC(destination, source, 5,  4,  1, false); break; // FIXME: Check destination format
1795			case FORMAT_RGBA_ASTC_5x5_KHR:           decodeASTC(destination, source, 5,  5,  1, false); break; // FIXME: Check destination format
1796			case FORMAT_RGBA_ASTC_6x5_KHR:           decodeASTC(destination, source, 6,  5,  1, false); break; // FIXME: Check destination format
1797			case FORMAT_RGBA_ASTC_6x6_KHR:           decodeASTC(destination, source, 6,  6,  1, false); break; // FIXME: Check destination format
1798			case FORMAT_RGBA_ASTC_8x5_KHR:           decodeASTC(destination, source, 8,  5,  1, false); break; // FIXME: Check destination format
1799			case FORMAT_RGBA_ASTC_8x6_KHR:           decodeASTC(destination, source, 8,  6,  1, false); break; // FIXME: Check destination format
1800			case FORMAT_RGBA_ASTC_8x8_KHR:           decodeASTC(destination, source, 8,  8,  1, false); break; // FIXME: Check destination format
1801			case FORMAT_RGBA_ASTC_10x5_KHR:          decodeASTC(destination, source, 10, 5,  1, false); break; // FIXME: Check destination format
1802			case FORMAT_RGBA_ASTC_10x6_KHR:          decodeASTC(destination, source, 10, 6,  1, false); break; // FIXME: Check destination format
1803			case FORMAT_RGBA_ASTC_10x8_KHR:          decodeASTC(destination, source, 10, 8,  1, false); break; // FIXME: Check destination format
1804			case FORMAT_RGBA_ASTC_10x10_KHR:         decodeASTC(destination, source, 10, 10, 1, false); break; // FIXME: Check destination format
1805			case FORMAT_RGBA_ASTC_12x10_KHR:         decodeASTC(destination, source, 12, 10, 1, false); break; // FIXME: Check destination format
1806			case FORMAT_RGBA_ASTC_12x12_KHR:         decodeASTC(destination, source, 12, 12, 1, false); break; // FIXME: Check destination format
1807			case FORMAT_SRGB8_ALPHA8_ASTC_4x4_KHR:   decodeASTC(destination, source, 4,  4,  1, true);  break; // FIXME: Check destination format
1808			case FORMAT_SRGB8_ALPHA8_ASTC_5x4_KHR:   decodeASTC(destination, source, 5,  4,  1, true);  break; // FIXME: Check destination format
1809			case FORMAT_SRGB8_ALPHA8_ASTC_5x5_KHR:   decodeASTC(destination, source, 5,  5,  1, true);  break; // FIXME: Check destination format
1810			case FORMAT_SRGB8_ALPHA8_ASTC_6x5_KHR:   decodeASTC(destination, source, 6,  5,  1, true);  break; // FIXME: Check destination format
1811			case FORMAT_SRGB8_ALPHA8_ASTC_6x6_KHR:   decodeASTC(destination, source, 6,  6,  1, true);  break; // FIXME: Check destination format
1812			case FORMAT_SRGB8_ALPHA8_ASTC_8x5_KHR:   decodeASTC(destination, source, 8,  5,  1, true);  break; // FIXME: Check destination format
1813			case FORMAT_SRGB8_ALPHA8_ASTC_8x6_KHR:   decodeASTC(destination, source, 8,  6,  1, true);  break; // FIXME: Check destination format
1814			case FORMAT_SRGB8_ALPHA8_ASTC_8x8_KHR:   decodeASTC(destination, source, 8,  8,  1, true);  break; // FIXME: Check destination format
1815			case FORMAT_SRGB8_ALPHA8_ASTC_10x5_KHR:  decodeASTC(destination, source, 10, 5,  1, true);  break; // FIXME: Check destination format
1816			case FORMAT_SRGB8_ALPHA8_ASTC_10x6_KHR:  decodeASTC(destination, source, 10, 6,  1, true);  break; // FIXME: Check destination format
1817			case FORMAT_SRGB8_ALPHA8_ASTC_10x8_KHR:  decodeASTC(destination, source, 10, 8,  1, true);  break; // FIXME: Check destination format
1818			case FORMAT_SRGB8_ALPHA8_ASTC_10x10_KHR: decodeASTC(destination, source, 10, 10, 1, true);  break; // FIXME: Check destination format
1819			case FORMAT_SRGB8_ALPHA8_ASTC_12x10_KHR: decodeASTC(destination, source, 12, 10, 1, true);  break; // FIXME: Check destination format
1820			case FORMAT_SRGB8_ALPHA8_ASTC_12x12_KHR: decodeASTC(destination, source, 12, 12, 1, true);  break; // FIXME: Check destination format
1821			default:				genericUpdate(destination, source);		break;
1822			}
1823		}
1824	}
1825
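	// Copies the region common to both buffers, slice by slice and row by row.
	// Identical formats are copied with memcpy; otherwise each texel is routed through
	// a floating-point Color read/write to convert between formats.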
1826	void Surface::genericUpdate(Buffer &destination, Buffer &source)
1827	{
1828		unsigned char *sourceSlice = (unsigned char*)source.buffer;
1829		unsigned char *destinationSlice = (unsigned char*)destination.buffer;
1830
1831		int depth = min(destination.depth, source.depth);
1832		int height = min(destination.height, source.height);
1833		int width = min(destination.width, source.width);
1834		int rowBytes = width * source.bytes;
1835
1836		for(int z = 0; z < depth; z++)
1837		{
1838			unsigned char *sourceRow = sourceSlice;
1839			unsigned char *destinationRow = destinationSlice;
1840
1841			for(int y = 0; y < height; y++)
1842			{
1843				if(source.format == destination.format)
1844				{
1845					memcpy(destinationRow, sourceRow, rowBytes);
1846				}
1847				else
1848				{
1849					unsigned char *sourceElement = sourceRow;
1850					unsigned char *destinationElement = destinationRow;
1851
1852					for(int x = 0; x < width; x++)
1853					{
1854						Color<float> color = source.read(sourceElement);
1855						destination.write(destinationElement, color);
1856
1857						sourceElement += source.bytes;
1858						destinationElement += destination.bytes;
1859					}
1860				}
1861
1862				sourceRow += source.pitchB;
1863				destinationRow += destination.pitchB;
1864			}
1865
1866			sourceSlice += source.sliceB;
1867			destinationSlice += destination.sliceB;
1868		}
1869	}
1870
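	// The decode* helpers below expand legacy packed formats into 32-bit ARGB texels,
	// assuming an A8R8G8B8/X8R8G8B8 destination (see the FIXMEs above). They clamp to
	// the dimensions of the smaller of the two buffers.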
1871	void Surface::decodeR8G8B8(Buffer &destination, const Buffer &source)
1872	{
1873		unsigned char *sourceSlice = (unsigned char*)source.buffer;
1874		unsigned char *destinationSlice = (unsigned char*)destination.buffer;
1875
1876		for(int z = 0; z < destination.depth && z < source.depth; z++)
1877		{
1878			unsigned char *sourceRow = sourceSlice;
1879			unsigned char *destinationRow = destinationSlice;
1880
1881			for(int y = 0; y < destination.height && y < source.height; y++)
1882			{
1883				unsigned char *sourceElement = sourceRow;
1884				unsigned char *destinationElement = destinationRow;
1885
1886				for(int x = 0; x < destination.width && x < source.width; x++)
1887				{
1888					unsigned int b = sourceElement[0];
1889					unsigned int g = sourceElement[1];
1890					unsigned int r = sourceElement[2];
1891
1892					*(unsigned int*)destinationElement = 0xFF000000 | (r << 16) | (g << 8) | (b << 0);
1893
1894					sourceElement += source.bytes;
1895					destinationElement += destination.bytes;
1896				}
1897
1898				sourceRow += source.pitchB;
1899				destinationRow += destination.pitchB;
1900			}
1901
1902			sourceSlice += source.sliceB;
1903			destinationSlice += destination.sliceB;
1904		}
1905	}
1906
1907	void Surface::decodeX1R5G5B5(Buffer &destination, const Buffer &source)
1908	{
1909		unsigned char *sourceSlice = (unsigned char*)source.buffer;
1910		unsigned char *destinationSlice = (unsigned char*)destination.buffer;
1911
1912		for(int z = 0; z < destination.depth && z < source.depth; z++)
1913		{
1914			unsigned char *sourceRow = sourceSlice;
1915			unsigned char *destinationRow = destinationSlice;
1916
1917			for(int y = 0; y < destination.height && y < source.height; y++)
1918			{
1919				unsigned char *sourceElement = sourceRow;
1920				unsigned char *destinationElement = destinationRow;
1921
1922				for(int x = 0; x < destination.width && x < source.width; x++)
1923				{
1924					unsigned int xrgb = *(unsigned short*)sourceElement;
1925
1926					unsigned int r = (((xrgb & 0x7C00) * 134771 + 0x800000) >> 8) & 0x00FF0000;
1927					unsigned int g = (((xrgb & 0x03E0) * 16846 + 0x8000) >> 8) & 0x0000FF00;
1928					unsigned int b = (((xrgb & 0x001F) * 2106  + 0x80) >> 8);
1929
1930					*(unsigned int*)destinationElement = 0xFF000000 | r | g | b;
1931
1932					sourceElement += source.bytes;
1933					destinationElement += destination.bytes;
1934				}
1935
1936				sourceRow += source.pitchB;
1937				destinationRow += destination.pitchB;
1938			}
1939
1940			sourceSlice += source.sliceB;
1941			destinationSlice += destination.sliceB;
1942		}
1943	}
1944
1945	void Surface::decodeA1R5G5B5(Buffer &destination, const Buffer &source)
1946	{
1947		unsigned char *sourceSlice = (unsigned char*)source.buffer;
1948		unsigned char *destinationSlice = (unsigned char*)destination.buffer;
1949
1950		for(int z = 0; z < destination.depth && z < source.depth; z++)
1951		{
1952			unsigned char *sourceRow = sourceSlice;
1953			unsigned char *destinationRow = destinationSlice;
1954
1955			for(int y = 0; y < destination.height && y < source.height; y++)
1956			{
1957				unsigned char *sourceElement = sourceRow;
1958				unsigned char *destinationElement = destinationRow;
1959
1960				for(int x = 0; x < destination.width && x < source.width; x++)
1961				{
1962					unsigned int argb = *(unsigned short*)sourceElement;
1963
1964					unsigned int a =   (argb & 0x8000) * 130560;
1965					unsigned int r = (((argb & 0x7C00) * 134771 + 0x800000) >> 8) & 0x00FF0000;
1966					unsigned int g = (((argb & 0x03E0) * 16846  + 0x8000) >> 8) & 0x0000FF00;
1967					unsigned int b = (((argb & 0x001F) * 2106   + 0x80) >> 8);
1968
1969					*(unsigned int*)destinationElement = a | r | g | b;
1970
1971					sourceElement += source.bytes;
1972					destinationElement += destination.bytes;
1973				}
1974
1975				sourceRow += source.pitchB;
1976				destinationRow += destination.pitchB;
1977			}
1978
1979			sourceSlice += source.sliceB;
1980			destinationSlice += destination.sliceB;
1981		}
1982	}
1983
1984	void Surface::decodeX4R4G4B4(Buffer &destination, const Buffer &source)
1985	{
1986		unsigned char *sourceSlice = (unsigned char*)source.buffer;
1987		unsigned char *destinationSlice = (unsigned char*)destination.buffer;
1988
1989		for(int z = 0; z < destination.depth && z < source.depth; z++)
1990		{
1991			unsigned char *sourceRow = sourceSlice;
1992			unsigned char *destinationRow = destinationSlice;
1993
1994			for(int y = 0; y < destination.height && y < source.height; y++)
1995			{
1996				unsigned char *sourceElement = sourceRow;
1997				unsigned char *destinationElement = destinationRow;
1998
1999				for(int x = 0; x < destination.width && x < source.width; x++)
2000				{
2001					unsigned int xrgb = *(unsigned short*)sourceElement;
2002
2003					unsigned int r = ((xrgb & 0x0F00) * 0x00001100) & 0x00FF0000;
2004					unsigned int g = ((xrgb & 0x00F0) * 0x00000110) & 0x0000FF00;
2005					unsigned int b =  (xrgb & 0x000F) * 0x00000011;
2006
2007					*(unsigned int*)destinationElement = 0xFF000000 | r | g | b;
2008
2009					sourceElement += source.bytes;
2010					destinationElement += destination.bytes;
2011				}
2012
2013				sourceRow += source.pitchB;
2014				destinationRow += destination.pitchB;
2015			}
2016
2017			sourceSlice += source.sliceB;
2018			destinationSlice += destination.sliceB;
2019		}
2020	}
2021
2022	void Surface::decodeA4R4G4B4(Buffer &destination, const Buffer &source)
2023	{
2024		unsigned char *sourceSlice = (unsigned char*)source.buffer;
2025		unsigned char *destinationSlice = (unsigned char*)destination.buffer;
2026
2027		for(int z = 0; z < destination.depth && z < source.depth; z++)
2028		{
2029			unsigned char *sourceRow = sourceSlice;
2030			unsigned char *destinationRow = destinationSlice;
2031
2032			for(int y = 0; y < destination.height && y < source.height; y++)
2033			{
2034				unsigned char *sourceElement = sourceRow;
2035				unsigned char *destinationElement = destinationRow;
2036
2037				for(int x = 0; x < destination.width && x < source.width; x++)
2038				{
2039					unsigned int argb = *(unsigned short*)sourceElement;
2040
2041					unsigned int a = ((argb & 0xF000) * 0x00011000) & 0xFF000000;
2042					unsigned int r = ((argb & 0x0F00) * 0x00001100) & 0x00FF0000;
2043					unsigned int g = ((argb & 0x00F0) * 0x00000110) & 0x0000FF00;
2044					unsigned int b =  (argb & 0x000F) * 0x00000011;
2045
2046					*(unsigned int*)destinationElement = a | r | g | b;
2047
2048					sourceElement += source.bytes;
2049					destinationElement += destination.bytes;
2050				}
2051
2052				sourceRow += source.pitchB;
2053				destinationRow += destination.pitchB;
2054			}
2055
2056			sourceSlice += source.sliceB;
2057			destinationSlice += destination.sliceB;
2058		}
2059	}
2060
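	// Looks up each 8-bit index in the currently bound palette (entries stored as
	// A8B8G8R8) and swizzles the entry to A8R8G8B8 in the destination.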
2061	void Surface::decodeP8(Buffer &destination, const Buffer &source)
2062	{
2063		unsigned char *sourceSlice = (unsigned char*)source.buffer;
2064		unsigned char *destinationSlice = (unsigned char*)destination.buffer;
2065
2066		for(int z = 0; z < destination.depth && z < source.depth; z++)
2067		{
2068			unsigned char *sourceRow = sourceSlice;
2069			unsigned char *destinationRow = destinationSlice;
2070
2071			for(int y = 0; y < destination.height && y < source.height; y++)
2072			{
2073				unsigned char *sourceElement = sourceRow;
2074				unsigned char *destinationElement = destinationRow;
2075
2076				for(int x = 0; x < destination.width && x < source.width; x++)
2077				{
2078					unsigned int abgr = palette[*(unsigned char*)sourceElement];
2079
2080					unsigned int r = (abgr & 0x000000FF) << 16;
2081					unsigned int g = (abgr & 0x0000FF00) << 0;
2082					unsigned int b = (abgr & 0x00FF0000) >> 16;
2083					unsigned int a = (abgr & 0xFF000000) >> 0;
2084
2085					*(unsigned int*)destinationElement = a | r | g | b;
2086
2087					sourceElement += source.bytes;
2088					destinationElement += destination.bytes;
2089				}
2090
2091				sourceRow += source.pitchB;
2092				destinationRow += destination.pitchB;
2093			}
2094
2095			sourceSlice += source.sliceB;
2096			destinationSlice += destination.sliceB;
2097		}
2098	}
2099
2100#if S3TC_SUPPORT
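	// Decodes DXT1 (BC1) blocks: two 5:6:5 endpoint colors plus a 2-bit index per texel.
	// When c0 > c1 the two intermediate colors are interpolated at 1/3 and 2/3; otherwise
	// c2 is the average of the endpoints and c3 is transparent black.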
2101	void Surface::decodeDXT1(Buffer &internal, const Buffer &external)
2102	{
2103		unsigned int *destSlice = (unsigned int*)internal.buffer;
2104		const DXT1 *source = (const DXT1*)external.buffer;
2105
2106		for(int z = 0; z < external.depth; z++)
2107		{
2108			unsigned int *dest = destSlice;
2109
2110			for(int y = 0; y < external.height; y += 4)
2111			{
2112				for(int x = 0; x < external.width; x += 4)
2113				{
2114					Color<byte> c[4];
2115
2116					c[0] = source->c0;
2117					c[1] = source->c1;
2118
2119					if(source->c0 > source->c1)   // No transparency
2120					{
2121						// c2 = 2 / 3 * c0 + 1 / 3 * c1
2122						c[2].r = (byte)((2 * (word)c[0].r + (word)c[1].r + 1) / 3);
2123						c[2].g = (byte)((2 * (word)c[0].g + (word)c[1].g + 1) / 3);
2124						c[2].b = (byte)((2 * (word)c[0].b + (word)c[1].b + 1) / 3);
2125						c[2].a = 0xFF;
2126
2127						// c3 = 1 / 3 * c0 + 2 / 3 * c1
2128						c[3].r = (byte)(((word)c[0].r + 2 * (word)c[1].r + 1) / 3);
2129						c[3].g = (byte)(((word)c[0].g + 2 * (word)c[1].g + 1) / 3);
2130						c[3].b = (byte)(((word)c[0].b + 2 * (word)c[1].b + 1) / 3);
2131						c[3].a = 0xFF;
2132					}
2133					else   // c3 transparent
2134					{
2135						// c2 = 1 / 2 * c0 + 1 / 2 * c1
2136						c[2].r = (byte)(((word)c[0].r + (word)c[1].r) / 2);
2137						c[2].g = (byte)(((word)c[0].g + (word)c[1].g) / 2);
2138						c[2].b = (byte)(((word)c[0].b + (word)c[1].b) / 2);
2139						c[2].a = 0xFF;
2140
2141						c[3].r = 0;
2142						c[3].g = 0;
2143						c[3].b = 0;
2144						c[3].a = 0;
2145					}
2146
2147					for(int j = 0; j < 4 && (y + j) < internal.height; j++)
2148					{
2149						for(int i = 0; i < 4 && (x + i) < internal.width; i++)
2150						{
2151							dest[(x + i) + (y + j) * internal.width] = c[(unsigned int)(source->lut >> 2 * (i + j * 4)) % 4];
2152						}
2153					}
2154
2155					source++;
2156				}
2157			}
2158
2159			(byte*&)destSlice += internal.sliceB;
2160		}
2161	}
2162
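	// Decodes DXT3 (BC2) blocks: the color part is always interpolated in four-color
	// mode, and each texel carries an explicit 4-bit alpha that is expanded to 8 bits
	// by replication.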
2163	void Surface::decodeDXT3(Buffer &internal, const Buffer &external)
2164	{
2165		unsigned int *destSlice = (unsigned int*)internal.buffer;
2166		const DXT3 *source = (const DXT3*)external.buffer;
2167
2168		for(int z = 0; z < external.depth; z++)
2169		{
2170			unsigned int *dest = destSlice;
2171
2172			for(int y = 0; y < external.height; y += 4)
2173			{
2174				for(int x = 0; x < external.width; x += 4)
2175				{
2176					Color<byte> c[4];
2177
2178					c[0] = source->c0;
2179					c[1] = source->c1;
2180
2181					// c2 = 2 / 3 * c0 + 1 / 3 * c1
2182					c[2].r = (byte)((2 * (word)c[0].r + (word)c[1].r + 1) / 3);
2183					c[2].g = (byte)((2 * (word)c[0].g + (word)c[1].g + 1) / 3);
2184					c[2].b = (byte)((2 * (word)c[0].b + (word)c[1].b + 1) / 3);
2185
2186					// c3 = 1 / 3 * c0 + 2 / 3 * c1
2187					c[3].r = (byte)(((word)c[0].r + 2 * (word)c[1].r + 1) / 3);
2188					c[3].g = (byte)(((word)c[0].g + 2 * (word)c[1].g + 1) / 3);
2189					c[3].b = (byte)(((word)c[0].b + 2 * (word)c[1].b + 1) / 3);
2190
2191					for(int j = 0; j < 4 && (y + j) < internal.height; j++)
2192					{
2193						for(int i = 0; i < 4 && (x + i) < internal.width; i++)
2194						{
2195							unsigned int a = (unsigned int)(source->a >> 4 * (i + j * 4)) & 0x0F;
2196							unsigned int color = (c[(unsigned int)(source->lut >> 2 * (i + j * 4)) % 4] & 0x00FFFFFF) | ((a << 28) + (a << 24));
2197
2198							dest[(x + i) + (y + j) * internal.width] = color;
2199						}
2200					}
2201
2202					source++;
2203				}
2204			}
2205
2206			(byte*&)destSlice += internal.sliceB;
2207		}
2208	}
2209
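	// Decodes DXT5 (BC3) blocks: four-color interpolation for RGB plus an interpolated
	// alpha block. When a0 > a1 six intermediate alpha values are generated; otherwise
	// four are generated and the remaining two codes map to 0 and 255.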
2210	void Surface::decodeDXT5(Buffer &internal, const Buffer &external)
2211	{
2212		unsigned int *destSlice = (unsigned int*)internal.buffer;
2213		const DXT5 *source = (const DXT5*)external.buffer;
2214
2215		for(int z = 0; z < external.depth; z++)
2216		{
2217			unsigned int *dest = destSlice;
2218
2219			for(int y = 0; y < external.height; y += 4)
2220			{
2221				for(int x = 0; x < external.width; x += 4)
2222				{
2223					Color<byte> c[4];
2224
2225					c[0] = source->c0;
2226					c[1] = source->c1;
2227
2228					// c2 = 2 / 3 * c0 + 1 / 3 * c1
2229					c[2].r = (byte)((2 * (word)c[0].r + (word)c[1].r + 1) / 3);
2230					c[2].g = (byte)((2 * (word)c[0].g + (word)c[1].g + 1) / 3);
2231					c[2].b = (byte)((2 * (word)c[0].b + (word)c[1].b + 1) / 3);
2232
2233					// c3 = 1 / 3 * c0 + 2 / 3 * c1
2234					c[3].r = (byte)(((word)c[0].r + 2 * (word)c[1].r + 1) / 3);
2235					c[3].g = (byte)(((word)c[0].g + 2 * (word)c[1].g + 1) / 3);
2236					c[3].b = (byte)(((word)c[0].b + 2 * (word)c[1].b + 1) / 3);
2237
2238					byte a[8];
2239
2240					a[0] = source->a0;
2241					a[1] = source->a1;
2242
2243					if(a[0] > a[1])
2244					{
2245						a[2] = (byte)((6 * (word)a[0] + 1 * (word)a[1] + 3) / 7);
2246						a[3] = (byte)((5 * (word)a[0] + 2 * (word)a[1] + 3) / 7);
2247						a[4] = (byte)((4 * (word)a[0] + 3 * (word)a[1] + 3) / 7);
2248						a[5] = (byte)((3 * (word)a[0] + 4 * (word)a[1] + 3) / 7);
2249						a[6] = (byte)((2 * (word)a[0] + 5 * (word)a[1] + 3) / 7);
2250						a[7] = (byte)((1 * (word)a[0] + 6 * (word)a[1] + 3) / 7);
2251					}
2252					else
2253					{
2254						a[2] = (byte)((4 * (word)a[0] + 1 * (word)a[1] + 2) / 5);
2255						a[3] = (byte)((3 * (word)a[0] + 2 * (word)a[1] + 2) / 5);
2256						a[4] = (byte)((2 * (word)a[0] + 3 * (word)a[1] + 2) / 5);
2257						a[5] = (byte)((1 * (word)a[0] + 4 * (word)a[1] + 2) / 5);
2258						a[6] = 0;
2259						a[7] = 0xFF;
2260					}
2261
2262					for(int j = 0; j < 4 && (y + j) < internal.height; j++)
2263					{
2264						for(int i = 0; i < 4 && (x + i) < internal.width; i++)
2265						{
2266							unsigned int alpha = (unsigned int)a[(unsigned int)(source->alut >> (16 + 3 * (i + j * 4))) % 8] << 24;
2267							unsigned int color = (c[(source->clut >> 2 * (i + j * 4)) % 4] & 0x00FFFFFF) | alpha;
2268
2269							dest[(x + i) + (y + j) * internal.width] = color;
2270						}
2271					}
2272
2273					source++;
2274				}
2275			}
2276
2277			(byte*&)destSlice += internal.sliceB;
2278		}
2279	}
2280#endif
2281
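	// Decodes ATI1 (BC4) blocks: a single channel using the same 3-bit indexed
	// interpolation scheme as the DXT5 alpha block, written out as 8-bit red.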
2282	void Surface::decodeATI1(Buffer &internal, const Buffer &external)
2283	{
2284		byte *destSlice = (byte*)internal.buffer;
2285		const ATI1 *source = (const ATI1*)external.buffer;
2286
2287		for(int z = 0; z < external.depth; z++)
2288		{
2289			byte *dest = destSlice;
2290
2291			for(int y = 0; y < external.height; y += 4)
2292			{
2293				for(int x = 0; x < external.width; x += 4)
2294				{
2295					byte r[8];
2296
2297					r[0] = source->r0;
2298					r[1] = source->r1;
2299
2300					if(r[0] > r[1])
2301					{
2302						r[2] = (byte)((6 * (word)r[0] + 1 * (word)r[1] + 3) / 7);
2303						r[3] = (byte)((5 * (word)r[0] + 2 * (word)r[1] + 3) / 7);
2304						r[4] = (byte)((4 * (word)r[0] + 3 * (word)r[1] + 3) / 7);
2305						r[5] = (byte)((3 * (word)r[0] + 4 * (word)r[1] + 3) / 7);
2306						r[6] = (byte)((2 * (word)r[0] + 5 * (word)r[1] + 3) / 7);
2307						r[7] = (byte)((1 * (word)r[0] + 6 * (word)r[1] + 3) / 7);
2308					}
2309					else
2310					{
2311						r[2] = (byte)((4 * (word)r[0] + 1 * (word)r[1] + 2) / 5);
2312						r[3] = (byte)((3 * (word)r[0] + 2 * (word)r[1] + 2) / 5);
2313						r[4] = (byte)((2 * (word)r[0] + 3 * (word)r[1] + 2) / 5);
2314						r[5] = (byte)((1 * (word)r[0] + 4 * (word)r[1] + 2) / 5);
2315						r[6] = 0;
2316						r[7] = 0xFF;
2317					}
2318
2319					for(int j = 0; j < 4 && (y + j) < internal.height; j++)
2320					{
2321						for(int i = 0; i < 4 && (x + i) < internal.width; i++)
2322						{
2323							dest[(x + i) + (y + j) * internal.width] = r[(unsigned int)(source->rlut >> (16 + 3 * (i + j * 4))) % 8];
2324						}
2325					}
2326
2327					source++;
2328				}
2329			}
2330
2331			destSlice += internal.sliceB;
2332		}
2333	}
2334
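	// Decodes ATI2 (BC5) blocks: two independent BC4-style channels, packed into the
	// destination as 16-bit texels with X in the low byte and Y in the high byte.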
2335	void Surface::decodeATI2(Buffer &internal, const Buffer &external)
2336	{
2337		word *destSlice = (word*)internal.buffer;
2338		const ATI2 *source = (const ATI2*)external.buffer;
2339
2340		for(int z = 0; z < external.depth; z++)
2341		{
2342			word *dest = destSlice;
2343
2344			for(int y = 0; y < external.height; y += 4)
2345			{
2346				for(int x = 0; x < external.width; x += 4)
2347				{
2348					byte X[8];
2349
2350					X[0] = source->x0;
2351					X[1] = source->x1;
2352
2353					if(X[0] > X[1])
2354					{
2355						X[2] = (byte)((6 * (word)X[0] + 1 * (word)X[1] + 3) / 7);
2356						X[3] = (byte)((5 * (word)X[0] + 2 * (word)X[1] + 3) / 7);
2357						X[4] = (byte)((4 * (word)X[0] + 3 * (word)X[1] + 3) / 7);
2358						X[5] = (byte)((3 * (word)X[0] + 4 * (word)X[1] + 3) / 7);
2359						X[6] = (byte)((2 * (word)X[0] + 5 * (word)X[1] + 3) / 7);
2360						X[7] = (byte)((1 * (word)X[0] + 6 * (word)X[1] + 3) / 7);
2361					}
2362					else
2363					{
2364						X[2] = (byte)((4 * (word)X[0] + 1 * (word)X[1] + 2) / 5);
2365						X[3] = (byte)((3 * (word)X[0] + 2 * (word)X[1] + 2) / 5);
2366						X[4] = (byte)((2 * (word)X[0] + 3 * (word)X[1] + 2) / 5);
2367						X[5] = (byte)((1 * (word)X[0] + 4 * (word)X[1] + 2) / 5);
2368						X[6] = 0;
2369						X[7] = 0xFF;
2370					}
2371
2372					byte Y[8];
2373
2374					Y[0] = source->y0;
2375					Y[1] = source->y1;
2376
2377					if(Y[0] > Y[1])
2378					{
2379						Y[2] = (byte)((6 * (word)Y[0] + 1 * (word)Y[1] + 3) / 7);
2380						Y[3] = (byte)((5 * (word)Y[0] + 2 * (word)Y[1] + 3) / 7);
2381						Y[4] = (byte)((4 * (word)Y[0] + 3 * (word)Y[1] + 3) / 7);
2382						Y[5] = (byte)((3 * (word)Y[0] + 4 * (word)Y[1] + 3) / 7);
2383						Y[6] = (byte)((2 * (word)Y[0] + 5 * (word)Y[1] + 3) / 7);
2384						Y[7] = (byte)((1 * (word)Y[0] + 6 * (word)Y[1] + 3) / 7);
2385					}
2386					else
2387					{
2388						Y[2] = (byte)((4 * (word)Y[0] + 1 * (word)Y[1] + 2) / 5);
2389						Y[3] = (byte)((3 * (word)Y[0] + 2 * (word)Y[1] + 2) / 5);
2390						Y[4] = (byte)((2 * (word)Y[0] + 3 * (word)Y[1] + 2) / 5);
2391						Y[5] = (byte)((1 * (word)Y[0] + 4 * (word)Y[1] + 2) / 5);
2392						Y[6] = 0;
2393						Y[7] = 0xFF;
2394					}
2395
2396					for(int j = 0; j < 4 && (y + j) < internal.height; j++)
2397					{
2398						for(int i = 0; i < 4 && (x + i) < internal.width; i++)
2399						{
2400							word r = X[(unsigned int)(source->xlut >> (16 + 3 * (i + j * 4))) % 8];
2401							word g = Y[(unsigned int)(source->ylut >> (16 + 3 * (i + j * 4))) % 8];
2402
2403							dest[(x + i) + (y + j) * internal.width] = (g << 8) + r;
2404						}
2405					}
2406
2407					source++;
2408				}
2409			}
2410
2411			(byte*&)destSlice += internal.sliceB;
2412		}
2413	}
2414
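	// Decompresses ETC1/ETC2 blocks through ETC_Decoder. For sRGB variants the decoded
	// texels are then converted to linear space in place, using a lazily initialized
	// 256-entry lookup table applied to the first three (RGB) bytes of each texel.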
2415	void Surface::decodeETC2(Buffer &internal, const Buffer &external, int nbAlphaBits, bool isSRGB)
2416	{
2417		ETC_Decoder::Decode((const byte*)external.buffer, (byte*)internal.buffer, external.width, external.height, internal.width, internal.height, internal.pitchB, internal.bytes,
2418		                    (nbAlphaBits == 8) ? ETC_Decoder::ETC_RGBA : ((nbAlphaBits == 1) ? ETC_Decoder::ETC_RGB_PUNCHTHROUGH_ALPHA : ETC_Decoder::ETC_RGB));
2419
2420		if(isSRGB)
2421		{
2422			static byte sRGBtoLinearTable[256];
2423			static bool sRGBtoLinearTableDirty = true;
2424			if(sRGBtoLinearTableDirty)
2425			{
2426				for(int i = 0; i < 256; i++)
2427				{
2428					sRGBtoLinearTable[i] = static_cast<byte>(sRGBtoLinear(static_cast<float>(i) / 255.0f) * 255.0f + 0.5f);
2429				}
2430				sRGBtoLinearTableDirty = false;
2431			}
2432
2433			// Perform sRGB conversion in place after decoding
2434			byte* src = (byte*)internal.buffer;
2435			for(int y = 0; y < internal.height; y++)
2436			{
2437				byte* srcRow = src + y * internal.pitchB;
2438				for(int x = 0; x < internal.width; x++)
2439				{
2440					byte* srcPix = srcRow + x * internal.bytes;
2441					for(int i = 0; i < 3; i++)
2442					{
2443						srcPix[i] = sRGBtoLinearTable[srcPix[i]];
2444					}
2445				}
2446			}
2447		}
2448	}
2449
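	// Decompresses R11/RG11 EAC blocks through ETC_Decoder. Signed data is then widened
	// in place to one float per channel, scaled by 1/127.875 and clamped to [-1, 1];
	// the loops walk backwards so source bytes are not overwritten before being read.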
2450	void Surface::decodeEAC(Buffer &internal, const Buffer &external, int nbChannels, bool isSigned)
2451	{
2452		ASSERT(nbChannels == 1 || nbChannels == 2);
2453
2454		ETC_Decoder::Decode((const byte*)external.buffer, (byte*)internal.buffer, external.width, external.height, internal.width, internal.height, internal.pitchB, internal.bytes,
2455		                    (nbChannels == 1) ? (isSigned ? ETC_Decoder::ETC_R_SIGNED : ETC_Decoder::ETC_R_UNSIGNED) : (isSigned ? ETC_Decoder::ETC_RG_SIGNED : ETC_Decoder::ETC_RG_UNSIGNED));
2456
2457		// FIXME: We convert signed data to float, until signed integer internal formats are supported
2458		//        This code can be removed if signed ETC2 images are decoded to internal 8 bit signed R/RG formats
2459		if(isSigned)
2460		{
2461			sbyte* src = (sbyte*)internal.buffer;
2462
2463			for(int y = 0; y < internal.height; y++)
2464			{
2465				sbyte* srcRow = src + y * internal.pitchB;
2466				for(int x = internal.width - 1; x >= 0; x--)
2467				{
2468					int dx = x & 0xFFFFFFFC;
2469					int mx = x - dx;
2470					sbyte* srcPix = srcRow + dx * internal.bytes + mx * nbChannels;
2471					float* dstPix = (float*)(srcRow + x * internal.bytes);
2472					for(int c = nbChannels - 1; c >= 0; c--)
2473					{
2474						static const float normalization = 1.0f / 127.875f;
2475						dstPix[c] = clamp(static_cast<float>(srcPix[c]) * normalization, -1.0f, 1.0f);
2476					}
2477				}
2478			}
2479		}
2480	}
2481
2482	void Surface::decodeASTC(Buffer &internal, const Buffer &external, int xBlockSize, int yBlockSize, int zBlockSize, bool isSRGB)
2483	{
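		// ASTC decompression is not implemented yet; the destination buffer is left untouched.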
2484	}
2485
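	// Returns the number of bytes required to store a width x height x depth image of
	// the given format. Block-compressed formats round the dimensions up to multiples
	// of their block size; the planar YV12 formats use a 16-byte aligned luma stride
	// plus two half-resolution chroma planes.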
2486	unsigned int Surface::size(int width, int height, int depth, Format format)
2487	{
2488		// Dimensions rounded up to multiples of 4, used for compressed formats
2489		int width4 = align(width, 4);
2490		int height4 = align(height, 4);
2491
2492		switch(format)
2493		{
2494		#if S3TC_SUPPORT
2495		case FORMAT_DXT1:
2496		#endif
2497		case FORMAT_ATI1:
2498		case FORMAT_ETC1:
2499		case FORMAT_R11_EAC:
2500		case FORMAT_SIGNED_R11_EAC:
2501		case FORMAT_RGB8_ETC2:
2502		case FORMAT_SRGB8_ETC2:
2503		case FORMAT_RGB8_PUNCHTHROUGH_ALPHA1_ETC2:
2504		case FORMAT_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2:
2505			return width4 * height4 * depth / 2;
2506		#if S3TC_SUPPORT
2507		case FORMAT_DXT3:
2508		case FORMAT_DXT5:
2509		#endif
2510		case FORMAT_ATI2:
2511		case FORMAT_RG11_EAC:
2512		case FORMAT_SIGNED_RG11_EAC:
2513		case FORMAT_RGBA8_ETC2_EAC:
2514		case FORMAT_SRGB8_ALPHA8_ETC2_EAC:
2515		case FORMAT_RGBA_ASTC_4x4_KHR:
2516		case FORMAT_SRGB8_ALPHA8_ASTC_4x4_KHR:
2517			return width4 * height4 * depth;
2518		case FORMAT_RGBA_ASTC_5x4_KHR:
2519		case FORMAT_SRGB8_ALPHA8_ASTC_5x4_KHR:
2520			return align(width, 5) * height4 * depth;
2521		case FORMAT_RGBA_ASTC_5x5_KHR:
2522		case FORMAT_SRGB8_ALPHA8_ASTC_5x5_KHR:
2523			return align(width, 5) * align(height, 5) * depth;
2524		case FORMAT_RGBA_ASTC_6x5_KHR:
2525		case FORMAT_SRGB8_ALPHA8_ASTC_6x5_KHR:
2526			return align(width, 6) * align(height, 5) * depth;
2527		case FORMAT_RGBA_ASTC_6x6_KHR:
2528		case FORMAT_SRGB8_ALPHA8_ASTC_6x6_KHR:
2529			return align(width, 6) * align(height, 6) * depth;
2530		case FORMAT_RGBA_ASTC_8x5_KHR:
2531		case FORMAT_SRGB8_ALPHA8_ASTC_8x5_KHR:
2532			return align(width, 8) * align(height, 5) * depth;
2533		case FORMAT_RGBA_ASTC_8x6_KHR:
2534		case FORMAT_SRGB8_ALPHA8_ASTC_8x6_KHR:
2535			return align(width, 8) * align(height, 6) * depth;
2536		case FORMAT_RGBA_ASTC_8x8_KHR:
2537		case FORMAT_SRGB8_ALPHA8_ASTC_8x8_KHR:
2538			return align(width, 8) * align(height, 8) * depth;
2539		case FORMAT_RGBA_ASTC_10x5_KHR:
2540		case FORMAT_SRGB8_ALPHA8_ASTC_10x5_KHR:
2541			return align(width, 10) * align(height, 5) * depth;
2542		case FORMAT_RGBA_ASTC_10x6_KHR:
2543		case FORMAT_SRGB8_ALPHA8_ASTC_10x6_KHR:
2544			return align(width, 10) * align(height, 6) * depth;
2545		case FORMAT_RGBA_ASTC_10x8_KHR:
2546		case FORMAT_SRGB8_ALPHA8_ASTC_10x8_KHR:
2547			return align(width, 10) * align(height, 8) * depth;
2548		case FORMAT_RGBA_ASTC_10x10_KHR:
2549		case FORMAT_SRGB8_ALPHA8_ASTC_10x10_KHR:
2550			return align(width, 10) * align(height, 10) * depth;
2551		case FORMAT_RGBA_ASTC_12x10_KHR:
2552		case FORMAT_SRGB8_ALPHA8_ASTC_12x10_KHR:
2553			return align(width, 12) * align(height, 10) * depth;
2554		case FORMAT_RGBA_ASTC_12x12_KHR:
2555		case FORMAT_SRGB8_ALPHA8_ASTC_12x12_KHR:
2556			return align(width, 12) * align(height, 12) * depth;
2557		case FORMAT_YV12_BT601:
2558		case FORMAT_YV12_BT709:
2559		case FORMAT_YV12_JFIF:
2560			{
2561				unsigned int YStride = align(width, 16);
2562				unsigned int YSize = YStride * height;
2563				unsigned int CStride = align(YStride / 2, 16);
2564				unsigned int CSize = CStride * height / 2;
2565
2566				return YSize + 2 * CSize;
2567			}
2568		default:
2569			return bytes(format) * width * height * depth;
2570		}
2571
2572		return 0;
2573	}
2574
2575	bool Surface::isStencil(Format format)
2576	{
2577		switch(format)
2578		{
2579		case FORMAT_D32:
2580		case FORMAT_D16:
2581		case FORMAT_D24X8:
2582		case FORMAT_D32F:
2583		case FORMAT_D32F_COMPLEMENTARY:
2584		case FORMAT_D32F_LOCKABLE:
2585			return false;
2586		case FORMAT_D24S8:
2587		case FORMAT_D24FS8:
2588		case FORMAT_S8:
2589		case FORMAT_DF24S8:
2590		case FORMAT_DF16S8:
2591		case FORMAT_D32FS8_TEXTURE:
2592		case FORMAT_D32FS8_SHADOW:
2593		case FORMAT_INTZ:
2594			return true;
2595		default:
2596			return false;
2597		}
2598	}
2599
2600	bool Surface::isDepth(Format format)
2601	{
2602		switch(format)
2603		{
2604		case FORMAT_D32:
2605		case FORMAT_D16:
2606		case FORMAT_D24X8:
2607		case FORMAT_D24S8:
2608		case FORMAT_D24FS8:
2609		case FORMAT_D32F:
2610		case FORMAT_D32F_COMPLEMENTARY:
2611		case FORMAT_D32F_LOCKABLE:
2612		case FORMAT_DF24S8:
2613		case FORMAT_DF16S8:
2614		case FORMAT_D32FS8_TEXTURE:
2615		case FORMAT_D32FS8_SHADOW:
2616		case FORMAT_INTZ:
2617			return true;
2618		case FORMAT_S8:
2619			return false;
2620		default:
2621			return false;
2622		}
2623	}
2624
2625	bool Surface::isPalette(Format format)
2626	{
2627		switch(format)
2628		{
2629		case FORMAT_P8:
2630		case FORMAT_A8P8:
2631			return true;
2632		default:
2633			return false;
2634		}
2635	}
2636
2637	bool Surface::isFloatFormat(Format format)
2638	{
2639		switch(format)
2640		{
2641		case FORMAT_R5G6B5:
2642		case FORMAT_R8G8B8:
2643		case FORMAT_B8G8R8:
2644		case FORMAT_X8R8G8B8:
2645		case FORMAT_X8B8G8R8I:
2646		case FORMAT_X8B8G8R8:
2647		case FORMAT_A8R8G8B8:
2648		case FORMAT_A8B8G8R8I:
2649		case FORMAT_R8UI:
2650		case FORMAT_G8R8UI:
2651		case FORMAT_X8B8G8R8UI:
2652		case FORMAT_A8B8G8R8UI:
2653		case FORMAT_A8B8G8R8:
2654		case FORMAT_G8R8I:
2655		case FORMAT_G8R8:
2656		case FORMAT_A2B10G10R10:
2657		case FORMAT_R8I_SNORM:
2658		case FORMAT_G8R8I_SNORM:
2659		case FORMAT_X8B8G8R8I_SNORM:
2660		case FORMAT_A8B8G8R8I_SNORM:
2661		case FORMAT_R16I:
2662		case FORMAT_R16UI:
2663		case FORMAT_G16R16I:
2664		case FORMAT_G16R16UI:
2665		case FORMAT_G16R16:
2666		case FORMAT_X16B16G16R16I:
2667		case FORMAT_X16B16G16R16UI:
2668		case FORMAT_A16B16G16R16I:
2669		case FORMAT_A16B16G16R16UI:
2670		case FORMAT_A16B16G16R16:
2671		case FORMAT_V8U8:
2672		case FORMAT_Q8W8V8U8:
2673		case FORMAT_X8L8V8U8:
2674		case FORMAT_V16U16:
2675		case FORMAT_A16W16V16U16:
2676		case FORMAT_Q16W16V16U16:
2677		case FORMAT_A8:
2678		case FORMAT_R8I:
2679		case FORMAT_R8:
2680		case FORMAT_L8:
2681		case FORMAT_L16:
2682		case FORMAT_A8L8:
2683		case FORMAT_YV12_BT601:
2684		case FORMAT_YV12_BT709:
2685		case FORMAT_YV12_JFIF:
2686		case FORMAT_R32I:
2687		case FORMAT_R32UI:
2688		case FORMAT_G32R32I:
2689		case FORMAT_G32R32UI:
2690		case FORMAT_X32B32G32R32I:
2691		case FORMAT_X32B32G32R32UI:
2692		case FORMAT_A32B32G32R32I:
2693		case FORMAT_A32B32G32R32UI:
2694			return false;
2695		case FORMAT_R32F:
2696		case FORMAT_G32R32F:
2697		case FORMAT_X32B32G32R32F:
2698		case FORMAT_A32B32G32R32F:
2699		case FORMAT_D32F:
2700		case FORMAT_D32F_COMPLEMENTARY:
2701		case FORMAT_D32F_LOCKABLE:
2702		case FORMAT_D32FS8_TEXTURE:
2703		case FORMAT_D32FS8_SHADOW:
2704		case FORMAT_L16F:
2705		case FORMAT_A16L16F:
2706		case FORMAT_L32F:
2707		case FORMAT_A32L32F:
2708			return true;
2709		default:
2710			ASSERT(false);
2711		}
2712
2713		return false;
2714	}
2715
2716	bool Surface::isUnsignedComponent(Format format, int component)
2717	{
2718		switch(format)
2719		{
2720		case FORMAT_NULL:
2721		case FORMAT_R5G6B5:
2722		case FORMAT_R8G8B8:
2723		case FORMAT_B8G8R8:
2724		case FORMAT_X8R8G8B8:
2725		case FORMAT_X8B8G8R8:
2726		case FORMAT_A8R8G8B8:
2727		case FORMAT_A8B8G8R8:
2728		case FORMAT_G8R8:
2729		case FORMAT_A2B10G10R10:
2730		case FORMAT_R16UI:
2731		case FORMAT_G16R16:
2732		case FORMAT_G16R16UI:
2733		case FORMAT_X16B16G16R16UI:
2734		case FORMAT_A16B16G16R16:
2735		case FORMAT_A16B16G16R16UI:
2736		case FORMAT_R32UI:
2737		case FORMAT_G32R32UI:
2738		case FORMAT_X32B32G32R32UI:
2739		case FORMAT_A32B32G32R32UI:
2740		case FORMAT_R8UI:
2741		case FORMAT_G8R8UI:
2742		case FORMAT_X8B8G8R8UI:
2743		case FORMAT_A8B8G8R8UI:
2744		case FORMAT_D32F:
2745		case FORMAT_D32F_COMPLEMENTARY:
2746		case FORMAT_D32F_LOCKABLE:
2747		case FORMAT_D32FS8_TEXTURE:
2748		case FORMAT_D32FS8_SHADOW:
2749		case FORMAT_A8:
2750		case FORMAT_R8:
2751		case FORMAT_L8:
2752		case FORMAT_L16:
2753		case FORMAT_A8L8:
2754		case FORMAT_YV12_BT601:
2755		case FORMAT_YV12_BT709:
2756		case FORMAT_YV12_JFIF:
2757			return true;
2758		case FORMAT_A8B8G8R8I:
2759		case FORMAT_A16B16G16R16I:
2760		case FORMAT_A32B32G32R32I:
2761		case FORMAT_A8B8G8R8I_SNORM:
2762		case FORMAT_Q8W8V8U8:
2763		case FORMAT_Q16W16V16U16:
2764		case FORMAT_A32B32G32R32F:
2765			return false;
2766		case FORMAT_R32F:
2767		case FORMAT_R8I:
2768		case FORMAT_R16I:
2769		case FORMAT_R32I:
2770		case FORMAT_R8I_SNORM:
2771			return component >= 1;
2772		case FORMAT_V8U8:
2773		case FORMAT_X8L8V8U8:
2774		case FORMAT_V16U16:
2775		case FORMAT_G32R32F:
2776		case FORMAT_G8R8I:
2777		case FORMAT_G16R16I:
2778		case FORMAT_G32R32I:
2779		case FORMAT_G8R8I_SNORM:
2780			return component >= 2;
2781		case FORMAT_A16W16V16U16:
2782		case FORMAT_X32B32G32R32F:
2783		case FORMAT_X8B8G8R8I:
2784		case FORMAT_X16B16G16R16I:
2785		case FORMAT_X32B32G32R32I:
2786		case FORMAT_X8B8G8R8I_SNORM:
2787			return component >= 3;
2788		default:
2789			ASSERT(false);
2790		}
2791
2792		return false;
2793	}
2794
2795	bool Surface::isSRGBreadable(Format format)
2796	{
2797		// Keep in sync with Capabilities::isSRGBreadable
2798		switch(format)
2799		{
2800		case FORMAT_L8:
2801		case FORMAT_A8L8:
2802		case FORMAT_R8G8B8:
2803		case FORMAT_A8R8G8B8:
2804		case FORMAT_X8R8G8B8:
2805		case FORMAT_A8B8G8R8:
2806		case FORMAT_X8B8G8R8:
2807		case FORMAT_R5G6B5:
2808		case FORMAT_X1R5G5B5:
2809		case FORMAT_A1R5G5B5:
2810		case FORMAT_A4R4G4B4:
2811		#if S3TC_SUPPORT
2812		case FORMAT_DXT1:
2813		case FORMAT_DXT3:
2814		case FORMAT_DXT5:
2815		#endif
2816		case FORMAT_ATI1:
2817		case FORMAT_ATI2:
2818			return true;
2819		default:
2820			return false;
2821		}
2822
2823		return false;
2824	}
2825
2826	bool Surface::isSRGBwritable(Format format)
2827	{
2828		// Keep in sync with Capabilities::isSRGBwritable
2829		switch(format)
2830		{
2831		case FORMAT_NULL:
2832		case FORMAT_A8R8G8B8:
2833		case FORMAT_X8R8G8B8:
2834		case FORMAT_A8B8G8R8:
2835		case FORMAT_X8B8G8R8:
2836		case FORMAT_R5G6B5:
2837			return true;
2838		default:
2839			return false;
2840		}
2841	}
2842
2843	bool Surface::isCompressed(Format format)
2844	{
2845		switch(format)
2846		{
2847		#if S3TC_SUPPORT
2848		case FORMAT_DXT1:
2849		case FORMAT_DXT3:
2850		case FORMAT_DXT5:
2851		#endif
2852		case FORMAT_ATI1:
2853		case FORMAT_ATI2:
2854		case FORMAT_ETC1:
2855		case FORMAT_R11_EAC:
2856		case FORMAT_SIGNED_R11_EAC:
2857		case FORMAT_RG11_EAC:
2858		case FORMAT_SIGNED_RG11_EAC:
2859		case FORMAT_RGB8_ETC2:
2860		case FORMAT_SRGB8_ETC2:
2861		case FORMAT_RGB8_PUNCHTHROUGH_ALPHA1_ETC2:
2862		case FORMAT_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2:
2863		case FORMAT_RGBA8_ETC2_EAC:
2864		case FORMAT_SRGB8_ALPHA8_ETC2_EAC:
2865		case FORMAT_RGBA_ASTC_4x4_KHR:
2866		case FORMAT_RGBA_ASTC_5x4_KHR:
2867		case FORMAT_RGBA_ASTC_5x5_KHR:
2868		case FORMAT_RGBA_ASTC_6x5_KHR:
2869		case FORMAT_RGBA_ASTC_6x6_KHR:
2870		case FORMAT_RGBA_ASTC_8x5_KHR:
2871		case FORMAT_RGBA_ASTC_8x6_KHR:
2872		case FORMAT_RGBA_ASTC_8x8_KHR:
2873		case FORMAT_RGBA_ASTC_10x5_KHR:
2874		case FORMAT_RGBA_ASTC_10x6_KHR:
2875		case FORMAT_RGBA_ASTC_10x8_KHR:
2876		case FORMAT_RGBA_ASTC_10x10_KHR:
2877		case FORMAT_RGBA_ASTC_12x10_KHR:
2878		case FORMAT_RGBA_ASTC_12x12_KHR:
2879		case FORMAT_SRGB8_ALPHA8_ASTC_4x4_KHR:
2880		case FORMAT_SRGB8_ALPHA8_ASTC_5x4_KHR:
2881		case FORMAT_SRGB8_ALPHA8_ASTC_5x5_KHR:
2882		case FORMAT_SRGB8_ALPHA8_ASTC_6x5_KHR:
2883		case FORMAT_SRGB8_ALPHA8_ASTC_6x6_KHR:
2884		case FORMAT_SRGB8_ALPHA8_ASTC_8x5_KHR:
2885		case FORMAT_SRGB8_ALPHA8_ASTC_8x6_KHR:
2886		case FORMAT_SRGB8_ALPHA8_ASTC_8x8_KHR:
2887		case FORMAT_SRGB8_ALPHA8_ASTC_10x5_KHR:
2888		case FORMAT_SRGB8_ALPHA8_ASTC_10x6_KHR:
2889		case FORMAT_SRGB8_ALPHA8_ASTC_10x8_KHR:
2890		case FORMAT_SRGB8_ALPHA8_ASTC_10x10_KHR:
2891		case FORMAT_SRGB8_ALPHA8_ASTC_12x10_KHR:
2892		case FORMAT_SRGB8_ALPHA8_ASTC_12x12_KHR:
2893			return true;
2894		default:
2895			return false;
2896		}
2897	}
2898
2899	bool Surface::isNonNormalizedInteger(Format format)
2900	{
2901		switch(format)
2902		{
2903		case FORMAT_A8B8G8R8I:
2904		case FORMAT_X8B8G8R8I:
2905		case FORMAT_G8R8I:
2906		case FORMAT_R8I:
2907		case FORMAT_A8B8G8R8UI:
2908		case FORMAT_X8B8G8R8UI:
2909		case FORMAT_G8R8UI:
2910		case FORMAT_R8UI:
2911		case FORMAT_A16B16G16R16I:
2912		case FORMAT_X16B16G16R16I:
2913		case FORMAT_G16R16I:
2914		case FORMAT_R16I:
2915		case FORMAT_A16B16G16R16UI:
2916		case FORMAT_X16B16G16R16UI:
2917		case FORMAT_G16R16UI:
2918		case FORMAT_R16UI:
2919		case FORMAT_A32B32G32R32I:
2920		case FORMAT_X32B32G32R32I:
2921		case FORMAT_G32R32I:
2922		case FORMAT_R32I:
2923		case FORMAT_A32B32G32R32UI:
2924		case FORMAT_X32B32G32R32UI:
2925		case FORMAT_G32R32UI:
2926		case FORMAT_R32UI:
2927			return true;
2928		default:
2929			return false;
2930		}
2931	}
2932
2933	int Surface::componentCount(Format format)
2934	{
2935		switch(format)
2936		{
2937		case FORMAT_R5G6B5:         return 3;
2938		case FORMAT_X8R8G8B8:       return 3;
2939		case FORMAT_X8B8G8R8I:      return 3;
2940		case FORMAT_X8B8G8R8:       return 3;
2941		case FORMAT_A8R8G8B8:       return 4;
2942		case FORMAT_A8B8G8R8I:      return 4;
2943		case FORMAT_A8B8G8R8:       return 4;
2944		case FORMAT_G8R8I:          return 2;
2945		case FORMAT_G8R8:           return 2;
2946		case FORMAT_R8I_SNORM:      return 1;
2947		case FORMAT_G8R8I_SNORM:    return 2;
2948		case FORMAT_X8B8G8R8I_SNORM:return 3;
2949		case FORMAT_A8B8G8R8I_SNORM:return 4;
2950		case FORMAT_R8UI:           return 1;
2951		case FORMAT_G8R8UI:         return 2;
2952		case FORMAT_X8B8G8R8UI:     return 3;
2953		case FORMAT_A8B8G8R8UI:     return 4;
2954		case FORMAT_A2B10G10R10:    return 4;
2955		case FORMAT_G16R16I:        return 2;
2956		case FORMAT_G16R16UI:       return 2;
2957		case FORMAT_G16R16:         return 2;
2958		case FORMAT_G32R32I:        return 2;
2959		case FORMAT_G32R32UI:       return 2;
2960		case FORMAT_X16B16G16R16I:  return 3;
2961		case FORMAT_X16B16G16R16UI: return 3;
2962		case FORMAT_A16B16G16R16I:  return 4;
2963		case FORMAT_A16B16G16R16UI: return 4;
2964		case FORMAT_A16B16G16R16:   return 4;
2965		case FORMAT_X32B32G32R32I:  return 3;
2966		case FORMAT_X32B32G32R32UI: return 3;
2967		case FORMAT_A32B32G32R32I:  return 4;
2968		case FORMAT_A32B32G32R32UI: return 4;
2969		case FORMAT_V8U8:           return 2;
2970		case FORMAT_Q8W8V8U8:       return 4;
2971		case FORMAT_X8L8V8U8:       return 3;
2972		case FORMAT_V16U16:         return 2;
2973		case FORMAT_A16W16V16U16:   return 4;
2974		case FORMAT_Q16W16V16U16:   return 4;
2975		case FORMAT_R32F:           return 1;
2976		case FORMAT_G32R32F:        return 2;
2977		case FORMAT_X32B32G32R32F:  return 3;
2978		case FORMAT_A32B32G32R32F:  return 4;
2979		case FORMAT_D32F:           return 1;
2980		case FORMAT_D32F_LOCKABLE:  return 1;
2981		case FORMAT_D32FS8_TEXTURE: return 1;
2982		case FORMAT_D32FS8_SHADOW:  return 1;
2983		case FORMAT_A8:             return 1;
2984		case FORMAT_R8I:            return 1;
2985		case FORMAT_R8:             return 1;
2986		case FORMAT_R16I:           return 1;
2987		case FORMAT_R16UI:          return 1;
2988		case FORMAT_R32I:           return 1;
2989		case FORMAT_R32UI:          return 1;
2990		case FORMAT_L8:             return 1;
2991		case FORMAT_L16:            return 1;
2992		case FORMAT_A8L8:           return 2;
2993		case FORMAT_YV12_BT601:     return 3;
2994		case FORMAT_YV12_BT709:     return 3;
2995		case FORMAT_YV12_JFIF:      return 3;
2996		default:
2997			ASSERT(false);
2998		}
2999
3000		return 1;
3001	}
3002
3003	void *Surface::allocateBuffer(int width, int height, int depth, Format format)
3004	{
3005		// Render targets require 2x2 quads
3006		int width2 = (width + 1) & ~1;
3007		int height2 = (height + 1) & ~1;
3008
3009		// FIXME: Unpacking byte4 to short4 in the sampler currently involves reading 8 bytes,
3010		// so we have to allocate 4 extra bytes to avoid buffer overruns.
3011		return allocateZero(size(width2, height2, depth, format) + 4);
3012	}
3013
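	// Fills a buffer with a repeating 32-bit pattern. Leading bytes are written until
	// the pointer is suitably aligned; when SSE is available the bulk is written 64
	// bytes at a time with non-temporal streaming stores, and the tail falls back to
	// scalar writes.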
3014	void Surface::memfill4(void *buffer, int pattern, int bytes)
3015	{
3016		while((size_t)buffer & 0x1 && bytes >= 1)
3017		{
3018			*(char*)buffer = (char)pattern;
3019			(char*&)buffer += 1;
3020			bytes -= 1;
3021		}
3022
3023		while((size_t)buffer & 0x3 && bytes >= 2)
3024		{
3025			*(short*)buffer = (short)pattern;
3026			(short*&)buffer += 1;
3027			bytes -= 2;
3028		}
3029
3030		if(CPUID::supportsSSE())
3031		{
3032			while((size_t)buffer & 0xF && bytes >= 4)
3033			{
3034				*(int*)buffer = pattern;
3035				(int*&)buffer += 1;
3036				bytes -= 4;
3037			}
3038
3039			__m128 quad = _mm_set_ps1((float&)pattern);
3040
3041			float *pointer = (float*)buffer;
3042			int qxwords = bytes / 64;
3043			bytes -= qxwords * 64;
3044
3045			while(qxwords--)
3046			{
3047				_mm_stream_ps(pointer + 0, quad);
3048				_mm_stream_ps(pointer + 4, quad);
3049				_mm_stream_ps(pointer + 8, quad);
3050				_mm_stream_ps(pointer + 12, quad);
3051
3052				pointer += 16;
3053			}
3054
3055			buffer = pointer;
3056		}
3057
3058		while(bytes >= 4)
3059		{
3060			*(int*)buffer = (int)pattern;
3061			(int*&)buffer += 1;
3062			bytes -= 4;
3063		}
3064
3065		while(bytes >= 2)
3066		{
3067			*(short*)buffer = (short)pattern;
3068			(short*&)buffer += 1;
3069			bytes -= 2;
3070		}
3071
3072		while(bytes >= 1)
3073		{
3074			*(char*)buffer = (char)pattern;
3075			(char*&)buffer += 1;
3076			bytes -= 1;
3077		}
3078	}
3079
3080	bool Surface::isEntire(const SliceRect& rect) const
3081	{
3082		return (rect.x0 == 0 && rect.y0 == 0 && rect.x1 == internal.width && rect.y1 == internal.height && internal.depth == 1);
3083	}
3084
3085	SliceRect Surface::getRect() const
3086	{
3087		return SliceRect(0, 0, internal.width, internal.height, 0);
3088	}
3089
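	// Clears a rectangle of the depth buffer. Lockable float formats are stored
	// linearly; other depth formats use the 2x2 quad layout, in which the four pixels
	// of a quad are stored contiguously:
	//
	//   (x+0, y+0)  (x+1, y+0)  (x+0, y+1)  (x+1, y+1)
	//
	// hence the odd/even index arithmetic below. With a complementary depth buffer the
	// stored value is 1 - depth.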
3090	void Surface::clearDepth(float depth, int x0, int y0, int width, int height)
3091	{
3092		if(width == 0 || height == 0) return;
3093
3094		// Not overlapping
3095		if(x0 > internal.width) return;
3096		if(y0 > internal.height) return;
3097		if(x0 + width < 0) return;
3098		if(y0 + height < 0) return;
3099
3100		// Clip against dimensions
3101		if(x0 < 0) {width += x0; x0 = 0;}
3102		if(x0 + width > internal.width) width = internal.width - x0;
3103		if(y0 < 0) {height += y0; y0 = 0;}
3104		if(y0 + height > internal.height) height = internal.height - y0;
3105
3106		const bool entire = x0 == 0 && y0 == 0 && width == internal.width && height == internal.height;
3107		const Lock lock = entire ? LOCK_DISCARD : LOCK_WRITEONLY;
3108
3109		int width2 = (internal.width + 1) & ~1;
3110
3111		int x1 = x0 + width;
3112		int y1 = y0 + height;
3113
3114		if(internal.format == FORMAT_D32F_LOCKABLE ||
3115		   internal.format == FORMAT_D32FS8_TEXTURE ||
3116		   internal.format == FORMAT_D32FS8_SHADOW)
3117		{
3118			float *target = (float*)lockInternal(0, 0, 0, lock, PUBLIC) + x0 + width2 * y0;
3119
3120			for(int z = 0; z < internal.depth; z++)
3121			{
3122				for(int y = y0; y < y1; y++)
3123				{
3124					memfill4(target, (int&)depth, 4 * width);
3125					target += width2;
3126				}
3127			}
3128
3129			unlockInternal();
3130		}
3131		else   // Quad layout
3132		{
3133			if(complementaryDepthBuffer)
3134			{
3135				depth = 1 - depth;
3136			}
3137
3138			float *buffer = (float*)lockInternal(0, 0, 0, lock, PUBLIC);
3139
3140			int oddX0 = (x0 & ~1) * 2 + (x0 & 1);
3141			int oddX1 = (x1 & ~1) * 2;
3142			int evenX0 = ((x0 + 1) & ~1) * 2;
3143			int evenBytes = (oddX1 - evenX0) * sizeof(float);
3144
3145			for(int z = 0; z < internal.depth; z++)
3146			{
3147				for(int y = y0; y < y1; y++)
3148				{
3149					float *target = buffer + (y & ~1) * width2 + (y & 1) * 2;
3150
3151					if((y & 1) == 0 && y + 1 < y1)   // Fill quad line at once
3152					{
3153						if((x0 & 1) != 0)
3154						{
3155							target[oddX0 + 0] = depth;
3156							target[oddX0 + 2] = depth;
3157						}
3158
3159					//	for(int x2 = evenX0; x2 < x1 * 2; x2 += 4)
3160					//	{
3161					//		target[x2 + 0] = depth;
3162					//		target[x2 + 1] = depth;
3163					//		target[x2 + 2] = depth;
3164					//		target[x2 + 3] = depth;
3165					//	}
3166
3167					//	__asm
3168					//	{
3169					//		movss xmm0, depth
3170					//		shufps xmm0, xmm0, 0x00
3171					//
3172					//		mov eax, x0
3173					//		add eax, 1
3174					//		and eax, 0xFFFFFFFE
3175					//		cmp eax, x1
3176					//		jge qEnd
3177					//
3178					//		mov edi, target
3179					//
3180					//	qLoop:
3181					//		movntps [edi+8*eax], xmm0
3182					//
3183					//		add eax, 2
3184					//		cmp eax, x1
3185					//		jl qLoop
3186					//	qEnd:
3187					//	}
3188
3189						memfill4(&target[evenX0], (int&)depth, evenBytes);
3190
3191						if((x1 & 1) != 0)
3192						{
3193							target[oddX1 + 0] = depth;
3194							target[oddX1 + 2] = depth;
3195						}
3196
3197						y++;
3198					}
3199					else
3200					{
3201						for(int x = x0, i = oddX0; x < x1; x++, i = (x & ~1) * 2 + (x & 1))
3202						{
3203							target[i] = depth;
3204						}
3205					}
3206				}
3207
3208				buffer += internal.sliceP;
3209			}
3210
3211			unlockInternal();
3212		}
3213	}
3214
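	// Clears a rectangle of the stencil buffer, which uses the same 2x2 quad layout as
	// the depth buffer. When the write mask covers all bits, whole quad rows are filled
	// with memfill4; otherwise each byte is read, masked and rewritten individually.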
3215	void Surface::clearStencil(unsigned char s, unsigned char mask, int x0, int y0, int width, int height)
3216	{
3217		if(mask == 0 || width == 0 || height == 0) return;
3218
3219		// Not overlapping
3220		if(x0 > internal.width) return;
3221		if(y0 > internal.height) return;
3222		if(x0 + width < 0) return;
3223		if(y0 + height < 0) return;
3224
3225		// Clip against dimensions
3226		if(x0 < 0) {width += x0; x0 = 0;}
3227		if(x0 + width > internal.width) width = internal.width - x0;
3228		if(y0 < 0) {height += y0; y0 = 0;}
3229		if(y0 + height > internal.height) height = internal.height - y0;
3230
3231		int width2 = (internal.width + 1) & ~1;
3232
3233		int x1 = x0 + width;
3234		int y1 = y0 + height;
3235
3236		int oddX0 = (x0 & ~1) * 2 + (x0 & 1);
3237		int oddX1 = (x1 & ~1) * 2;
3238		int evenX0 = ((x0 + 1) & ~1) * 2;
3239		int evenBytes = oddX1 - evenX0;
3240
3241		unsigned char maskedS = s & mask;
3242		unsigned char invMask = ~mask;
3243		unsigned int fill = maskedS;
3244		fill = fill | (fill << 8) | (fill << 16) | (fill << 24);
3245
3246		char *buffer = (char*)lockStencil(0, PUBLIC);
3247
3248		// Stencil buffers are assumed to use quad layout
3249		for(int z = 0; z < stencil.depth; z++)
3250		{
3251			for(int y = y0; y < y1; y++)
3252			{
3253				char *target = buffer + (y & ~1) * width2 + (y & 1) * 2;
3254
3255				if((y & 1) == 0 && y + 1 < y1 && mask == 0xFF)   // Fill quad line at once
3256				{
3257					if((x0 & 1) != 0)
3258					{
3259						target[oddX0 + 0] = fill;
3260						target[oddX0 + 2] = fill;
3261					}
3262
3263					memfill4(&target[evenX0], fill, evenBytes);
3264
3265					if((x1 & 1) != 0)
3266					{
3267						target[oddX1 + 0] = fill;
3268						target[oddX1 + 2] = fill;
3269					}
3270
3271					y++;
3272				}
3273				else
3274				{
3275					for(int x = x0, i = oddX0; x < x1; x++, i = (x & ~1) * 2 + (x & 1))
3276					{
3277						target[i] = maskedS | (target[i] & invMask);
3278					}
3279				}
3280			}
3281
3282			buffer += stencil.sliceP;
3283		}
3284
3285		unlockStencil();
3286	}
3287
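	// Fills a width x height rectangle with the given color, writing to the internal
	// buffer if it is marked dirty and to the external buffer otherwise. Formats of up
	// to four bytes per pixel replicate the encoded value into a 32-bit pattern for
	// memfill4; larger formats are written texel by texel.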
3288	void Surface::fill(const Color<float> &color, int x0, int y0, int width, int height)
3289	{
3290		unsigned char *row;
3291		Buffer *buffer;
3292
3293		if(internal.dirty)
3294		{
3295			row = (unsigned char*)lockInternal(x0, y0, 0, LOCK_WRITEONLY, PUBLIC);
3296			buffer = &internal;
3297		}
3298		else
3299		{
3300			row = (unsigned char*)lockExternal(x0, y0, 0, LOCK_WRITEONLY, PUBLIC);
3301			buffer = &external;
3302		}
3303
3304		if(buffer->bytes <= 4)
3305		{
3306			int c;
3307			buffer->write(&c, color);
3308
3309			if(buffer->bytes <= 1) c = (c << 8)  | c;
3310			if(buffer->bytes <= 2) c = (c << 16) | c;
3311
3312			for(int y = 0; y < height; y++)
3313			{
3314				memfill4(row, c, width * buffer->bytes);
3315
3316				row += buffer->pitchB;
3317			}
3318		}
3319		else   // Generic
3320		{
3321			for(int y = 0; y < height; y++)
3322			{
3323				unsigned char *element = row;
3324
3325				for(int x = 0; x < width; x++)
3326				{
3327					buffer->write(element, color);
3328
3329					element += buffer->bytes;
3330				}
3331
3332				row += buffer->pitchB;
3333			}
3334		}
3335
3336		if(buffer == &internal)
3337		{
3338			unlockInternal();
3339		}
3340		else
3341		{
3342			unlockExternal();
3343		}
3344	}
3345
3346	void Surface::copyInternal(const Surface* source, int x, int y, float srcX, float srcY, bool filter)
3347	{
3348		ASSERT(internal.lock != LOCK_UNLOCKED && source && source->internal.lock != LOCK_UNLOCKED);
3349
3350		sw::Color<float> color;
3351
3352		if(!filter)
3353		{
3354			color = source->internal.read((int)srcX, (int)srcY);
3355		}
3356		else   // Bilinear filtering
3357		{
3358			color = source->internal.sample(srcX, srcY);
3359		}
3360
3361		internal.write(x, y, color);
3362	}
3363
3364	void Surface::copyInternal(const Surface* source, int x, int y, int z, float srcX, float srcY, float srcZ, bool filter)
3365	{
3366		ASSERT(internal.lock != LOCK_UNLOCKED && source && source->internal.lock != LOCK_UNLOCKED);
3367
3368		sw::Color<float> color;
3369
3370		if(!filter)
3371		{
3372			color = source->internal.read((int)srcX, (int)srcY, int(srcZ));
3373		}
3374		else   // Bilinear filtering
3375		{
3376			color = source->internal.sample(srcX, srcY, srcZ);
3377		}
3378
3379		internal.write(x, y, z, color);
3380	}
3381
3382	bool Surface::hasStencil() const
3383	{
3384		return isStencil(external.format);
3385	}
3386
3387	bool Surface::hasDepth() const
3388	{
3389		return isDepth(external.format);
3390	}
3391
3392	bool Surface::hasPalette() const
3393	{
3394		return isPalette(external.format);
3395	}
3396
3397	bool Surface::isRenderTarget() const
3398	{
3399		return renderTarget;
3400	}
3401
3402	bool Surface::hasDirtyMipmaps() const
3403	{
3404		return dirtyMipmaps;
3405	}
3406
3407	void Surface::cleanMipmaps()
3408	{
3409		dirtyMipmaps = false;
3410	}
3411
3412	Resource *Surface::getResource()
3413	{
3414		return resource;
3415	}
3416
3417	bool Surface::identicalFormats() const
3418	{
3419		return external.format == internal.format &&
3420		       external.width  == internal.width &&
3421		       external.height == internal.height &&
3422		       external.depth  == internal.depth &&
3423		       external.pitchB == internal.pitchB &&
3424		       external.sliceB == internal.sliceB;
3425	}
3426
3427	Format Surface::selectInternalFormat(Format format) const
3428	{
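		// Maps the client-visible (external) format to the format used for internal storage and rendering;
		// formats without a native internal representation are widened to one that has.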
3429		switch(format)
3430		{
3431		case FORMAT_NULL:
3432			return FORMAT_NULL;
3433		case FORMAT_P8:
3434		case FORMAT_A8P8:
3435		case FORMAT_A4R4G4B4:
3436		case FORMAT_A1R5G5B5:
3437		case FORMAT_A8R3G3B2:
3438			return FORMAT_A8R8G8B8;
3439		case FORMAT_A8:
3440			return FORMAT_A8;
3441		case FORMAT_R8I:
3442			return FORMAT_R8I;
3443		case FORMAT_R8UI:
3444			return FORMAT_R8UI;
3445		case FORMAT_R8I_SNORM:
3446			return FORMAT_R8I_SNORM;
3447		case FORMAT_R8:
3448			return FORMAT_R8;
3449		case FORMAT_R16I:
3450			return FORMAT_R16I;
3451		case FORMAT_R16UI:
3452			return FORMAT_R16UI;
3453		case FORMAT_R32I:
3454			return FORMAT_R32I;
3455		case FORMAT_R32UI:
3456			return FORMAT_R32UI;
3457		case FORMAT_X16B16G16R16I:
3458		case FORMAT_A16B16G16R16I:
3459			return FORMAT_A16B16G16R16I;
3460		case FORMAT_X16B16G16R16UI:
3461		case FORMAT_A16B16G16R16UI:
3462			return FORMAT_A16B16G16R16UI;
3463		case FORMAT_A2R10G10B10:
3464		case FORMAT_A2B10G10R10:
3465		case FORMAT_A16B16G16R16:
3466			return FORMAT_A16B16G16R16;
3467		case FORMAT_X32B32G32R32I:
3468		case FORMAT_A32B32G32R32I:
3469			return FORMAT_A32B32G32R32I;
3470		case FORMAT_X32B32G32R32UI:
3471		case FORMAT_A32B32G32R32UI:
3472			return FORMAT_A32B32G32R32UI;
3473		case FORMAT_G8R8I:
3474			return FORMAT_G8R8I;
3475		case FORMAT_G8R8UI:
3476			return FORMAT_G8R8UI;
3477		case FORMAT_G8R8I_SNORM:
3478			return FORMAT_G8R8I_SNORM;
3479		case FORMAT_G8R8:
3480			return FORMAT_G8R8;
3481		case FORMAT_G16R16I:
3482			return FORMAT_G16R16I;
3483		case FORMAT_G16R16UI:
3484			return FORMAT_G16R16UI;
3485		case FORMAT_G16R16:
3486			return FORMAT_G16R16;
3487		case FORMAT_G32R32I:
3488			return FORMAT_G32R32I;
3489		case FORMAT_G32R32UI:
3490			return FORMAT_G32R32UI;
3491		case FORMAT_A8R8G8B8:
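			// Non-lockable surfaces may use the quad-swizzled layout when enabled; lockable surfaces keep a linear layout.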
3492			if(lockable || !quadLayoutEnabled)
3493			{
3494				return FORMAT_A8R8G8B8;
3495			}
3496			else
3497			{
3498				return FORMAT_A8G8R8B8Q;
3499			}
3500		case FORMAT_A8B8G8R8I:
3501			return FORMAT_A8B8G8R8I;
3502		case FORMAT_A8B8G8R8UI:
3503			return FORMAT_A8B8G8R8UI;
3504		case FORMAT_A8B8G8R8I_SNORM:
3505			return FORMAT_A8B8G8R8I_SNORM;
3506		case FORMAT_R5G5B5A1:
3507		case FORMAT_R4G4B4A4:
3508		case FORMAT_A8B8G8R8:
3509			return FORMAT_A8B8G8R8;
3510		case FORMAT_R5G6B5:
3511			return FORMAT_R5G6B5;
3512		case FORMAT_R3G3B2:
3513		case FORMAT_R8G8B8:
3514		case FORMAT_X4R4G4B4:
3515		case FORMAT_X1R5G5B5:
3516		case FORMAT_X8R8G8B8:
3517			if(lockable || !quadLayoutEnabled)
3518			{
3519				return FORMAT_X8R8G8B8;
3520			}
3521			else
3522			{
3523				return FORMAT_X8G8R8B8Q;
3524			}
3525		case FORMAT_X8B8G8R8I:
3526			return FORMAT_X8B8G8R8I;
3527		case FORMAT_X8B8G8R8UI:
3528			return FORMAT_X8B8G8R8UI;
3529		case FORMAT_X8B8G8R8I_SNORM:
3530			return FORMAT_X8B8G8R8I_SNORM;
3531		case FORMAT_B8G8R8:
3532		case FORMAT_X8B8G8R8:
3533			return FORMAT_X8B8G8R8;
3534		// Compressed formats
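		// Compressed textures are decompressed when the surface is updated, so internal storage uses an uncompressed equivalent.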
3535		#if S3TC_SUPPORT
3536		case FORMAT_DXT1:
3537		case FORMAT_DXT3:
3538		case FORMAT_DXT5:
3539		#endif
3540		case FORMAT_RGB8_PUNCHTHROUGH_ALPHA1_ETC2:
3541		case FORMAT_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2:
3542		case FORMAT_RGBA8_ETC2_EAC:
3543		case FORMAT_SRGB8_ALPHA8_ETC2_EAC:
3544		case FORMAT_SRGB8_ALPHA8_ASTC_4x4_KHR:
3545		case FORMAT_SRGB8_ALPHA8_ASTC_5x4_KHR:
3546		case FORMAT_SRGB8_ALPHA8_ASTC_5x5_KHR:
3547		case FORMAT_SRGB8_ALPHA8_ASTC_6x5_KHR:
3548		case FORMAT_SRGB8_ALPHA8_ASTC_6x6_KHR:
3549		case FORMAT_SRGB8_ALPHA8_ASTC_8x5_KHR:
3550		case FORMAT_SRGB8_ALPHA8_ASTC_8x6_KHR:
3551		case FORMAT_SRGB8_ALPHA8_ASTC_8x8_KHR:
3552		case FORMAT_SRGB8_ALPHA8_ASTC_10x5_KHR:
3553		case FORMAT_SRGB8_ALPHA8_ASTC_10x6_KHR:
3554		case FORMAT_SRGB8_ALPHA8_ASTC_10x8_KHR:
3555		case FORMAT_SRGB8_ALPHA8_ASTC_10x10_KHR:
3556		case FORMAT_SRGB8_ALPHA8_ASTC_12x10_KHR:
3557		case FORMAT_SRGB8_ALPHA8_ASTC_12x12_KHR:
3558			return FORMAT_A8R8G8B8;
3559		case FORMAT_RGBA_ASTC_4x4_KHR:
3560		case FORMAT_RGBA_ASTC_5x4_KHR:
3561		case FORMAT_RGBA_ASTC_5x5_KHR:
3562		case FORMAT_RGBA_ASTC_6x5_KHR:
3563		case FORMAT_RGBA_ASTC_6x6_KHR:
3564		case FORMAT_RGBA_ASTC_8x5_KHR:
3565		case FORMAT_RGBA_ASTC_8x6_KHR:
3566		case FORMAT_RGBA_ASTC_8x8_KHR:
3567		case FORMAT_RGBA_ASTC_10x5_KHR:
3568		case FORMAT_RGBA_ASTC_10x6_KHR:
3569		case FORMAT_RGBA_ASTC_10x8_KHR:
3570		case FORMAT_RGBA_ASTC_10x10_KHR:
3571		case FORMAT_RGBA_ASTC_12x10_KHR:
3572		case FORMAT_RGBA_ASTC_12x12_KHR:
3573			// ASTC supports HDR, so a floating-point format is required to represent it properly
3574			return FORMAT_A32B32G32R32F; // FIXME: FP16 would probably be sufficient, but it's currently unsupported
3575		case FORMAT_ATI1:
3576		case FORMAT_R11_EAC:
3577			return FORMAT_R8;
3578		case FORMAT_SIGNED_R11_EAC:
3579			return FORMAT_R32F; // FIXME: A signed 8-bit format would be sufficient
3580		case FORMAT_ATI2:
3581		case FORMAT_RG11_EAC:
3582			return FORMAT_G8R8;
3583		case FORMAT_SIGNED_RG11_EAC:
3584			return FORMAT_G32R32F; // FIXME: A signed 8-bit format would be sufficient
3585		case FORMAT_ETC1:
3586		case FORMAT_RGB8_ETC2:
3587		case FORMAT_SRGB8_ETC2:
3588			return FORMAT_X8R8G8B8;
3589		// Bumpmap formats
3590		case FORMAT_V8U8:			return FORMAT_V8U8;
3591		case FORMAT_L6V5U5:			return FORMAT_X8L8V8U8;
3592		case FORMAT_Q8W8V8U8:		return FORMAT_Q8W8V8U8;
3593		case FORMAT_X8L8V8U8:		return FORMAT_X8L8V8U8;
3594		case FORMAT_V16U16:			return FORMAT_V16U16;
3595		case FORMAT_A2W10V10U10:	return FORMAT_A16W16V16U16;
3596		case FORMAT_Q16W16V16U16:	return FORMAT_Q16W16V16U16;
3597		// Floating-point formats
3598		case FORMAT_A16F:			return FORMAT_A32B32G32R32F;
3599		case FORMAT_R16F:			return FORMAT_R32F;
3600		case FORMAT_G16R16F:		return FORMAT_G32R32F;
3601		case FORMAT_B16G16R16F:     return FORMAT_X32B32G32R32F;
3602		case FORMAT_A16B16G16R16F:	return FORMAT_A32B32G32R32F;
3603		case FORMAT_A32F:			return FORMAT_A32B32G32R32F;
3604		case FORMAT_R32F:			return FORMAT_R32F;
3605		case FORMAT_G32R32F:		return FORMAT_G32R32F;
3606		case FORMAT_B32G32R32F:     return FORMAT_X32B32G32R32F;
3607		case FORMAT_X32B32G32R32F:  return FORMAT_X32B32G32R32F;
3608		case FORMAT_A32B32G32R32F:	return FORMAT_A32B32G32R32F;
3609		// Luminance formats
3610		case FORMAT_L8:				return FORMAT_L8;
3611		case FORMAT_A4L4:			return FORMAT_A8L8;
3612		case FORMAT_L16:			return FORMAT_L16;
3613		case FORMAT_A8L8:			return FORMAT_A8L8;
3614		case FORMAT_L16F:           return FORMAT_X32B32G32R32F;
3615		case FORMAT_A16L16F:        return FORMAT_A32B32G32R32F;
3616		case FORMAT_L32F:           return FORMAT_X32B32G32R32F;
3617		case FORMAT_A32L32F:        return FORMAT_A32B32G32R32F;
3618		// Depth/stencil formats
3619		case FORMAT_D16:
3620		case FORMAT_D32:
3621		case FORMAT_D24X8:
3622		case FORMAT_D24S8:
3623		case FORMAT_D24FS8:
3624			if(hasParent)   // Texture
3625			{
3626				return FORMAT_D32FS8_SHADOW;
3627			}
3628			else if(complementaryDepthBuffer)
3629			{
3630				return FORMAT_D32F_COMPLEMENTARY;
3631			}
3632			else
3633			{
3634				return FORMAT_D32F;
3635			}
3636		case FORMAT_D32F:           return FORMAT_D32F;
3637		case FORMAT_D32F_LOCKABLE:  return FORMAT_D32F_LOCKABLE;
3638		case FORMAT_D32FS8_TEXTURE: return FORMAT_D32FS8_TEXTURE;
3639		case FORMAT_INTZ:           return FORMAT_D32FS8_TEXTURE;
3640		case FORMAT_DF24S8:         return FORMAT_D32FS8_SHADOW;
3641		case FORMAT_DF16S8:         return FORMAT_D32FS8_SHADOW;
3642		case FORMAT_YV12_BT601:     return FORMAT_YV12_BT601;
3643		case FORMAT_YV12_BT709:     return FORMAT_YV12_BT709;
3644		case FORMAT_YV12_JFIF:      return FORMAT_YV12_JFIF;
3645		default:
3646			ASSERT(false);
3647		}
3648
3649		return FORMAT_NULL;
3650	}
3651
3652	void Surface::setTexturePalette(unsigned int *palette)
3653	{
3654		Surface::palette = palette;
3655		Surface::paletteID++;
3656	}
3657
3658	void Surface::resolve()
3659	{
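		// Resolves a multisampled render target: averages all sample planes (stored as consecutive slices) into plane 0 in place.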
3660		if(internal.depth <= 1 || !internal.dirty || !renderTarget || internal.format == FORMAT_NULL)
3661		{
3662			return;
3663		}
3664
3665		void *source = internal.lockRect(0, 0, 0, LOCK_READWRITE);
3666
3667		int quality = internal.depth;
3668		int width = internal.width;
3669		int height = internal.height;
3670		int pitch = internal.pitchB;
3671		int slice = internal.sliceB;
3672
3673		unsigned char *source0 = (unsigned char*)source;
3674		unsigned char *source1 = source0 + slice;
3675		unsigned char *source2 = source1 + slice;
3676		unsigned char *source3 = source2 + slice;
3677		unsigned char *source4 = source3 + slice;
3678		unsigned char *source5 = source4 + slice;
3679		unsigned char *source6 = source5 + slice;
3680		unsigned char *source7 = source6 + slice;
3681		unsigned char *source8 = source7 + slice;
3682		unsigned char *source9 = source8 + slice;
3683		unsigned char *sourceA = source9 + slice;
3684		unsigned char *sourceB = sourceA + slice;
3685		unsigned char *sourceC = sourceB + slice;
3686		unsigned char *sourceD = sourceC + slice;
3687		unsigned char *sourceE = sourceD + slice;
3688		unsigned char *sourceF = sourceE + slice;
3689
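		// Each depth-N branch below combines the sample planes pairwise in a balanced tree:
		// integer formats use rounding averages, floating-point formats sum and scale by 1/N.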
3690		if(internal.format == FORMAT_X8R8G8B8 || internal.format == FORMAT_A8R8G8B8 || internal.format == FORMAT_X8B8G8R8 || internal.format == FORMAT_A8B8G8R8)
3691		{
3692			if(CPUID::supportsSSE2() && (width % 4) == 0)
3693			{
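				// Four 32-bit pixels per iteration; the aligned SSE loads and stores assume 16-byte-aligned surface data and pitch.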
3694				if(internal.depth == 2)
3695				{
3696					for(int y = 0; y < height; y++)
3697					{
3698						for(int x = 0; x < width; x += 4)
3699						{
3700							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
3701							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
3702
3703							c0 = _mm_avg_epu8(c0, c1);
3704
3705							_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
3706						}
3707
3708						source0 += pitch;
3709						source1 += pitch;
3710					}
3711				}
3712				else if(internal.depth == 4)
3713				{
3714					for(int y = 0; y < height; y++)
3715					{
3716						for(int x = 0; x < width; x += 4)
3717						{
3718							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
3719							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
3720							__m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x));
3721							__m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x));
3722
3723							c0 = _mm_avg_epu8(c0, c1);
3724							c2 = _mm_avg_epu8(c2, c3);
3725							c0 = _mm_avg_epu8(c0, c2);
3726
3727							_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
3728						}
3729
3730						source0 += pitch;
3731						source1 += pitch;
3732						source2 += pitch;
3733						source3 += pitch;
3734					}
3735				}
3736				else if(internal.depth == 8)
3737				{
3738					for(int y = 0; y < height; y++)
3739					{
3740						for(int x = 0; x < width; x += 4)
3741						{
3742							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
3743							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
3744							__m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x));
3745							__m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x));
3746							__m128i c4 = _mm_load_si128((__m128i*)(source4 + 4 * x));
3747							__m128i c5 = _mm_load_si128((__m128i*)(source5 + 4 * x));
3748							__m128i c6 = _mm_load_si128((__m128i*)(source6 + 4 * x));
3749							__m128i c7 = _mm_load_si128((__m128i*)(source7 + 4 * x));
3750
3751							c0 = _mm_avg_epu8(c0, c1);
3752							c2 = _mm_avg_epu8(c2, c3);
3753							c4 = _mm_avg_epu8(c4, c5);
3754							c6 = _mm_avg_epu8(c6, c7);
3755							c0 = _mm_avg_epu8(c0, c2);
3756							c4 = _mm_avg_epu8(c4, c6);
3757							c0 = _mm_avg_epu8(c0, c4);
3758
3759							_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
3760						}
3761
3762						source0 += pitch;
3763						source1 += pitch;
3764						source2 += pitch;
3765						source3 += pitch;
3766						source4 += pitch;
3767						source5 += pitch;
3768						source6 += pitch;
3769						source7 += pitch;
3770					}
3771				}
3772				else if(internal.depth == 16)
3773				{
3774					for(int y = 0; y < height; y++)
3775					{
3776						for(int x = 0; x < width; x += 4)
3777						{
3778							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
3779							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
3780							__m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x));
3781							__m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x));
3782							__m128i c4 = _mm_load_si128((__m128i*)(source4 + 4 * x));
3783							__m128i c5 = _mm_load_si128((__m128i*)(source5 + 4 * x));
3784							__m128i c6 = _mm_load_si128((__m128i*)(source6 + 4 * x));
3785							__m128i c7 = _mm_load_si128((__m128i*)(source7 + 4 * x));
3786							__m128i c8 = _mm_load_si128((__m128i*)(source8 + 4 * x));
3787							__m128i c9 = _mm_load_si128((__m128i*)(source9 + 4 * x));
3788							__m128i cA = _mm_load_si128((__m128i*)(sourceA + 4 * x));
3789							__m128i cB = _mm_load_si128((__m128i*)(sourceB + 4 * x));
3790							__m128i cC = _mm_load_si128((__m128i*)(sourceC + 4 * x));
3791							__m128i cD = _mm_load_si128((__m128i*)(sourceD + 4 * x));
3792							__m128i cE = _mm_load_si128((__m128i*)(sourceE + 4 * x));
3793							__m128i cF = _mm_load_si128((__m128i*)(sourceF + 4 * x));
3794
3795							c0 = _mm_avg_epu8(c0, c1);
3796							c2 = _mm_avg_epu8(c2, c3);
3797							c4 = _mm_avg_epu8(c4, c5);
3798							c6 = _mm_avg_epu8(c6, c7);
3799							c8 = _mm_avg_epu8(c8, c9);
3800							cA = _mm_avg_epu8(cA, cB);
3801							cC = _mm_avg_epu8(cC, cD);
3802							cE = _mm_avg_epu8(cE, cF);
3803							c0 = _mm_avg_epu8(c0, c2);
3804							c4 = _mm_avg_epu8(c4, c6);
3805							c8 = _mm_avg_epu8(c8, cA);
3806							cC = _mm_avg_epu8(cC, cE);
3807							c0 = _mm_avg_epu8(c0, c4);
3808							c8 = _mm_avg_epu8(c8, cC);
3809							c0 = _mm_avg_epu8(c0, c8);
3810
3811							_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
3812						}
3813
3814						source0 += pitch;
3815						source1 += pitch;
3816						source2 += pitch;
3817						source3 += pitch;
3818						source4 += pitch;
3819						source5 += pitch;
3820						source6 += pitch;
3821						source7 += pitch;
3822						source8 += pitch;
3823						source9 += pitch;
3824						sourceA += pitch;
3825						sourceB += pitch;
3826						sourceC += pitch;
3827						sourceD += pitch;
3828						sourceE += pitch;
3829						sourceF += pitch;
3830					}
3831				}
3832				else ASSERT(false);
3833			}
3834			else
3835			{
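				// Scalar fallback: (x & y) + (((x ^ y) >> 1) & 0x7F7F7F7F) is the overflow-free per-byte average;
				// the final (x ^ y) & 0x01010101 term rounds up, matching _mm_avg_epu8.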
3836				#define AVERAGE(x, y) (((x) & (y)) + ((((x) ^ (y)) >> 1) & 0x7F7F7F7F) + (((x) ^ (y)) & 0x01010101))
3837
3838				if(internal.depth == 2)
3839				{
3840					for(int y = 0; y < height; y++)
3841					{
3842						for(int x = 0; x < width; x++)
3843						{
3844							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
3845							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
3846
3847							c0 = AVERAGE(c0, c1);
3848
3849							*(unsigned int*)(source0 + 4 * x) = c0;
3850						}
3851
3852						source0 += pitch;
3853						source1 += pitch;
3854					}
3855				}
3856				else if(internal.depth == 4)
3857				{
3858					for(int y = 0; y < height; y++)
3859					{
3860						for(int x = 0; x < width; x++)
3861						{
3862							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
3863							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
3864							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
3865							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
3866
3867							c0 = AVERAGE(c0, c1);
3868							c2 = AVERAGE(c2, c3);
3869							c0 = AVERAGE(c0, c2);
3870
3871							*(unsigned int*)(source0 + 4 * x) = c0;
3872						}
3873
3874						source0 += pitch;
3875						source1 += pitch;
3876						source2 += pitch;
3877						source3 += pitch;
3878					}
3879				}
3880				else if(internal.depth == 8)
3881				{
3882					for(int y = 0; y < height; y++)
3883					{
3884						for(int x = 0; x < width; x++)
3885						{
3886							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
3887							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
3888							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
3889							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
3890							unsigned int c4 = *(unsigned int*)(source4 + 4 * x);
3891							unsigned int c5 = *(unsigned int*)(source5 + 4 * x);
3892							unsigned int c6 = *(unsigned int*)(source6 + 4 * x);
3893							unsigned int c7 = *(unsigned int*)(source7 + 4 * x);
3894
3895							c0 = AVERAGE(c0, c1);
3896							c2 = AVERAGE(c2, c3);
3897							c4 = AVERAGE(c4, c5);
3898							c6 = AVERAGE(c6, c7);
3899							c0 = AVERAGE(c0, c2);
3900							c4 = AVERAGE(c4, c6);
3901							c0 = AVERAGE(c0, c4);
3902
3903							*(unsigned int*)(source0 + 4 * x) = c0;
3904						}
3905
3906						source0 += pitch;
3907						source1 += pitch;
3908						source2 += pitch;
3909						source3 += pitch;
3910						source4 += pitch;
3911						source5 += pitch;
3912						source6 += pitch;
3913						source7 += pitch;
3914					}
3915				}
3916				else if(internal.depth == 16)
3917				{
3918					for(int y = 0; y < height; y++)
3919					{
3920						for(int x = 0; x < width; x++)
3921						{
3922							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
3923							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
3924							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
3925							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
3926							unsigned int c4 = *(unsigned int*)(source4 + 4 * x);
3927							unsigned int c5 = *(unsigned int*)(source5 + 4 * x);
3928							unsigned int c6 = *(unsigned int*)(source6 + 4 * x);
3929							unsigned int c7 = *(unsigned int*)(source7 + 4 * x);
3930							unsigned int c8 = *(unsigned int*)(source8 + 4 * x);
3931							unsigned int c9 = *(unsigned int*)(source9 + 4 * x);
3932							unsigned int cA = *(unsigned int*)(sourceA + 4 * x);
3933							unsigned int cB = *(unsigned int*)(sourceB + 4 * x);
3934							unsigned int cC = *(unsigned int*)(sourceC + 4 * x);
3935							unsigned int cD = *(unsigned int*)(sourceD + 4 * x);
3936							unsigned int cE = *(unsigned int*)(sourceE + 4 * x);
3937							unsigned int cF = *(unsigned int*)(sourceF + 4 * x);
3938
3939							c0 = AVERAGE(c0, c1);
3940							c2 = AVERAGE(c2, c3);
3941							c4 = AVERAGE(c4, c5);
3942							c6 = AVERAGE(c6, c7);
3943							c8 = AVERAGE(c8, c9);
3944							cA = AVERAGE(cA, cB);
3945							cC = AVERAGE(cC, cD);
3946							cE = AVERAGE(cE, cF);
3947							c0 = AVERAGE(c0, c2);
3948							c4 = AVERAGE(c4, c6);
3949							c8 = AVERAGE(c8, cA);
3950							cC = AVERAGE(cC, cE);
3951							c0 = AVERAGE(c0, c4);
3952							c8 = AVERAGE(c8, cC);
3953							c0 = AVERAGE(c0, c8);
3954
3955							*(unsigned int*)(source0 + 4 * x) = c0;
3956						}
3957
3958						source0 += pitch;
3959						source1 += pitch;
3960						source2 += pitch;
3961						source3 += pitch;
3962						source4 += pitch;
3963						source5 += pitch;
3964						source6 += pitch;
3965						source7 += pitch;
3966						source8 += pitch;
3967						source9 += pitch;
3968						sourceA += pitch;
3969						sourceB += pitch;
3970						sourceC += pitch;
3971						sourceD += pitch;
3972						sourceE += pitch;
3973						sourceF += pitch;
3974					}
3975				}
3976				else ASSERT(false);
3977
3978				#undef AVERAGE
3979			}
3980		}
3981		else if(internal.format == FORMAT_G16R16)
3982		{
3983			if(CPUID::supportsSSE2() && (width % 4) == 0)
3984			{
3985				if(internal.depth == 2)
3986				{
3987					for(int y = 0; y < height; y++)
3988					{
3989						for(int x = 0; x < width; x += 4)
3990						{
3991							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
3992							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
3993
3994							c0 = _mm_avg_epu16(c0, c1);
3995
3996							_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
3997						}
3998
3999						source0 += pitch;
4000						source1 += pitch;
4001					}
4002				}
4003				else if(internal.depth == 4)
4004				{
4005					for(int y = 0; y < height; y++)
4006					{
4007						for(int x = 0; x < width; x += 4)
4008						{
4009							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
4010							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
4011							__m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x));
4012							__m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x));
4013
4014							c0 = _mm_avg_epu16(c0, c1);
4015							c2 = _mm_avg_epu16(c2, c3);
4016							c0 = _mm_avg_epu16(c0, c2);
4017
4018							_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
4019						}
4020
4021						source0 += pitch;
4022						source1 += pitch;
4023						source2 += pitch;
4024						source3 += pitch;
4025					}
4026				}
4027				else if(internal.depth == 8)
4028				{
4029					for(int y = 0; y < height; y++)
4030					{
4031						for(int x = 0; x < width; x += 4)
4032						{
4033							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
4034							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
4035							__m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x));
4036							__m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x));
4037							__m128i c4 = _mm_load_si128((__m128i*)(source4 + 4 * x));
4038							__m128i c5 = _mm_load_si128((__m128i*)(source5 + 4 * x));
4039							__m128i c6 = _mm_load_si128((__m128i*)(source6 + 4 * x));
4040							__m128i c7 = _mm_load_si128((__m128i*)(source7 + 4 * x));
4041
4042							c0 = _mm_avg_epu16(c0, c1);
4043							c2 = _mm_avg_epu16(c2, c3);
4044							c4 = _mm_avg_epu16(c4, c5);
4045							c6 = _mm_avg_epu16(c6, c7);
4046							c0 = _mm_avg_epu16(c0, c2);
4047							c4 = _mm_avg_epu16(c4, c6);
4048							c0 = _mm_avg_epu16(c0, c4);
4049
4050							_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
4051						}
4052
4053						source0 += pitch;
4054						source1 += pitch;
4055						source2 += pitch;
4056						source3 += pitch;
4057						source4 += pitch;
4058						source5 += pitch;
4059						source6 += pitch;
4060						source7 += pitch;
4061					}
4062				}
4063				else if(internal.depth == 16)
4064				{
4065					for(int y = 0; y < height; y++)
4066					{
4067						for(int x = 0; x < width; x += 4)
4068						{
4069							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
4070							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
4071							__m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x));
4072							__m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x));
4073							__m128i c4 = _mm_load_si128((__m128i*)(source4 + 4 * x));
4074							__m128i c5 = _mm_load_si128((__m128i*)(source5 + 4 * x));
4075							__m128i c6 = _mm_load_si128((__m128i*)(source6 + 4 * x));
4076							__m128i c7 = _mm_load_si128((__m128i*)(source7 + 4 * x));
4077							__m128i c8 = _mm_load_si128((__m128i*)(source8 + 4 * x));
4078							__m128i c9 = _mm_load_si128((__m128i*)(source9 + 4 * x));
4079							__m128i cA = _mm_load_si128((__m128i*)(sourceA + 4 * x));
4080							__m128i cB = _mm_load_si128((__m128i*)(sourceB + 4 * x));
4081							__m128i cC = _mm_load_si128((__m128i*)(sourceC + 4 * x));
4082							__m128i cD = _mm_load_si128((__m128i*)(sourceD + 4 * x));
4083							__m128i cE = _mm_load_si128((__m128i*)(sourceE + 4 * x));
4084							__m128i cF = _mm_load_si128((__m128i*)(sourceF + 4 * x));
4085
4086							c0 = _mm_avg_epu16(c0, c1);
4087							c2 = _mm_avg_epu16(c2, c3);
4088							c4 = _mm_avg_epu16(c4, c5);
4089							c6 = _mm_avg_epu16(c6, c7);
4090							c8 = _mm_avg_epu16(c8, c9);
4091							cA = _mm_avg_epu16(cA, cB);
4092							cC = _mm_avg_epu16(cC, cD);
4093							cE = _mm_avg_epu16(cE, cF);
4094							c0 = _mm_avg_epu16(c0, c2);
4095							c4 = _mm_avg_epu16(c4, c6);
4096							c8 = _mm_avg_epu16(c8, cA);
4097							cC = _mm_avg_epu16(cC, cE);
4098							c0 = _mm_avg_epu16(c0, c4);
4099							c8 = _mm_avg_epu16(c8, cC);
4100							c0 = _mm_avg_epu16(c0, c8);
4101
4102							_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
4103						}
4104
4105						source0 += pitch;
4106						source1 += pitch;
4107						source2 += pitch;
4108						source3 += pitch;
4109						source4 += pitch;
4110						source5 += pitch;
4111						source6 += pitch;
4112						source7 += pitch;
4113						source8 += pitch;
4114						source9 += pitch;
4115						sourceA += pitch;
4116						sourceB += pitch;
4117						sourceC += pitch;
4118						sourceD += pitch;
4119						sourceE += pitch;
4120						sourceF += pitch;
4121					}
4122				}
4123				else ASSERT(false);
4124			}
4125			else
4126			{
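				// Same rounding-average trick as above, masked per 16-bit channel to match _mm_avg_epu16.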
4127				#define AVERAGE(x, y) (((x) & (y)) + ((((x) ^ (y)) >> 1) & 0x7FFF7FFF) + (((x) ^ (y)) & 0x00010001))
4128
4129				if(internal.depth == 2)
4130				{
4131					for(int y = 0; y < height; y++)
4132					{
4133						for(int x = 0; x < width; x++)
4134						{
4135							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
4136							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
4137
4138							c0 = AVERAGE(c0, c1);
4139
4140							*(unsigned int*)(source0 + 4 * x) = c0;
4141						}
4142
4143						source0 += pitch;
4144						source1 += pitch;
4145					}
4146				}
4147				else if(internal.depth == 4)
4148				{
4149					for(int y = 0; y < height; y++)
4150					{
4151						for(int x = 0; x < width; x++)
4152						{
4153							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
4154							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
4155							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
4156							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
4157
4158							c0 = AVERAGE(c0, c1);
4159							c2 = AVERAGE(c2, c3);
4160							c0 = AVERAGE(c0, c2);
4161
4162							*(unsigned int*)(source0 + 4 * x) = c0;
4163						}
4164
4165						source0 += pitch;
4166						source1 += pitch;
4167						source2 += pitch;
4168						source3 += pitch;
4169					}
4170				}
4171				else if(internal.depth == 8)
4172				{
4173					for(int y = 0; y < height; y++)
4174					{
4175						for(int x = 0; x < width; x++)
4176						{
4177							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
4178							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
4179							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
4180							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
4181							unsigned int c4 = *(unsigned int*)(source4 + 4 * x);
4182							unsigned int c5 = *(unsigned int*)(source5 + 4 * x);
4183							unsigned int c6 = *(unsigned int*)(source6 + 4 * x);
4184							unsigned int c7 = *(unsigned int*)(source7 + 4 * x);
4185
4186							c0 = AVERAGE(c0, c1);
4187							c2 = AVERAGE(c2, c3);
4188							c4 = AVERAGE(c4, c5);
4189							c6 = AVERAGE(c6, c7);
4190							c0 = AVERAGE(c0, c2);
4191							c4 = AVERAGE(c4, c6);
4192							c0 = AVERAGE(c0, c4);
4193
4194							*(unsigned int*)(source0 + 4 * x) = c0;
4195						}
4196
4197						source0 += pitch;
4198						source1 += pitch;
4199						source2 += pitch;
4200						source3 += pitch;
4201						source4 += pitch;
4202						source5 += pitch;
4203						source6 += pitch;
4204						source7 += pitch;
4205					}
4206				}
4207				else if(internal.depth == 16)
4208				{
4209					for(int y = 0; y < height; y++)
4210					{
4211						for(int x = 0; x < width; x++)
4212						{
4213							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
4214							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
4215							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
4216							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
4217							unsigned int c4 = *(unsigned int*)(source4 + 4 * x);
4218							unsigned int c5 = *(unsigned int*)(source5 + 4 * x);
4219							unsigned int c6 = *(unsigned int*)(source6 + 4 * x);
4220							unsigned int c7 = *(unsigned int*)(source7 + 4 * x);
4221							unsigned int c8 = *(unsigned int*)(source8 + 4 * x);
4222							unsigned int c9 = *(unsigned int*)(source9 + 4 * x);
4223							unsigned int cA = *(unsigned int*)(sourceA + 4 * x);
4224							unsigned int cB = *(unsigned int*)(sourceB + 4 * x);
4225							unsigned int cC = *(unsigned int*)(sourceC + 4 * x);
4226							unsigned int cD = *(unsigned int*)(sourceD + 4 * x);
4227							unsigned int cE = *(unsigned int*)(sourceE + 4 * x);
4228							unsigned int cF = *(unsigned int*)(sourceF + 4 * x);
4229
4230							c0 = AVERAGE(c0, c1);
4231							c2 = AVERAGE(c2, c3);
4232							c4 = AVERAGE(c4, c5);
4233							c6 = AVERAGE(c6, c7);
4234							c8 = AVERAGE(c8, c9);
4235							cA = AVERAGE(cA, cB);
4236							cC = AVERAGE(cC, cD);
4237							cE = AVERAGE(cE, cF);
4238							c0 = AVERAGE(c0, c2);
4239							c4 = AVERAGE(c4, c6);
4240							c8 = AVERAGE(c8, cA);
4241							cC = AVERAGE(cC, cE);
4242							c0 = AVERAGE(c0, c4);
4243							c8 = AVERAGE(c8, cC);
4244							c0 = AVERAGE(c0, c8);
4245
4246							*(unsigned int*)(source0 + 4 * x) = c0;
4247						}
4248
4249						source0 += pitch;
4250						source1 += pitch;
4251						source2 += pitch;
4252						source3 += pitch;
4253						source4 += pitch;
4254						source5 += pitch;
4255						source6 += pitch;
4256						source7 += pitch;
4257						source8 += pitch;
4258						source9 += pitch;
4259						sourceA += pitch;
4260						sourceB += pitch;
4261						sourceC += pitch;
4262						sourceD += pitch;
4263						sourceE += pitch;
4264						sourceF += pitch;
4265					}
4266				}
4267				else ASSERT(false);
4268
4269				#undef AVERAGE
4270			}
4271		}
4272		else if(internal.format == FORMAT_A16B16G16R16)
4273		{
4274			if(CPUID::supportsSSE2() && (width % 2) == 0)
4275			{
4276				if(internal.depth == 2)
4277				{
4278					for(int y = 0; y < height; y++)
4279					{
4280						for(int x = 0; x < width; x += 2)
4281						{
4282							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 8 * x));
4283							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 8 * x));
4284
4285							c0 = _mm_avg_epu16(c0, c1);
4286
4287							_mm_store_si128((__m128i*)(source0 + 8 * x), c0);
4288						}
4289
4290						source0 += pitch;
4291						source1 += pitch;
4292					}
4293				}
4294				else if(internal.depth == 4)
4295				{
4296					for(int y = 0; y < height; y++)
4297					{
4298						for(int x = 0; x < width; x += 2)
4299						{
4300							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 8 * x));
4301							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 8 * x));
4302							__m128i c2 = _mm_load_si128((__m128i*)(source2 + 8 * x));
4303							__m128i c3 = _mm_load_si128((__m128i*)(source3 + 8 * x));
4304
4305							c0 = _mm_avg_epu16(c0, c1);
4306							c2 = _mm_avg_epu16(c2, c3);
4307							c0 = _mm_avg_epu16(c0, c2);
4308
4309							_mm_store_si128((__m128i*)(source0 + 8 * x), c0);
4310						}
4311
4312						source0 += pitch;
4313						source1 += pitch;
4314						source2 += pitch;
4315						source3 += pitch;
4316					}
4317				}
4318				else if(internal.depth == 8)
4319				{
4320					for(int y = 0; y < height; y++)
4321					{
4322						for(int x = 0; x < width; x += 2)
4323						{
4324							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 8 * x));
4325							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 8 * x));
4326							__m128i c2 = _mm_load_si128((__m128i*)(source2 + 8 * x));
4327							__m128i c3 = _mm_load_si128((__m128i*)(source3 + 8 * x));
4328							__m128i c4 = _mm_load_si128((__m128i*)(source4 + 8 * x));
4329							__m128i c5 = _mm_load_si128((__m128i*)(source5 + 8 * x));
4330							__m128i c6 = _mm_load_si128((__m128i*)(source6 + 8 * x));
4331							__m128i c7 = _mm_load_si128((__m128i*)(source7 + 8 * x));
4332
4333							c0 = _mm_avg_epu16(c0, c1);
4334							c2 = _mm_avg_epu16(c2, c3);
4335							c4 = _mm_avg_epu16(c4, c5);
4336							c6 = _mm_avg_epu16(c6, c7);
4337							c0 = _mm_avg_epu16(c0, c2);
4338							c4 = _mm_avg_epu16(c4, c6);
4339							c0 = _mm_avg_epu16(c0, c4);
4340
4341							_mm_store_si128((__m128i*)(source0 + 8 * x), c0);
4342						}
4343
4344						source0 += pitch;
4345						source1 += pitch;
4346						source2 += pitch;
4347						source3 += pitch;
4348						source4 += pitch;
4349						source5 += pitch;
4350						source6 += pitch;
4351						source7 += pitch;
4352					}
4353				}
4354				else if(internal.depth == 16)
4355				{
4356					for(int y = 0; y < height; y++)
4357					{
4358						for(int x = 0; x < width; x += 2)
4359						{
4360							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 8 * x));
4361							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 8 * x));
4362							__m128i c2 = _mm_load_si128((__m128i*)(source2 + 8 * x));
4363							__m128i c3 = _mm_load_si128((__m128i*)(source3 + 8 * x));
4364							__m128i c4 = _mm_load_si128((__m128i*)(source4 + 8 * x));
4365							__m128i c5 = _mm_load_si128((__m128i*)(source5 + 8 * x));
4366							__m128i c6 = _mm_load_si128((__m128i*)(source6 + 8 * x));
4367							__m128i c7 = _mm_load_si128((__m128i*)(source7 + 8 * x));
4368							__m128i c8 = _mm_load_si128((__m128i*)(source8 + 8 * x));
4369							__m128i c9 = _mm_load_si128((__m128i*)(source9 + 8 * x));
4370							__m128i cA = _mm_load_si128((__m128i*)(sourceA + 8 * x));
4371							__m128i cB = _mm_load_si128((__m128i*)(sourceB + 8 * x));
4372							__m128i cC = _mm_load_si128((__m128i*)(sourceC + 8 * x));
4373							__m128i cD = _mm_load_si128((__m128i*)(sourceD + 8 * x));
4374							__m128i cE = _mm_load_si128((__m128i*)(sourceE + 8 * x));
4375							__m128i cF = _mm_load_si128((__m128i*)(sourceF + 8 * x));
4376
4377							c0 = _mm_avg_epu16(c0, c1);
4378							c2 = _mm_avg_epu16(c2, c3);
4379							c4 = _mm_avg_epu16(c4, c5);
4380							c6 = _mm_avg_epu16(c6, c7);
4381							c8 = _mm_avg_epu16(c8, c9);
4382							cA = _mm_avg_epu16(cA, cB);
4383							cC = _mm_avg_epu16(cC, cD);
4384							cE = _mm_avg_epu16(cE, cF);
4385							c0 = _mm_avg_epu16(c0, c2);
4386							c4 = _mm_avg_epu16(c4, c6);
4387							c8 = _mm_avg_epu16(c8, cA);
4388							cC = _mm_avg_epu16(cC, cE);
4389							c0 = _mm_avg_epu16(c0, c4);
4390							c8 = _mm_avg_epu16(c8, cC);
4391							c0 = _mm_avg_epu16(c0, c8);
4392
4393							_mm_store_si128((__m128i*)(source0 + 8 * x), c0);
4394						}
4395
4396						source0 += pitch;
4397						source1 += pitch;
4398						source2 += pitch;
4399						source3 += pitch;
4400						source4 += pitch;
4401						source5 += pitch;
4402						source6 += pitch;
4403						source7 += pitch;
4404						source8 += pitch;
4405						source9 += pitch;
4406						sourceA += pitch;
4407						sourceB += pitch;
4408						sourceC += pitch;
4409						sourceD += pitch;
4410						sourceE += pitch;
4411						sourceF += pitch;
4412					}
4413				}
4414				else ASSERT(false);
4415			}
4416			else
4417			{
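				// Scalar fallback treats each 8-byte pixel as two 32-bit words (hence the 2 * width loop bound),
				// using the same 16-bit rounding-average trick.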
4418				#define AVERAGE(x, y) (((x) & (y)) + ((((x) ^ (y)) >> 1) & 0x7FFF7FFF) + (((x) ^ (y)) & 0x00010001))
4419
4420				if(internal.depth == 2)
4421				{
4422					for(int y = 0; y < height; y++)
4423					{
4424						for(int x = 0; x < 2 * width; x++)
4425						{
4426							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
4427							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
4428
4429							c0 = AVERAGE(c0, c1);
4430
4431							*(unsigned int*)(source0 + 4 * x) = c0;
4432						}
4433
4434						source0 += pitch;
4435						source1 += pitch;
4436					}
4437				}
4438				else if(internal.depth == 4)
4439				{
4440					for(int y = 0; y < height; y++)
4441					{
4442						for(int x = 0; x < 2 * width; x++)
4443						{
4444							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
4445							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
4446							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
4447							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
4448
4449							c0 = AVERAGE(c0, c1);
4450							c2 = AVERAGE(c2, c3);
4451							c0 = AVERAGE(c0, c2);
4452
4453							*(unsigned int*)(source0 + 4 * x) = c0;
4454						}
4455
4456						source0 += pitch;
4457						source1 += pitch;
4458						source2 += pitch;
4459						source3 += pitch;
4460					}
4461				}
4462				else if(internal.depth == 8)
4463				{
4464					for(int y = 0; y < height; y++)
4465					{
4466						for(int x = 0; x < 2 * width; x++)
4467						{
4468							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
4469							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
4470							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
4471							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
4472							unsigned int c4 = *(unsigned int*)(source4 + 4 * x);
4473							unsigned int c5 = *(unsigned int*)(source5 + 4 * x);
4474							unsigned int c6 = *(unsigned int*)(source6 + 4 * x);
4475							unsigned int c7 = *(unsigned int*)(source7 + 4 * x);
4476
4477							c0 = AVERAGE(c0, c1);
4478							c2 = AVERAGE(c2, c3);
4479							c4 = AVERAGE(c4, c5);
4480							c6 = AVERAGE(c6, c7);
4481							c0 = AVERAGE(c0, c2);
4482							c4 = AVERAGE(c4, c6);
4483							c0 = AVERAGE(c0, c4);
4484
4485							*(unsigned int*)(source0 + 4 * x) = c0;
4486						}
4487
4488						source0 += pitch;
4489						source1 += pitch;
4490						source2 += pitch;
4491						source3 += pitch;
4492						source4 += pitch;
4493						source5 += pitch;
4494						source6 += pitch;
4495						source7 += pitch;
4496					}
4497				}
4498				else if(internal.depth == 16)
4499				{
4500					for(int y = 0; y < height; y++)
4501					{
4502						for(int x = 0; x < 2 * width; x++)
4503						{
4504							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
4505							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
4506							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
4507							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
4508							unsigned int c4 = *(unsigned int*)(source4 + 4 * x);
4509							unsigned int c5 = *(unsigned int*)(source5 + 4 * x);
4510							unsigned int c6 = *(unsigned int*)(source6 + 4 * x);
4511							unsigned int c7 = *(unsigned int*)(source7 + 4 * x);
4512							unsigned int c8 = *(unsigned int*)(source8 + 4 * x);
4513							unsigned int c9 = *(unsigned int*)(source9 + 4 * x);
4514							unsigned int cA = *(unsigned int*)(sourceA + 4 * x);
4515							unsigned int cB = *(unsigned int*)(sourceB + 4 * x);
4516							unsigned int cC = *(unsigned int*)(sourceC + 4 * x);
4517							unsigned int cD = *(unsigned int*)(sourceD + 4 * x);
4518							unsigned int cE = *(unsigned int*)(sourceE + 4 * x);
4519							unsigned int cF = *(unsigned int*)(sourceF + 4 * x);
4520
4521							c0 = AVERAGE(c0, c1);
4522							c2 = AVERAGE(c2, c3);
4523							c4 = AVERAGE(c4, c5);
4524							c6 = AVERAGE(c6, c7);
4525							c8 = AVERAGE(c8, c9);
4526							cA = AVERAGE(cA, cB);
4527							cC = AVERAGE(cC, cD);
4528							cE = AVERAGE(cE, cF);
4529							c0 = AVERAGE(c0, c2);
4530							c4 = AVERAGE(c4, c6);
4531							c8 = AVERAGE(c8, cA);
4532							cC = AVERAGE(cC, cE);
4533							c0 = AVERAGE(c0, c4);
4534							c8 = AVERAGE(c8, cC);
4535							c0 = AVERAGE(c0, c8);
4536
4537							*(unsigned int*)(source0 + 4 * x) = c0;
4538						}
4539
4540						source0 += pitch;
4541						source1 += pitch;
4542						source2 += pitch;
4543						source3 += pitch;
4544						source4 += pitch;
4545						source5 += pitch;
4546						source6 += pitch;
4547						source7 += pitch;
4548						source8 += pitch;
4549						source9 += pitch;
4550						sourceA += pitch;
4551						sourceB += pitch;
4552						sourceC += pitch;
4553						sourceD += pitch;
4554						sourceE += pitch;
4555						sourceF += pitch;
4556					}
4557				}
4558				else ASSERT(false);
4559
4560				#undef AVERAGE
4561			}
4562		}
4563		else if(internal.format == FORMAT_R32F)
4564		{
4565			if(CPUID::supportsSSE() && (width % 4) == 0)
4566			{
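				// Four R32F pixels per 16-byte vector; samples are summed pairwise and scaled by 1/N instead of using a rounding average.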
4567				if(internal.depth == 2)
4568				{
4569					for(int y = 0; y < height; y++)
4570					{
4571						for(int x = 0; x < width; x += 4)
4572						{
4573							__m128 c0 = _mm_load_ps((float*)(source0 + 4 * x));
4574							__m128 c1 = _mm_load_ps((float*)(source1 + 4 * x));
4575
4576							c0 = _mm_add_ps(c0, c1);
4577							c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 2.0f));
4578
4579							_mm_store_ps((float*)(source0 + 4 * x), c0);
4580						}
4581
4582						source0 += pitch;
4583						source1 += pitch;
4584					}
4585				}
4586				else if(internal.depth == 4)
4587				{
4588					for(int y = 0; y < height; y++)
4589					{
4590						for(int x = 0; x < width; x += 4)
4591						{
4592							__m128 c0 = _mm_load_ps((float*)(source0 + 4 * x));
4593							__m128 c1 = _mm_load_ps((float*)(source1 + 4 * x));
4594							__m128 c2 = _mm_load_ps((float*)(source2 + 4 * x));
4595							__m128 c3 = _mm_load_ps((float*)(source3 + 4 * x));
4596
4597							c0 = _mm_add_ps(c0, c1);
4598							c2 = _mm_add_ps(c2, c3);
4599							c0 = _mm_add_ps(c0, c2);
4600							c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 4.0f));
4601
4602							_mm_store_ps((float*)(source0 + 4 * x), c0);
4603						}
4604
4605						source0 += pitch;
4606						source1 += pitch;
4607						source2 += pitch;
4608						source3 += pitch;
4609					}
4610				}
4611				else if(internal.depth == 8)
4612				{
4613					for(int y = 0; y < height; y++)
4614					{
4615						for(int x = 0; x < width; x += 4)
4616						{
4617							__m128 c0 = _mm_load_ps((float*)(source0 + 4 * x));
4618							__m128 c1 = _mm_load_ps((float*)(source1 + 4 * x));
4619							__m128 c2 = _mm_load_ps((float*)(source2 + 4 * x));
4620							__m128 c3 = _mm_load_ps((float*)(source3 + 4 * x));
4621							__m128 c4 = _mm_load_ps((float*)(source4 + 4 * x));
4622							__m128 c5 = _mm_load_ps((float*)(source5 + 4 * x));
4623							__m128 c6 = _mm_load_ps((float*)(source6 + 4 * x));
4624							__m128 c7 = _mm_load_ps((float*)(source7 + 4 * x));
4625
4626							c0 = _mm_add_ps(c0, c1);
4627							c2 = _mm_add_ps(c2, c3);
4628							c4 = _mm_add_ps(c4, c5);
4629							c6 = _mm_add_ps(c6, c7);
4630							c0 = _mm_add_ps(c0, c2);
4631							c4 = _mm_add_ps(c4, c6);
4632							c0 = _mm_add_ps(c0, c4);
4633							c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 8.0f));
4634
4635							_mm_store_ps((float*)(source0 + 4 * x), c0);
4636						}
4637
4638						source0 += pitch;
4639						source1 += pitch;
4640						source2 += pitch;
4641						source3 += pitch;
4642						source4 += pitch;
4643						source5 += pitch;
4644						source6 += pitch;
4645						source7 += pitch;
4646					}
4647				}
4648				else if(internal.depth == 16)
4649				{
4650					for(int y = 0; y < height; y++)
4651					{
4652						for(int x = 0; x < width; x += 4)
4653						{
4654							__m128 c0 = _mm_load_ps((float*)(source0 + 4 * x));
4655							__m128 c1 = _mm_load_ps((float*)(source1 + 4 * x));
4656							__m128 c2 = _mm_load_ps((float*)(source2 + 4 * x));
4657							__m128 c3 = _mm_load_ps((float*)(source3 + 4 * x));
4658							__m128 c4 = _mm_load_ps((float*)(source4 + 4 * x));
4659							__m128 c5 = _mm_load_ps((float*)(source5 + 4 * x));
4660							__m128 c6 = _mm_load_ps((float*)(source6 + 4 * x));
4661							__m128 c7 = _mm_load_ps((float*)(source7 + 4 * x));
4662							__m128 c8 = _mm_load_ps((float*)(source8 + 4 * x));
4663							__m128 c9 = _mm_load_ps((float*)(source9 + 4 * x));
4664							__m128 cA = _mm_load_ps((float*)(sourceA + 4 * x));
4665							__m128 cB = _mm_load_ps((float*)(sourceB + 4 * x));
4666							__m128 cC = _mm_load_ps((float*)(sourceC + 4 * x));
4667							__m128 cD = _mm_load_ps((float*)(sourceD + 4 * x));
4668							__m128 cE = _mm_load_ps((float*)(sourceE + 4 * x));
4669							__m128 cF = _mm_load_ps((float*)(sourceF + 4 * x));
4670
4671							c0 = _mm_add_ps(c0, c1);
4672							c2 = _mm_add_ps(c2, c3);
4673							c4 = _mm_add_ps(c4, c5);
4674							c6 = _mm_add_ps(c6, c7);
4675							c8 = _mm_add_ps(c8, c9);
4676							cA = _mm_add_ps(cA, cB);
4677							cC = _mm_add_ps(cC, cD);
4678							cE = _mm_add_ps(cE, cF);
4679							c0 = _mm_add_ps(c0, c2);
4680							c4 = _mm_add_ps(c4, c6);
4681							c8 = _mm_add_ps(c8, cA);
4682							cC = _mm_add_ps(cC, cE);
4683							c0 = _mm_add_ps(c0, c4);
4684							c8 = _mm_add_ps(c8, cC);
4685							c0 = _mm_add_ps(c0, c8);
4686							c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 16.0f));
4687
4688							_mm_store_ps((float*)(source0 + 4 * x), c0);
4689						}
4690
4691						source0 += pitch;
4692						source1 += pitch;
4693						source2 += pitch;
4694						source3 += pitch;
4695						source4 += pitch;
4696						source5 += pitch;
4697						source6 += pitch;
4698						source7 += pitch;
4699						source8 += pitch;
4700						source9 += pitch;
4701						sourceA += pitch;
4702						sourceB += pitch;
4703						sourceC += pitch;
4704						sourceD += pitch;
4705						sourceE += pitch;
4706						sourceF += pitch;
4707					}
4708				}
4709				else ASSERT(false);
4710			}
4711			else
4712			{
4713				if(internal.depth == 2)
4714				{
4715					for(int y = 0; y < height; y++)
4716					{
4717						for(int x = 0; x < width; x++)
4718						{
4719							float c0 = *(float*)(source0 + 4 * x);
4720							float c1 = *(float*)(source1 + 4 * x);
4721
4722							c0 = c0 + c1;
4723							c0 *= 1.0f / 2.0f;
4724
4725							*(float*)(source0 + 4 * x) = c0;
4726						}
4727
4728						source0 += pitch;
4729						source1 += pitch;
4730					}
4731				}
4732				else if(internal.depth == 4)
4733				{
4734					for(int y = 0; y < height; y++)
4735					{
4736						for(int x = 0; x < width; x++)
4737						{
4738							float c0 = *(float*)(source0 + 4 * x);
4739							float c1 = *(float*)(source1 + 4 * x);
4740							float c2 = *(float*)(source2 + 4 * x);
4741							float c3 = *(float*)(source3 + 4 * x);
4742
4743							c0 = c0 + c1;
4744							c2 = c2 + c3;
4745							c0 = c0 + c2;
4746							c0 *= 1.0f / 4.0f;
4747
4748							*(float*)(source0 + 4 * x) = c0;
4749						}
4750
4751						source0 += pitch;
4752						source1 += pitch;
4753						source2 += pitch;
4754						source3 += pitch;
4755					}
4756				}
4757				else if(internal.depth == 8)
4758				{
4759					for(int y = 0; y < height; y++)
4760					{
4761						for(int x = 0; x < width; x++)
4762						{
4763							float c0 = *(float*)(source0 + 4 * x);
4764							float c1 = *(float*)(source1 + 4 * x);
4765							float c2 = *(float*)(source2 + 4 * x);
4766							float c3 = *(float*)(source3 + 4 * x);
4767							float c4 = *(float*)(source4 + 4 * x);
4768							float c5 = *(float*)(source5 + 4 * x);
4769							float c6 = *(float*)(source6 + 4 * x);
4770							float c7 = *(float*)(source7 + 4 * x);
4771
4772							c0 = c0 + c1;
4773							c2 = c2 + c3;
4774							c4 = c4 + c5;
4775							c6 = c6 + c7;
4776							c0 = c0 + c2;
4777							c4 = c4 + c6;
4778							c0 = c0 + c4;
4779							c0 *= 1.0f / 8.0f;
4780
4781							*(float*)(source0 + 4 * x) = c0;
4782						}
4783
4784						source0 += pitch;
4785						source1 += pitch;
4786						source2 += pitch;
4787						source3 += pitch;
4788						source4 += pitch;
4789						source5 += pitch;
4790						source6 += pitch;
4791						source7 += pitch;
4792					}
4793				}
4794				else if(internal.depth == 16)
4795				{
4796					for(int y = 0; y < height; y++)
4797					{
4798						for(int x = 0; x < width; x++)
4799						{
4800							float c0 = *(float*)(source0 + 4 * x);
4801							float c1 = *(float*)(source1 + 4 * x);
4802							float c2 = *(float*)(source2 + 4 * x);
4803							float c3 = *(float*)(source3 + 4 * x);
4804							float c4 = *(float*)(source4 + 4 * x);
4805							float c5 = *(float*)(source5 + 4 * x);
4806							float c6 = *(float*)(source6 + 4 * x);
4807							float c7 = *(float*)(source7 + 4 * x);
4808							float c8 = *(float*)(source8 + 4 * x);
4809							float c9 = *(float*)(source9 + 4 * x);
4810							float cA = *(float*)(sourceA + 4 * x);
4811							float cB = *(float*)(sourceB + 4 * x);
4812							float cC = *(float*)(sourceC + 4 * x);
4813							float cD = *(float*)(sourceD + 4 * x);
4814							float cE = *(float*)(sourceE + 4 * x);
4815							float cF = *(float*)(sourceF + 4 * x);
4816
4817							c0 = c0 + c1;
4818							c2 = c2 + c3;
4819							c4 = c4 + c5;
4820							c6 = c6 + c7;
4821							c8 = c8 + c9;
4822							cA = cA + cB;
4823							cC = cC + cD;
4824							cE = cE + cF;
4825							c0 = c0 + c2;
4826							c4 = c4 + c6;
4827							c8 = c8 + cA;
4828							cC = cC + cE;
4829							c0 = c0 + c4;
4830							c8 = c8 + cC;
4831							c0 = c0 + c8;
4832							c0 *= 1.0f / 16.0f;
4833
4834							*(float*)(source0 + 4 * x) = c0;
4835						}
4836
4837						source0 += pitch;
4838						source1 += pitch;
4839						source2 += pitch;
4840						source3 += pitch;
4841						source4 += pitch;
4842						source5 += pitch;
4843						source6 += pitch;
4844						source7 += pitch;
4845						source8 += pitch;
4846						source9 += pitch;
4847						sourceA += pitch;
4848						sourceB += pitch;
4849						sourceC += pitch;
4850						sourceD += pitch;
4851						sourceE += pitch;
4852						sourceF += pitch;
4853					}
4854				}
4855				else ASSERT(false);
4856			}
4857		}
4858		else if(internal.format == FORMAT_G32R32F)
4859		{
4860			if(CPUID::supportsSSE() && (width % 2) == 0)
4861			{
4862				if(internal.depth == 2)
4863				{
4864					for(int y = 0; y < height; y++)
4865					{
4866						for(int x = 0; x < width; x += 2)
4867						{
4868							__m128 c0 = _mm_load_ps((float*)(source0 + 8 * x));
4869							__m128 c1 = _mm_load_ps((float*)(source1 + 8 * x));
4870
4871							c0 = _mm_add_ps(c0, c1);
4872							c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 2.0f));
4873
4874							_mm_store_ps((float*)(source0 + 8 * x), c0);
4875						}
4876
4877						source0 += pitch;
4878						source1 += pitch;
4879					}
4880				}
4881				else if(internal.depth == 4)
4882				{
4883					for(int y = 0; y < height; y++)
4884					{
4885						for(int x = 0; x < width; x += 2)
4886						{
4887							__m128 c0 = _mm_load_ps((float*)(source0 + 8 * x));
4888							__m128 c1 = _mm_load_ps((float*)(source1 + 8 * x));
4889							__m128 c2 = _mm_load_ps((float*)(source2 + 8 * x));
4890							__m128 c3 = _mm_load_ps((float*)(source3 + 8 * x));
4891
4892							c0 = _mm_add_ps(c0, c1);
4893							c2 = _mm_add_ps(c2, c3);
4894							c0 = _mm_add_ps(c0, c2);
4895							c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 4.0f));
4896
4897							_mm_store_ps((float*)(source0 + 8 * x), c0);
4898						}
4899
4900						source0 += pitch;
4901						source1 += pitch;
4902						source2 += pitch;
4903						source3 += pitch;
4904					}
4905				}
4906				else if(internal.depth == 8)
4907				{
4908					for(int y = 0; y < height; y++)
4909					{
4910						for(int x = 0; x < width; x += 2)
4911						{
4912							__m128 c0 = _mm_load_ps((float*)(source0 + 8 * x));
4913							__m128 c1 = _mm_load_ps((float*)(source1 + 8 * x));
4914							__m128 c2 = _mm_load_ps((float*)(source2 + 8 * x));
4915							__m128 c3 = _mm_load_ps((float*)(source3 + 8 * x));
4916							__m128 c4 = _mm_load_ps((float*)(source4 + 8 * x));
4917							__m128 c5 = _mm_load_ps((float*)(source5 + 8 * x));
4918							__m128 c6 = _mm_load_ps((float*)(source6 + 8 * x));
4919							__m128 c7 = _mm_load_ps((float*)(source7 + 8 * x));
4920
4921							c0 = _mm_add_ps(c0, c1);
4922							c2 = _mm_add_ps(c2, c3);
4923							c4 = _mm_add_ps(c4, c5);
4924							c6 = _mm_add_ps(c6, c7);
4925							c0 = _mm_add_ps(c0, c2);
4926							c4 = _mm_add_ps(c4, c6);
4927							c0 = _mm_add_ps(c0, c4);
4928							c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 8.0f));
4929
4930							_mm_store_ps((float*)(source0 + 8 * x), c0);
4931						}
4932
4933						source0 += pitch;
4934						source1 += pitch;
4935						source2 += pitch;
4936						source3 += pitch;
4937						source4 += pitch;
4938						source5 += pitch;
4939						source6 += pitch;
4940						source7 += pitch;
4941					}
4942				}
4943				else if(internal.depth == 16)
4944				{
4945					for(int y = 0; y < height; y++)
4946					{
4947						for(int x = 0; x < width; x += 2)
4948						{
4949							__m128 c0 = _mm_load_ps((float*)(source0 + 8 * x));
4950							__m128 c1 = _mm_load_ps((float*)(source1 + 8 * x));
4951							__m128 c2 = _mm_load_ps((float*)(source2 + 8 * x));
4952							__m128 c3 = _mm_load_ps((float*)(source3 + 8 * x));
4953							__m128 c4 = _mm_load_ps((float*)(source4 + 8 * x));
4954							__m128 c5 = _mm_load_ps((float*)(source5 + 8 * x));
4955							__m128 c6 = _mm_load_ps((float*)(source6 + 8 * x));
4956							__m128 c7 = _mm_load_ps((float*)(source7 + 8 * x));
4957							__m128 c8 = _mm_load_ps((float*)(source8 + 8 * x));
4958							__m128 c9 = _mm_load_ps((float*)(source9 + 8 * x));
4959							__m128 cA = _mm_load_ps((float*)(sourceA + 8 * x));
4960							__m128 cB = _mm_load_ps((float*)(sourceB + 8 * x));
4961							__m128 cC = _mm_load_ps((float*)(sourceC + 8 * x));
4962							__m128 cD = _mm_load_ps((float*)(sourceD + 8 * x));
4963							__m128 cE = _mm_load_ps((float*)(sourceE + 8 * x));
4964							__m128 cF = _mm_load_ps((float*)(sourceF + 8 * x));
4965
4966							c0 = _mm_add_ps(c0, c1);
4967							c2 = _mm_add_ps(c2, c3);
4968							c4 = _mm_add_ps(c4, c5);
4969							c6 = _mm_add_ps(c6, c7);
4970							c8 = _mm_add_ps(c8, c9);
4971							cA = _mm_add_ps(cA, cB);
4972							cC = _mm_add_ps(cC, cD);
4973							cE = _mm_add_ps(cE, cF);
4974							c0 = _mm_add_ps(c0, c2);
4975							c4 = _mm_add_ps(c4, c6);
4976							c8 = _mm_add_ps(c8, cA);
4977							cC = _mm_add_ps(cC, cE);
4978							c0 = _mm_add_ps(c0, c4);
4979							c8 = _mm_add_ps(c8, cC);
4980							c0 = _mm_add_ps(c0, c8);
4981							c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 16.0f));
4982
4983							_mm_store_ps((float*)(source0 + 8 * x), c0);
4984						}
4985
4986						source0 += pitch;
4987						source1 += pitch;
4988						source2 += pitch;
4989						source3 += pitch;
4990						source4 += pitch;
4991						source5 += pitch;
4992						source6 += pitch;
4993						source7 += pitch;
4994						source8 += pitch;
4995						source9 += pitch;
4996						sourceA += pitch;
4997						sourceB += pitch;
4998						sourceC += pitch;
4999						sourceD += pitch;
5000						sourceE += pitch;
5001						sourceF += pitch;
5002					}
5003				}
5004				else ASSERT(false);
5005			}
5006			else
5007			{
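				// Scalar fallback treats each 8-byte G32R32F pixel as two independent floats, hence the 2 * width loop bound.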
5008				if(internal.depth == 2)
5009				{
5010					for(int y = 0; y < height; y++)
5011					{
5012						for(int x = 0; x < 2 * width; x++)
5013						{
5014							float c0 = *(float*)(source0 + 4 * x);
5015							float c1 = *(float*)(source1 + 4 * x);
5016
5017							c0 = c0 + c1;
5018							c0 *= 1.0f / 2.0f;
5019
5020							*(float*)(source0 + 4 * x) = c0;
5021						}
5022
5023						source0 += pitch;
5024						source1 += pitch;
5025					}
5026				}
5027				else if(internal.depth == 4)
5028				{
5029					for(int y = 0; y < height; y++)
5030					{
5031						for(int x = 0; x < 2 * width; x++)
5032						{
5033							float c0 = *(float*)(source0 + 4 * x);
5034							float c1 = *(float*)(source1 + 4 * x);
5035							float c2 = *(float*)(source2 + 4 * x);
5036							float c3 = *(float*)(source3 + 4 * x);
5037
5038							c0 = c0 + c1;
5039							c2 = c2 + c3;
5040							c0 = c0 + c2;
5041							c0 *= 1.0f / 4.0f;
5042
5043							*(float*)(source0 + 4 * x) = c0;
5044						}
5045
5046						source0 += pitch;
5047						source1 += pitch;
5048						source2 += pitch;
5049						source3 += pitch;
5050					}
5051				}
5052				else if(internal.depth == 8)
5053				{
5054					for(int y = 0; y < height; y++)
5055					{
5056						for(int x = 0; x < 2 * width; x++)
5057						{
5058							float c0 = *(float*)(source0 + 4 * x);
5059							float c1 = *(float*)(source1 + 4 * x);
5060							float c2 = *(float*)(source2 + 4 * x);
5061							float c3 = *(float*)(source3 + 4 * x);
5062							float c4 = *(float*)(source4 + 4 * x);
5063							float c5 = *(float*)(source5 + 4 * x);
5064							float c6 = *(float*)(source6 + 4 * x);
5065							float c7 = *(float*)(source7 + 4 * x);
5066
5067							c0 = c0 + c1;
5068							c2 = c2 + c3;
5069							c4 = c4 + c5;
5070							c6 = c6 + c7;
5071							c0 = c0 + c2;
5072							c4 = c4 + c6;
5073							c0 = c0 + c4;
5074							c0 *= 1.0f / 8.0f;
5075
5076							*(float*)(source0 + 4 * x) = c0;
5077						}
5078
5079						source0 += pitch;
5080						source1 += pitch;
5081						source2 += pitch;
5082						source3 += pitch;
5083						source4 += pitch;
5084						source5 += pitch;
5085						source6 += pitch;
5086						source7 += pitch;
5087					}
5088				}
5089				else if(internal.depth == 16)
5090				{
5091					for(int y = 0; y < height; y++)
5092					{
5093						for(int x = 0; x < 2 * width; x++)
5094						{
5095							float c0 = *(float*)(source0 + 4 * x);
5096							float c1 = *(float*)(source1 + 4 * x);
5097							float c2 = *(float*)(source2 + 4 * x);
5098							float c3 = *(float*)(source3 + 4 * x);
5099							float c4 = *(float*)(source4 + 4 * x);
5100							float c5 = *(float*)(source5 + 4 * x);
5101							float c6 = *(float*)(source6 + 4 * x);
5102							float c7 = *(float*)(source7 + 4 * x);
5103							float c8 = *(float*)(source8 + 4 * x);
5104							float c9 = *(float*)(source9 + 4 * x);
5105							float cA = *(float*)(sourceA + 4 * x);
5106							float cB = *(float*)(sourceB + 4 * x);
5107							float cC = *(float*)(sourceC + 4 * x);
5108							float cD = *(float*)(sourceD + 4 * x);
5109							float cE = *(float*)(sourceE + 4 * x);
5110							float cF = *(float*)(sourceF + 4 * x);
5111
5112							c0 = c0 + c1;
5113							c2 = c2 + c3;
5114							c4 = c4 + c5;
5115							c6 = c6 + c7;
5116							c8 = c8 + c9;
5117							cA = cA + cB;
5118							cC = cC + cD;
5119							cE = cE + cF;
5120							c0 = c0 + c2;
5121							c4 = c4 + c6;
5122							c8 = c8 + cA;
5123							cC = cC + cE;
5124							c0 = c0 + c4;
5125							c8 = c8 + cC;
5126							c0 = c0 + c8;
5127							c0 *= 1.0f / 16.0f;
5128
5129							*(float*)(source0 + 4 * x) = c0;
5130						}
5131
5132						source0 += pitch;
5133						source1 += pitch;
5134						source2 += pitch;
5135						source3 += pitch;
5136						source4 += pitch;
5137						source5 += pitch;
5138						source6 += pitch;
5139						source7 += pitch;
5140						source8 += pitch;
5141						source9 += pitch;
5142						sourceA += pitch;
5143						sourceB += pitch;
5144						sourceC += pitch;
5145						sourceD += pitch;
5146						sourceE += pitch;
5147						sourceF += pitch;
5148					}
5149				}
5150				else ASSERT(false);
5151			}
5152		}
5153		else if(internal.format == FORMAT_A32B32G32R32F || internal.format == FORMAT_X32B32G32R32F)
5154		{
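			// Four 32-bit float components per pixel (16 bytes), so each pixel maps onto
			// exactly one __m128. Samples are summed pairwise and scaled by 1/N; the
			// resolved value is written back into sample 0 (source0).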
5155			if(CPUID::supportsSSE())
5156			{
5157				if(internal.depth == 2)
5158				{
5159					for(int y = 0; y < height; y++)
5160					{
5161						for(int x = 0; x < width; x++)
5162						{
5163							__m128 c0 = _mm_load_ps((float*)(source0 + 16 * x));
5164							__m128 c1 = _mm_load_ps((float*)(source1 + 16 * x));
5165
5166							c0 = _mm_add_ps(c0, c1);
5167							c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 2.0f));
5168
5169							_mm_store_ps((float*)(source0 + 16 * x), c0);
5170						}
5171
5172						source0 += pitch;
5173						source1 += pitch;
5174					}
5175				}
5176				else if(internal.depth == 4)
5177				{
5178					for(int y = 0; y < height; y++)
5179					{
5180						for(int x = 0; x < width; x++)
5181						{
5182							__m128 c0 = _mm_load_ps((float*)(source0 + 16 * x));
5183							__m128 c1 = _mm_load_ps((float*)(source1 + 16 * x));
5184							__m128 c2 = _mm_load_ps((float*)(source2 + 16 * x));
5185							__m128 c3 = _mm_load_ps((float*)(source3 + 16 * x));
5186
5187							c0 = _mm_add_ps(c0, c1);
5188							c2 = _mm_add_ps(c2, c3);
5189							c0 = _mm_add_ps(c0, c2);
5190							c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 4.0f));
5191
5192							_mm_store_ps((float*)(source0 + 16 * x), c0);
5193						}
5194
5195						source0 += pitch;
5196						source1 += pitch;
5197						source2 += pitch;
5198						source3 += pitch;
5199					}
5200				}
5201				else if(internal.depth == 8)
5202				{
5203					for(int y = 0; y < height; y++)
5204					{
5205						for(int x = 0; x < width; x++)
5206						{
5207							__m128 c0 = _mm_load_ps((float*)(source0 + 16 * x));
5208							__m128 c1 = _mm_load_ps((float*)(source1 + 16 * x));
5209							__m128 c2 = _mm_load_ps((float*)(source2 + 16 * x));
5210							__m128 c3 = _mm_load_ps((float*)(source3 + 16 * x));
5211							__m128 c4 = _mm_load_ps((float*)(source4 + 16 * x));
5212							__m128 c5 = _mm_load_ps((float*)(source5 + 16 * x));
5213							__m128 c6 = _mm_load_ps((float*)(source6 + 16 * x));
5214							__m128 c7 = _mm_load_ps((float*)(source7 + 16 * x));
5215
5216							c0 = _mm_add_ps(c0, c1);
5217							c2 = _mm_add_ps(c2, c3);
5218							c4 = _mm_add_ps(c4, c5);
5219							c6 = _mm_add_ps(c6, c7);
5220							c0 = _mm_add_ps(c0, c2);
5221							c4 = _mm_add_ps(c4, c6);
5222							c0 = _mm_add_ps(c0, c4);
5223							c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 8.0f));
5224
5225							_mm_store_ps((float*)(source0 + 16 * x), c0);
5226						}
5227
5228						source0 += pitch;
5229						source1 += pitch;
5230						source2 += pitch;
5231						source3 += pitch;
5232						source4 += pitch;
5233						source5 += pitch;
5234						source6 += pitch;
5235						source7 += pitch;
5236					}
5237				}
5238				else if(internal.depth == 16)
5239				{
5240					for(int y = 0; y < height; y++)
5241					{
5242						for(int x = 0; x < width; x++)
5243						{
5244							__m128 c0 = _mm_load_ps((float*)(source0 + 16 * x));
5245							__m128 c1 = _mm_load_ps((float*)(source1 + 16 * x));
5246							__m128 c2 = _mm_load_ps((float*)(source2 + 16 * x));
5247							__m128 c3 = _mm_load_ps((float*)(source3 + 16 * x));
5248							__m128 c4 = _mm_load_ps((float*)(source4 + 16 * x));
5249							__m128 c5 = _mm_load_ps((float*)(source5 + 16 * x));
5250							__m128 c6 = _mm_load_ps((float*)(source6 + 16 * x));
5251							__m128 c7 = _mm_load_ps((float*)(source7 + 16 * x));
5252							__m128 c8 = _mm_load_ps((float*)(source8 + 16 * x));
5253							__m128 c9 = _mm_load_ps((float*)(source9 + 16 * x));
5254							__m128 cA = _mm_load_ps((float*)(sourceA + 16 * x));
5255							__m128 cB = _mm_load_ps((float*)(sourceB + 16 * x));
5256							__m128 cC = _mm_load_ps((float*)(sourceC + 16 * x));
5257							__m128 cD = _mm_load_ps((float*)(sourceD + 16 * x));
5258							__m128 cE = _mm_load_ps((float*)(sourceE + 16 * x));
5259							__m128 cF = _mm_load_ps((float*)(sourceF + 16 * x));
5260
5261							c0 = _mm_add_ps(c0, c1);
5262							c2 = _mm_add_ps(c2, c3);
5263							c4 = _mm_add_ps(c4, c5);
5264							c6 = _mm_add_ps(c6, c7);
5265							c8 = _mm_add_ps(c8, c9);
5266							cA = _mm_add_ps(cA, cB);
5267							cC = _mm_add_ps(cC, cD);
5268							cE = _mm_add_ps(cE, cF);
5269							c0 = _mm_add_ps(c0, c2);
5270							c4 = _mm_add_ps(c4, c6);
5271							c8 = _mm_add_ps(c8, cA);
5272							cC = _mm_add_ps(cC, cE);
5273							c0 = _mm_add_ps(c0, c4);
5274							c8 = _mm_add_ps(c8, cC);
5275							c0 = _mm_add_ps(c0, c8);
5276							c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 16.0f));
5277
5278							_mm_store_ps((float*)(source0 + 16 * x), c0);
5279						}
5280
5281						source0 += pitch;
5282						source1 += pitch;
5283						source2 += pitch;
5284						source3 += pitch;
5285						source4 += pitch;
5286						source5 += pitch;
5287						source6 += pitch;
5288						source7 += pitch;
5289						source8 += pitch;
5290						source9 += pitch;
5291						sourceA += pitch;
5292						sourceB += pitch;
5293						sourceC += pitch;
5294						sourceD += pitch;
5295						sourceE += pitch;
5296						sourceF += pitch;
5297					}
5298				}
5299				else ASSERT(false);
5300			}
5301			else
5302			{
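				// Scalar fallback for the four-component float formats: the x loop runs
				// over 4 * width individual floats per row, averaging the samples and
				// writing the result back to sample 0.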
5303				if(internal.depth == 2)
5304				{
5305					for(int y = 0; y < height; y++)
5306					{
5307						for(int x = 0; x < 4 * width; x++)
5308						{
5309							float c0 = *(float*)(source0 + 4 * x);
5310							float c1 = *(float*)(source1 + 4 * x);
5311
5312							c0 = c0 + c1;
5313							c0 *= 1.0f / 2.0f;
5314
5315							*(float*)(source0 + 4 * x) = c0;
5316						}
5317
5318						source0 += pitch;
5319						source1 += pitch;
5320					}
5321				}
5322				else if(internal.depth == 4)
5323				{
5324					for(int y = 0; y < height; y++)
5325					{
5326						for(int x = 0; x < 4 * width; x++)
5327						{
5328							float c0 = *(float*)(source0 + 4 * x);
5329							float c1 = *(float*)(source1 + 4 * x);
5330							float c2 = *(float*)(source2 + 4 * x);
5331							float c3 = *(float*)(source3 + 4 * x);
5332
5333							c0 = c0 + c1;
5334							c2 = c2 + c3;
5335							c0 = c0 + c2;
5336							c0 *= 1.0f / 4.0f;
5337
5338							*(float*)(source0 + 4 * x) = c0;
5339						}
5340
5341						source0 += pitch;
5342						source1 += pitch;
5343						source2 += pitch;
5344						source3 += pitch;
5345					}
5346				}
5347				else if(internal.depth == 8)
5348				{
5349					for(int y = 0; y < height; y++)
5350					{
5351						for(int x = 0; x < 4 * width; x++)
5352						{
5353							float c0 = *(float*)(source0 + 4 * x);
5354							float c1 = *(float*)(source1 + 4 * x);
5355							float c2 = *(float*)(source2 + 4 * x);
5356							float c3 = *(float*)(source3 + 4 * x);
5357							float c4 = *(float*)(source4 + 4 * x);
5358							float c5 = *(float*)(source5 + 4 * x);
5359							float c6 = *(float*)(source6 + 4 * x);
5360							float c7 = *(float*)(source7 + 4 * x);
5361
5362							c0 = c0 + c1;
5363							c2 = c2 + c3;
5364							c4 = c4 + c5;
5365							c6 = c6 + c7;
5366							c0 = c0 + c2;
5367							c4 = c4 + c6;
5368							c0 = c0 + c4;
5369							c0 *= 1.0f / 8.0f;
5370
5371							*(float*)(source0 + 4 * x) = c0;
5372						}
5373
5374						source0 += pitch;
5375						source1 += pitch;
5376						source2 += pitch;
5377						source3 += pitch;
5378						source4 += pitch;
5379						source5 += pitch;
5380						source6 += pitch;
5381						source7 += pitch;
5382					}
5383				}
5384				else if(internal.depth == 16)
5385				{
5386					for(int y = 0; y < height; y++)
5387					{
5388						for(int x = 0; x < 4 * width; x++)
5389						{
5390							float c0 = *(float*)(source0 + 4 * x);
5391							float c1 = *(float*)(source1 + 4 * x);
5392							float c2 = *(float*)(source2 + 4 * x);
5393							float c3 = *(float*)(source3 + 4 * x);
5394							float c4 = *(float*)(source4 + 4 * x);
5395							float c5 = *(float*)(source5 + 4 * x);
5396							float c6 = *(float*)(source6 + 4 * x);
5397							float c7 = *(float*)(source7 + 4 * x);
5398							float c8 = *(float*)(source8 + 4 * x);
5399							float c9 = *(float*)(source9 + 4 * x);
5400							float cA = *(float*)(sourceA + 4 * x);
5401							float cB = *(float*)(sourceB + 4 * x);
5402							float cC = *(float*)(sourceC + 4 * x);
5403							float cD = *(float*)(sourceD + 4 * x);
5404							float cE = *(float*)(sourceE + 4 * x);
5405							float cF = *(float*)(sourceF + 4 * x);
5406
5407							c0 = c0 + c1;
5408							c2 = c2 + c3;
5409							c4 = c4 + c5;
5410							c6 = c6 + c7;
5411							c8 = c8 + c9;
5412							cA = cA + cB;
5413							cC = cC + cD;
5414							cE = cE + cF;
5415							c0 = c0 + c2;
5416							c4 = c4 + c6;
5417							c8 = c8 + cA;
5418							cC = cC + cE;
5419							c0 = c0 + c4;
5420							c8 = c8 + cC;
5421							c0 = c0 + c8;
5422							c0 *= 1.0f / 16.0f;
5423
5424							*(float*)(source0 + 4 * x) = c0;
5425						}
5426
5427						source0 += pitch;
5428						source1 += pitch;
5429						source2 += pitch;
5430						source3 += pitch;
5431						source4 += pitch;
5432						source5 += pitch;
5433						source6 += pitch;
5434						source7 += pitch;
5435						source8 += pitch;
5436						source9 += pitch;
5437						sourceA += pitch;
5438						sourceB += pitch;
5439						sourceC += pitch;
5440						sourceD += pitch;
5441						sourceE += pitch;
5442						sourceF += pitch;
5443					}
5444				}
5445				else ASSERT(false);
5446			}
5447		}
5448		else if(internal.format == FORMAT_R5G6B5)
5449		{
5450			if(CPUID::supportsSSE2() && (width % 8) == 0)
5451			{
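				// SSE2 path, 8 R5G6B5 pixels per __m128i. Red and blue (mask 0xF81F) each
				// fit within a single byte of the 16-bit pixel, so they can be averaged
				// with _mm_avg_epu8; green (mask 0x07E0) straddles the byte boundary and
				// is averaged with _mm_avg_epu16. The fields are masked and recombined
				// after averaging.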
5452				if(internal.depth == 2)
5453				{
5454					for(int y = 0; y < height; y++)
5455					{
5456						for(int x = 0; x < width; x += 8)
5457						{
5458							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 2 * x));
5459							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 2 * x));
5460
5461							static const ushort8 r_b = {0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F};
5462							static const ushort8 _g_ = {0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0};
5463							__m128i c0_r_b = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
5464							__m128i c0__g_ = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(_g_));
5465							__m128i c1_r_b = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(r_b));
5466							__m128i c1__g_ = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
5467
5468							c0 = _mm_avg_epu8(c0_r_b, c1_r_b);
5469							c0 = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
5470							c1 = _mm_avg_epu16(c0__g_, c1__g_);
5471							c1 = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
5472							c0 = _mm_or_si128(c0, c1);
5473
5474							_mm_store_si128((__m128i*)(source0 + 2 * x), c0);
5475						}
5476
5477						source0 += pitch;
5478						source1 += pitch;
5479					}
5480				}
5481				else if(internal.depth == 4)
5482				{
5483					for(int y = 0; y < height; y++)
5484					{
5485						for(int x = 0; x < width; x += 8)
5486						{
5487							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 2 * x));
5488							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 2 * x));
5489							__m128i c2 = _mm_load_si128((__m128i*)(source2 + 2 * x));
5490							__m128i c3 = _mm_load_si128((__m128i*)(source3 + 2 * x));
5491
5492							static const ushort8 r_b = {0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F};
5493							static const ushort8 _g_ = {0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0};
5494							__m128i c0_r_b = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
5495							__m128i c0__g_ = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(_g_));
5496							__m128i c1_r_b = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(r_b));
5497							__m128i c1__g_ = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
5498							__m128i c2_r_b = _mm_and_si128(c2, reinterpret_cast<const __m128i&>(r_b));
5499							__m128i c2__g_ = _mm_and_si128(c2, reinterpret_cast<const __m128i&>(_g_));
5500							__m128i c3_r_b = _mm_and_si128(c3, reinterpret_cast<const __m128i&>(r_b));
5501							__m128i c3__g_ = _mm_and_si128(c3, reinterpret_cast<const __m128i&>(_g_));
5502
5503							c0 = _mm_avg_epu8(c0_r_b, c1_r_b);
5504							c2 = _mm_avg_epu8(c2_r_b, c3_r_b);
5505							c0 = _mm_avg_epu8(c0, c2);
5506							c0 = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
5507							c1 = _mm_avg_epu16(c0__g_, c1__g_);
5508							c3 = _mm_avg_epu16(c2__g_, c3__g_);
5509							c1 = _mm_avg_epu16(c1, c3);
5510							c1 = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
5511							c0 = _mm_or_si128(c0, c1);
5512
5513							_mm_store_si128((__m128i*)(source0 + 2 * x), c0);
5514						}
5515
5516						source0 += pitch;
5517						source1 += pitch;
5518						source2 += pitch;
5519						source3 += pitch;
5520					}
5521				}
5522				else if(internal.depth == 8)
5523				{
5524					for(int y = 0; y < height; y++)
5525					{
5526						for(int x = 0; x < width; x += 8)
5527						{
5528							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 2 * x));
5529							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 2 * x));
5530							__m128i c2 = _mm_load_si128((__m128i*)(source2 + 2 * x));
5531							__m128i c3 = _mm_load_si128((__m128i*)(source3 + 2 * x));
5532							__m128i c4 = _mm_load_si128((__m128i*)(source4 + 2 * x));
5533							__m128i c5 = _mm_load_si128((__m128i*)(source5 + 2 * x));
5534							__m128i c6 = _mm_load_si128((__m128i*)(source6 + 2 * x));
5535							__m128i c7 = _mm_load_si128((__m128i*)(source7 + 2 * x));
5536
5537							static const ushort8 r_b = {0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F};
5538							static const ushort8 _g_ = {0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0};
5539							__m128i c0_r_b = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
5540							__m128i c0__g_ = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(_g_));
5541							__m128i c1_r_b = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(r_b));
5542							__m128i c1__g_ = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
5543							__m128i c2_r_b = _mm_and_si128(c2, reinterpret_cast<const __m128i&>(r_b));
5544							__m128i c2__g_ = _mm_and_si128(c2, reinterpret_cast<const __m128i&>(_g_));
5545							__m128i c3_r_b = _mm_and_si128(c3, reinterpret_cast<const __m128i&>(r_b));
5546							__m128i c3__g_ = _mm_and_si128(c3, reinterpret_cast<const __m128i&>(_g_));
5547							__m128i c4_r_b = _mm_and_si128(c4, reinterpret_cast<const __m128i&>(r_b));
5548							__m128i c4__g_ = _mm_and_si128(c4, reinterpret_cast<const __m128i&>(_g_));
5549							__m128i c5_r_b = _mm_and_si128(c5, reinterpret_cast<const __m128i&>(r_b));
5550							__m128i c5__g_ = _mm_and_si128(c5, reinterpret_cast<const __m128i&>(_g_));
5551							__m128i c6_r_b = _mm_and_si128(c6, reinterpret_cast<const __m128i&>(r_b));
5552							__m128i c6__g_ = _mm_and_si128(c6, reinterpret_cast<const __m128i&>(_g_));
5553							__m128i c7_r_b = _mm_and_si128(c7, reinterpret_cast<const __m128i&>(r_b));
5554							__m128i c7__g_ = _mm_and_si128(c7, reinterpret_cast<const __m128i&>(_g_));
5555
5556							c0 = _mm_avg_epu8(c0_r_b, c1_r_b);
5557							c2 = _mm_avg_epu8(c2_r_b, c3_r_b);
5558							c4 = _mm_avg_epu8(c4_r_b, c5_r_b);
5559							c6 = _mm_avg_epu8(c6_r_b, c7_r_b);
5560							c0 = _mm_avg_epu8(c0, c2);
5561							c4 = _mm_avg_epu8(c4, c6);
5562							c0 = _mm_avg_epu8(c0, c4);
5563							c0 = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
5564							c1 = _mm_avg_epu16(c0__g_, c1__g_);
5565							c3 = _mm_avg_epu16(c2__g_, c3__g_);
5566							c5 = _mm_avg_epu16(c4__g_, c5__g_);
5567							c7 = _mm_avg_epu16(c6__g_, c7__g_);
5568							c1 = _mm_avg_epu16(c1, c3);
5569							c5 = _mm_avg_epu16(c5, c7);
5570							c1 = _mm_avg_epu16(c1, c5);
5571							c1 = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
5572							c0 = _mm_or_si128(c0, c1);
5573
5574							_mm_store_si128((__m128i*)(source0 + 2 * x), c0);
5575						}
5576
5577						source0 += pitch;
5578						source1 += pitch;
5579						source2 += pitch;
5580						source3 += pitch;
5581						source4 += pitch;
5582						source5 += pitch;
5583						source6 += pitch;
5584						source7 += pitch;
5585					}
5586				}
5587				else if(internal.depth == 16)
5588				{
5589					for(int y = 0; y < height; y++)
5590					{
5591						for(int x = 0; x < width; x += 8)
5592						{
5593							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 2 * x));
5594							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 2 * x));
5595							__m128i c2 = _mm_load_si128((__m128i*)(source2 + 2 * x));
5596							__m128i c3 = _mm_load_si128((__m128i*)(source3 + 2 * x));
5597							__m128i c4 = _mm_load_si128((__m128i*)(source4 + 2 * x));
5598							__m128i c5 = _mm_load_si128((__m128i*)(source5 + 2 * x));
5599							__m128i c6 = _mm_load_si128((__m128i*)(source6 + 2 * x));
5600							__m128i c7 = _mm_load_si128((__m128i*)(source7 + 2 * x));
5601							__m128i c8 = _mm_load_si128((__m128i*)(source8 + 2 * x));
5602							__m128i c9 = _mm_load_si128((__m128i*)(source9 + 2 * x));
5603							__m128i cA = _mm_load_si128((__m128i*)(sourceA + 2 * x));
5604							__m128i cB = _mm_load_si128((__m128i*)(sourceB + 2 * x));
5605							__m128i cC = _mm_load_si128((__m128i*)(sourceC + 2 * x));
5606							__m128i cD = _mm_load_si128((__m128i*)(sourceD + 2 * x));
5607							__m128i cE = _mm_load_si128((__m128i*)(sourceE + 2 * x));
5608							__m128i cF = _mm_load_si128((__m128i*)(sourceF + 2 * x));
5609
5610							static const ushort8 r_b = {0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F};
5611							static const ushort8 _g_ = {0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0};
5612							__m128i c0_r_b = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
5613							__m128i c0__g_ = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(_g_));
5614							__m128i c1_r_b = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(r_b));
5615							__m128i c1__g_ = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
5616							__m128i c2_r_b = _mm_and_si128(c2, reinterpret_cast<const __m128i&>(r_b));
5617							__m128i c2__g_ = _mm_and_si128(c2, reinterpret_cast<const __m128i&>(_g_));
5618							__m128i c3_r_b = _mm_and_si128(c3, reinterpret_cast<const __m128i&>(r_b));
5619							__m128i c3__g_ = _mm_and_si128(c3, reinterpret_cast<const __m128i&>(_g_));
5620							__m128i c4_r_b = _mm_and_si128(c4, reinterpret_cast<const __m128i&>(r_b));
5621							__m128i c4__g_ = _mm_and_si128(c4, reinterpret_cast<const __m128i&>(_g_));
5622							__m128i c5_r_b = _mm_and_si128(c5, reinterpret_cast<const __m128i&>(r_b));
5623							__m128i c5__g_ = _mm_and_si128(c5, reinterpret_cast<const __m128i&>(_g_));
5624							__m128i c6_r_b = _mm_and_si128(c6, reinterpret_cast<const __m128i&>(r_b));
5625							__m128i c6__g_ = _mm_and_si128(c6, reinterpret_cast<const __m128i&>(_g_));
5626							__m128i c7_r_b = _mm_and_si128(c7, reinterpret_cast<const __m128i&>(r_b));
5627							__m128i c7__g_ = _mm_and_si128(c7, reinterpret_cast<const __m128i&>(_g_));
5628							__m128i c8_r_b = _mm_and_si128(c8, reinterpret_cast<const __m128i&>(r_b));
5629							__m128i c8__g_ = _mm_and_si128(c8, reinterpret_cast<const __m128i&>(_g_));
5630							__m128i c9_r_b = _mm_and_si128(c9, reinterpret_cast<const __m128i&>(r_b));
5631							__m128i c9__g_ = _mm_and_si128(c9, reinterpret_cast<const __m128i&>(_g_));
5632							__m128i cA_r_b = _mm_and_si128(cA, reinterpret_cast<const __m128i&>(r_b));
5633							__m128i cA__g_ = _mm_and_si128(cA, reinterpret_cast<const __m128i&>(_g_));
5634							__m128i cB_r_b = _mm_and_si128(cB, reinterpret_cast<const __m128i&>(r_b));
5635							__m128i cB__g_ = _mm_and_si128(cB, reinterpret_cast<const __m128i&>(_g_));
5636							__m128i cC_r_b = _mm_and_si128(cC, reinterpret_cast<const __m128i&>(r_b));
5637							__m128i cC__g_ = _mm_and_si128(cC, reinterpret_cast<const __m128i&>(_g_));
5638							__m128i cD_r_b = _mm_and_si128(cD, reinterpret_cast<const __m128i&>(r_b));
5639							__m128i cD__g_ = _mm_and_si128(cD, reinterpret_cast<const __m128i&>(_g_));
5640							__m128i cE_r_b = _mm_and_si128(cE, reinterpret_cast<const __m128i&>(r_b));
5641							__m128i cE__g_ = _mm_and_si128(cE, reinterpret_cast<const __m128i&>(_g_));
5642							__m128i cF_r_b = _mm_and_si128(cF, reinterpret_cast<const __m128i&>(r_b));
5643							__m128i cF__g_ = _mm_and_si128(cF, reinterpret_cast<const __m128i&>(_g_));
5644
5645							c0 = _mm_avg_epu8(c0_r_b, c1_r_b);
5646							c2 = _mm_avg_epu8(c2_r_b, c3_r_b);
5647							c4 = _mm_avg_epu8(c4_r_b, c5_r_b);
5648							c6 = _mm_avg_epu8(c6_r_b, c7_r_b);
5649							c8 = _mm_avg_epu8(c8_r_b, c9_r_b);
5650							cA = _mm_avg_epu8(cA_r_b, cB_r_b);
5651							cC = _mm_avg_epu8(cC_r_b, cD_r_b);
5652							cE = _mm_avg_epu8(cE_r_b, cF_r_b);
5653							c0 = _mm_avg_epu8(c0, c2);
5654							c4 = _mm_avg_epu8(c4, c6);
5655							c8 = _mm_avg_epu8(c8, cA);
5656							cC = _mm_avg_epu8(cC, cE);
5657							c0 = _mm_avg_epu8(c0, c4);
5658							c8 = _mm_avg_epu8(c8, cC);
5659							c0 = _mm_avg_epu8(c0, c8);
5660							c0 = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
5661							c1 = _mm_avg_epu16(c0__g_, c1__g_);
5662							c3 = _mm_avg_epu16(c2__g_, c3__g_);
5663							c5 = _mm_avg_epu16(c4__g_, c5__g_);
5664							c7 = _mm_avg_epu16(c6__g_, c7__g_);
5665							c9 = _mm_avg_epu16(c8__g_, c9__g_);
5666							cB = _mm_avg_epu16(cA__g_, cB__g_);
5667							cD = _mm_avg_epu16(cC__g_, cD__g_);
5668							cF = _mm_avg_epu16(cE__g_, cF__g_);
5669							c1 = _mm_avg_epu16(c1, c3);
5670							c5 = _mm_avg_epu16(c5, c7);
5671							c9 = _mm_avg_epu16(c9, cB);
5672							cD = _mm_avg_epu16(cD, cF);
5673							c1 = _mm_avg_epu16(c1, c5);
5674							c9 = _mm_avg_epu16(c9, cD);
5675							c1 = _mm_avg_epu16(c1, c9);
5676							c1 = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
5677							c0 = _mm_or_si128(c0, c1);
5678
5679							_mm_store_si128((__m128i*)(source0 + 2 * x), c0);
5680						}
5681
5682						source0 += pitch;
5683						source1 += pitch;
5684						source2 += pitch;
5685						source3 += pitch;
5686						source4 += pitch;
5687						source5 += pitch;
5688						source6 += pitch;
5689						source7 += pitch;
5690						source8 += pitch;
5691						source9 += pitch;
5692						sourceA += pitch;
5693						sourceB += pitch;
5694						sourceC += pitch;
5695						sourceD += pitch;
5696						sourceE += pitch;
5697						sourceF += pitch;
5698					}
5699				}
5700				else ASSERT(false);
5701			}
5702			else
5703			{
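				// Scalar fallback: AVERAGE(x, y) is a SWAR rounded average of two R5G6B5
				// pixels. (x & y) + (((x ^ y) >> 1) & 0x7BEF) is the per-field floor
				// average (0x7BEF keeps the shifted bits inside their own 5/6/5 field),
				// and ((x ^ y) & 0x0821) adds the rounding bit of each field, matching
				// the round-half-up behaviour of _mm_avg_epu8/_mm_avg_epu16.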
5704				#define AVERAGE(x, y) (((x) & (y)) + ((((x) ^ (y)) >> 1) & 0x7BEF) + (((x) ^ (y)) & 0x0821))
5705
5706				if(internal.depth == 2)
5707				{
5708					for(int y = 0; y < height; y++)
5709					{
5710						for(int x = 0; x < width; x++)
5711						{
5712							unsigned short c0 = *(unsigned short*)(source0 + 2 * x);
5713							unsigned short c1 = *(unsigned short*)(source1 + 2 * x);
5714
5715							c0 = AVERAGE(c0, c1);
5716
5717							*(unsigned short*)(source0 + 2 * x) = c0;
5718						}
5719
5720						source0 += pitch;
5721						source1 += pitch;
5722					}
5723				}
5724				else if(internal.depth == 4)
5725				{
5726					for(int y = 0; y < height; y++)
5727					{
5728						for(int x = 0; x < width; x++)
5729						{
5730							unsigned short c0 = *(unsigned short*)(source0 + 2 * x);
5731							unsigned short c1 = *(unsigned short*)(source1 + 2 * x);
5732							unsigned short c2 = *(unsigned short*)(source2 + 2 * x);
5733							unsigned short c3 = *(unsigned short*)(source3 + 2 * x);
5734
5735							c0 = AVERAGE(c0, c1);
5736							c2 = AVERAGE(c2, c3);
5737							c0 = AVERAGE(c0, c2);
5738
5739							*(unsigned short*)(source0 + 2 * x) = c0;
5740						}
5741
5742						source0 += pitch;
5743						source1 += pitch;
5744						source2 += pitch;
5745						source3 += pitch;
5746					}
5747				}
5748				else if(internal.depth == 8)
5749				{
5750					for(int y = 0; y < height; y++)
5751					{
5752						for(int x = 0; x < width; x++)
5753						{
5754							unsigned short c0 = *(unsigned short*)(source0 + 2 * x);
5755							unsigned short c1 = *(unsigned short*)(source1 + 2 * x);
5756							unsigned short c2 = *(unsigned short*)(source2 + 2 * x);
5757							unsigned short c3 = *(unsigned short*)(source3 + 2 * x);
5758							unsigned short c4 = *(unsigned short*)(source4 + 2 * x);
5759							unsigned short c5 = *(unsigned short*)(source5 + 2 * x);
5760							unsigned short c6 = *(unsigned short*)(source6 + 2 * x);
5761							unsigned short c7 = *(unsigned short*)(source7 + 2 * x);
5762
5763							c0 = AVERAGE(c0, c1);
5764							c2 = AVERAGE(c2, c3);
5765							c4 = AVERAGE(c4, c5);
5766							c6 = AVERAGE(c6, c7);
5767							c0 = AVERAGE(c0, c2);
5768							c4 = AVERAGE(c4, c6);
5769							c0 = AVERAGE(c0, c4);
5770
5771							*(unsigned short*)(source0 + 2 * x) = c0;
5772						}
5773
5774						source0 += pitch;
5775						source1 += pitch;
5776						source2 += pitch;
5777						source3 += pitch;
5778						source4 += pitch;
5779						source5 += pitch;
5780						source6 += pitch;
5781						source7 += pitch;
5782					}
5783				}
5784				else if(internal.depth == 16)
5785				{
5786					for(int y = 0; y < height; y++)
5787					{
5788						for(int x = 0; x < width; x++)
5789						{
5790							unsigned short c0 = *(unsigned short*)(source0 + 2 * x);
5791							unsigned short c1 = *(unsigned short*)(source1 + 2 * x);
5792							unsigned short c2 = *(unsigned short*)(source2 + 2 * x);
5793							unsigned short c3 = *(unsigned short*)(source3 + 2 * x);
5794							unsigned short c4 = *(unsigned short*)(source4 + 2 * x);
5795							unsigned short c5 = *(unsigned short*)(source5 + 2 * x);
5796							unsigned short c6 = *(unsigned short*)(source6 + 2 * x);
5797							unsigned short c7 = *(unsigned short*)(source7 + 2 * x);
5798							unsigned short c8 = *(unsigned short*)(source8 + 2 * x);
5799							unsigned short c9 = *(unsigned short*)(source9 + 2 * x);
5800							unsigned short cA = *(unsigned short*)(sourceA + 2 * x);
5801							unsigned short cB = *(unsigned short*)(sourceB + 2 * x);
5802							unsigned short cC = *(unsigned short*)(sourceC + 2 * x);
5803							unsigned short cD = *(unsigned short*)(sourceD + 2 * x);
5804							unsigned short cE = *(unsigned short*)(sourceE + 2 * x);
5805							unsigned short cF = *(unsigned short*)(sourceF + 2 * x);
5806
5807							c0 = AVERAGE(c0, c1);
5808							c2 = AVERAGE(c2, c3);
5809							c4 = AVERAGE(c4, c5);
5810							c6 = AVERAGE(c6, c7);
5811							c8 = AVERAGE(c8, c9);
5812							cA = AVERAGE(cA, cB);
5813							cC = AVERAGE(cC, cD);
5814							cE = AVERAGE(cE, cF);
5815							c0 = AVERAGE(c0, c2);
5816							c4 = AVERAGE(c4, c6);
5817							c8 = AVERAGE(c8, cA);
5818							cC = AVERAGE(cC, cE);
5819							c0 = AVERAGE(c0, c4);
5820							c8 = AVERAGE(c8, cC);
5821							c0 = AVERAGE(c0, c8);
5822
5823							*(unsigned short*)(source0 + 2 * x) = c0;
5824						}
5825
5826						source0 += pitch;
5827						source1 += pitch;
5828						source2 += pitch;
5829						source3 += pitch;
5830						source4 += pitch;
5831						source5 += pitch;
5832						source6 += pitch;
5833						source7 += pitch;
5834						source8 += pitch;
5835						source9 += pitch;
5836						sourceA += pitch;
5837						sourceB += pitch;
5838						sourceC += pitch;
5839						sourceD += pitch;
5840						sourceE += pitch;
5841						sourceF += pitch;
5842					}
5843				}
5844				else ASSERT(false);
5845
5846				#undef AVERAGE
5847			}
5848		}
5849		else
5850		{
5851		//	UNIMPLEMENTED();
5852		}
5853	}
5854}
5855