Surface.cpp revision ae7756e000b6eccf7f9c4a5410c3c29e16b1eb06
1// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//    http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15#include "Surface.hpp"
16
17#include "Color.hpp"
18#include "Context.hpp"
19#include "ETC_Decoder.hpp"
20#include "Renderer.hpp"
21#include "Common/Half.hpp"
22#include "Common/Memory.hpp"
23#include "Common/CPUID.hpp"
24#include "Common/Resource.hpp"
25#include "Common/Debug.hpp"
26#include "Reactor/Reactor.hpp"
27
28#if defined(__i386__) || defined(__x86_64__)
29	#include <xmmintrin.h>
30	#include <emmintrin.h>
31#endif
32
33#undef min
34#undef max
35
36namespace sw
37{
	// Global settings whose definitions live in another translation unit.
	extern bool quadLayoutEnabled;
	extern bool complementaryDepthBuffer;
	extern TranscendentalPrecision logPrecision;

	// Color lookup table shared by all surfaces. Buffer::read() indexes it with
	// the 8-bit value of FORMAT_P8 and FORMAT_A8P8 texels and expects each entry
	// to hold red in the lowest byte, followed by green, blue and alpha. It
	// starts out null; read() asserts that it has been set before using it.
	unsigned int *Surface::palette = 0;
	// Starts at zero. NOTE(review): presumably identifies the currently
	// installed palette so users can detect changes — confirm against the header.
	unsigned int Surface::paletteID = 0;
44
45	void Surface::Buffer::write(int x, int y, int z, const Color<float> &color)
46	{
47		byte *element = (byte*)buffer + (x + border) * bytes + (y + border) * pitchB + z * samples * sliceB;
48
49		for(int i = 0; i < samples; i++)
50		{
51			write(element, color);
52			element += sliceB;
53		}
54	}
55
56	void Surface::Buffer::write(int x, int y, const Color<float> &color)
57	{
58		byte *element = (byte*)buffer + (x + border) * bytes + (y + border) * pitchB;
59
60		for(int i = 0; i < samples; i++)
61		{
62			write(element, color);
63			element += sliceB;
64		}
65	}
66
67	inline void Surface::Buffer::write(void *element, const Color<float> &color)
68	{
69		float r = color.r;
70		float g = color.g;
71		float b = color.b;
72		float a = color.a;
73
74		if(isSRGBformat(format))
75		{
76			r = linearToSRGB(r);
77			g = linearToSRGB(g);
78			b = linearToSRGB(b);
79		}
80
81		switch(format)
82		{
83		case FORMAT_A8:
84			*(unsigned char*)element = unorm<8>(a);
85			break;
86		case FORMAT_R8_SNORM:
87			*(char*)element = snorm<8>(r);
88			break;
89		case FORMAT_R8:
90			*(unsigned char*)element = unorm<8>(r);
91			break;
92		case FORMAT_R8I:
93			*(char*)element = scast<8>(r);
94			break;
95		case FORMAT_R8UI:
96			*(unsigned char*)element = ucast<8>(r);
97			break;
98		case FORMAT_R16I:
99			*(short*)element = scast<16>(r);
100			break;
101		case FORMAT_R16UI:
102			*(unsigned short*)element = ucast<16>(r);
103			break;
104		case FORMAT_R32I:
105			*(int*)element = static_cast<int>(r);
106			break;
107		case FORMAT_R32UI:
108			*(unsigned int*)element = static_cast<unsigned int>(r);
109			break;
110		case FORMAT_R3G3B2:
111			*(unsigned char*)element = (unorm<3>(r) << 5) | (unorm<3>(g) << 2) | (unorm<2>(b) << 0);
112			break;
113		case FORMAT_A8R3G3B2:
114			*(unsigned short*)element = (unorm<8>(a) << 8) | (unorm<3>(r) << 5) | (unorm<3>(g) << 2) | (unorm<2>(b) << 0);
115			break;
116		case FORMAT_X4R4G4B4:
117			*(unsigned short*)element = 0xF000 | (unorm<4>(r) << 8) | (unorm<4>(g) << 4) | (unorm<4>(b) << 0);
118			break;
119		case FORMAT_A4R4G4B4:
120			*(unsigned short*)element = (unorm<4>(a) << 12) | (unorm<4>(r) << 8) | (unorm<4>(g) << 4) | (unorm<4>(b) << 0);
121			break;
122		case FORMAT_R4G4B4A4:
123			*(unsigned short*)element = (unorm<4>(r) << 12) | (unorm<4>(g) << 8) | (unorm<4>(b) << 4) | (unorm<4>(a) << 0);
124			break;
125		case FORMAT_R5G6B5:
126			*(unsigned short*)element = (unorm<5>(r) << 11) | (unorm<6>(g) << 5) | (unorm<5>(b) << 0);
127			break;
128		case FORMAT_A1R5G5B5:
129			*(unsigned short*)element = (unorm<1>(a) << 15) | (unorm<5>(r) << 10) | (unorm<5>(g) << 5) | (unorm<5>(b) << 0);
130			break;
131		case FORMAT_R5G5B5A1:
132			*(unsigned short*)element = (unorm<5>(r) << 11) | (unorm<5>(g) << 6) | (unorm<5>(b) << 1) | (unorm<5>(a) << 0);
133			break;
134		case FORMAT_X1R5G5B5:
135			*(unsigned short*)element = 0x8000 | (unorm<5>(r) << 10) | (unorm<5>(g) << 5) | (unorm<5>(b) << 0);
136			break;
137		case FORMAT_A8R8G8B8:
138			*(unsigned int*)element = (unorm<8>(a) << 24) | (unorm<8>(r) << 16) | (unorm<8>(g) << 8) | (unorm<8>(b) << 0);
139			break;
140		case FORMAT_X8R8G8B8:
141			*(unsigned int*)element = 0xFF000000 | (unorm<8>(r) << 16) | (unorm<8>(g) << 8) | (unorm<8>(b) << 0);
142			break;
143		case FORMAT_A8B8G8R8_SNORM:
144			*(unsigned int*)element = (static_cast<unsigned int>(snorm<8>(a)) << 24) |
145			                          (static_cast<unsigned int>(snorm<8>(b)) << 16) |
146			                          (static_cast<unsigned int>(snorm<8>(g)) << 8) |
147			                          (static_cast<unsigned int>(snorm<8>(r)) << 0);
148			break;
149		case FORMAT_A8B8G8R8:
150		case FORMAT_SRGB8_A8:
151			*(unsigned int*)element = (unorm<8>(a) << 24) | (unorm<8>(b) << 16) | (unorm<8>(g) << 8) | (unorm<8>(r) << 0);
152			break;
153		case FORMAT_A8B8G8R8I:
154			*(unsigned int*)element = (static_cast<unsigned int>(scast<8>(a)) << 24) |
155			                          (static_cast<unsigned int>(scast<8>(b)) << 16) |
156			                          (static_cast<unsigned int>(scast<8>(g)) << 8) |
157			                          (static_cast<unsigned int>(scast<8>(r)) << 0);
158			break;
159		case FORMAT_A8B8G8R8UI:
160			*(unsigned int*)element = (ucast<8>(a) << 24) | (ucast<8>(b) << 16) | (ucast<8>(g) << 8) | (ucast<8>(r) << 0);
161			break;
162		case FORMAT_X8B8G8R8_SNORM:
163			*(unsigned int*)element = 0x7F000000 |
164			                          (static_cast<unsigned int>(snorm<8>(b)) << 16) |
165			                          (static_cast<unsigned int>(snorm<8>(g)) << 8) |
166			                          (static_cast<unsigned int>(snorm<8>(r)) << 0);
167			break;
168		case FORMAT_X8B8G8R8:
169		case FORMAT_SRGB8_X8:
170			*(unsigned int*)element = 0xFF000000 | (unorm<8>(b) << 16) | (unorm<8>(g) << 8) | (unorm<8>(r) << 0);
171			break;
172		case FORMAT_X8B8G8R8I:
173			*(unsigned int*)element = 0x7F000000 |
174			                          (static_cast<unsigned int>(scast<8>(b)) << 16) |
175			                          (static_cast<unsigned int>(scast<8>(g)) << 8) |
176			                          (static_cast<unsigned int>(scast<8>(r)) << 0);
177		case FORMAT_X8B8G8R8UI:
178			*(unsigned int*)element = 0xFF000000 | (ucast<8>(b) << 16) | (ucast<8>(g) << 8) | (ucast<8>(r) << 0);
179			break;
180		case FORMAT_A2R10G10B10:
181			*(unsigned int*)element = (unorm<2>(a) << 30) | (unorm<10>(r) << 20) | (unorm<10>(g) << 10) | (unorm<10>(b) << 0);
182			break;
183		case FORMAT_A2B10G10R10:
184		case FORMAT_A2B10G10R10UI:
185			*(unsigned int*)element = (unorm<2>(a) << 30) | (unorm<10>(b) << 20) | (unorm<10>(g) << 10) | (unorm<10>(r) << 0);
186			break;
187		case FORMAT_G8R8_SNORM:
188			*(unsigned short*)element = (static_cast<unsigned short>(snorm<8>(g)) << 8) |
189			                            (static_cast<unsigned short>(snorm<8>(r)) << 0);
190			break;
191		case FORMAT_G8R8:
192			*(unsigned short*)element = (unorm<8>(g) << 8) | (unorm<8>(r) << 0);
193			break;
194		case FORMAT_G8R8I:
195			*(unsigned short*)element = (static_cast<unsigned short>(scast<8>(g)) << 8) |
196			                            (static_cast<unsigned short>(scast<8>(r)) << 0);
197			break;
198		case FORMAT_G8R8UI:
199			*(unsigned short*)element = (ucast<8>(g) << 8) | (ucast<8>(r) << 0);
200			break;
201		case FORMAT_G16R16:
202			*(unsigned int*)element = (unorm<16>(g) << 16) | (unorm<16>(r) << 0);
203			break;
204		case FORMAT_G16R16I:
205			*(unsigned int*)element = (static_cast<unsigned int>(scast<16>(g)) << 16) |
206			                          (static_cast<unsigned int>(scast<16>(r)) << 0);
207			break;
208		case FORMAT_G16R16UI:
209			*(unsigned int*)element = (ucast<16>(g) << 16) | (ucast<16>(r) << 0);
210			break;
211		case FORMAT_G32R32I:
212		case FORMAT_G32R32UI:
213			((unsigned int*)element)[0] = static_cast<unsigned int>(r);
214			((unsigned int*)element)[1] = static_cast<unsigned int>(g);
215			break;
216		case FORMAT_A16B16G16R16:
217			((unsigned short*)element)[0] = unorm<16>(r);
218			((unsigned short*)element)[1] = unorm<16>(g);
219			((unsigned short*)element)[2] = unorm<16>(b);
220			((unsigned short*)element)[3] = unorm<16>(a);
221			break;
222		case FORMAT_A16B16G16R16I:
223			((unsigned short*)element)[0] = static_cast<unsigned short>(scast<16>(r));
224			((unsigned short*)element)[1] = static_cast<unsigned short>(scast<16>(g));
225			((unsigned short*)element)[2] = static_cast<unsigned short>(scast<16>(b));
226			((unsigned short*)element)[3] = static_cast<unsigned short>(scast<16>(a));
227			break;
228		case FORMAT_A16B16G16R16UI:
229			((unsigned short*)element)[0] = static_cast<unsigned short>(ucast<16>(r));
230			((unsigned short*)element)[1] = static_cast<unsigned short>(ucast<16>(g));
231			((unsigned short*)element)[2] = static_cast<unsigned short>(ucast<16>(b));
232			((unsigned short*)element)[3] = static_cast<unsigned short>(ucast<16>(a));
233			break;
234		case FORMAT_X16B16G16R16I:
235			((unsigned short*)element)[0] = static_cast<unsigned short>(scast<16>(r));
236			((unsigned short*)element)[1] = static_cast<unsigned short>(scast<16>(g));
237			((unsigned short*)element)[2] = static_cast<unsigned short>(scast<16>(b));
238			break;
239		case FORMAT_X16B16G16R16UI:
240			((unsigned short*)element)[0] = static_cast<unsigned short>(ucast<16>(r));
241			((unsigned short*)element)[1] = static_cast<unsigned short>(ucast<16>(g));
242			((unsigned short*)element)[2] = static_cast<unsigned short>(ucast<16>(b));
243			break;
244		case FORMAT_A32B32G32R32I:
245		case FORMAT_A32B32G32R32UI:
246			((unsigned int*)element)[0] = static_cast<unsigned int>(r);
247			((unsigned int*)element)[1] = static_cast<unsigned int>(g);
248			((unsigned int*)element)[2] = static_cast<unsigned int>(b);
249			((unsigned int*)element)[3] = static_cast<unsigned int>(a);
250			break;
251		case FORMAT_X32B32G32R32I:
252		case FORMAT_X32B32G32R32UI:
253			((unsigned int*)element)[0] = static_cast<unsigned int>(r);
254			((unsigned int*)element)[1] = static_cast<unsigned int>(g);
255			((unsigned int*)element)[2] = static_cast<unsigned int>(b);
256			break;
257		case FORMAT_V8U8:
258			*(unsigned short*)element = (snorm<8>(g) << 8) | (snorm<8>(r) << 0);
259			break;
260		case FORMAT_L6V5U5:
261			*(unsigned short*)element = (unorm<6>(b) << 10) | (snorm<5>(g) << 5) | (snorm<5>(r) << 0);
262			break;
263		case FORMAT_Q8W8V8U8:
264			*(unsigned int*)element = (snorm<8>(a) << 24) | (snorm<8>(b) << 16) | (snorm<8>(g) << 8) | (snorm<8>(r) << 0);
265			break;
266		case FORMAT_X8L8V8U8:
267			*(unsigned int*)element = 0xFF000000 | (unorm<8>(b) << 16) | (snorm<8>(g) << 8) | (snorm<8>(r) << 0);
268			break;
269		case FORMAT_V16U16:
270			*(unsigned int*)element = (snorm<16>(g) << 16) | (snorm<16>(r) << 0);
271			break;
272		case FORMAT_A2W10V10U10:
273			*(unsigned int*)element = (unorm<2>(a) << 30) | (snorm<10>(b) << 20) | (snorm<10>(g) << 10) | (snorm<10>(r) << 0);
274			break;
275		case FORMAT_A16W16V16U16:
276			((unsigned short*)element)[0] = snorm<16>(r);
277			((unsigned short*)element)[1] = snorm<16>(g);
278			((unsigned short*)element)[2] = snorm<16>(b);
279			((unsigned short*)element)[3] = unorm<16>(a);
280			break;
281		case FORMAT_Q16W16V16U16:
282			((unsigned short*)element)[0] = snorm<16>(r);
283			((unsigned short*)element)[1] = snorm<16>(g);
284			((unsigned short*)element)[2] = snorm<16>(b);
285			((unsigned short*)element)[3] = snorm<16>(a);
286			break;
287		case FORMAT_R8G8B8:
288			((unsigned char*)element)[0] = unorm<8>(b);
289			((unsigned char*)element)[1] = unorm<8>(g);
290			((unsigned char*)element)[2] = unorm<8>(r);
291			break;
292		case FORMAT_B8G8R8:
293			((unsigned char*)element)[0] = unorm<8>(r);
294			((unsigned char*)element)[1] = unorm<8>(g);
295			((unsigned char*)element)[2] = unorm<8>(b);
296			break;
297		case FORMAT_R16F:
298			*(half*)element = (half)r;
299			break;
300		case FORMAT_A16F:
301			*(half*)element = (half)a;
302			break;
303		case FORMAT_G16R16F:
304			((half*)element)[0] = (half)r;
305			((half*)element)[1] = (half)g;
306			break;
307		case FORMAT_X16B16G16R16F_UNSIGNED:
308			r = max(r, 0.0f); g = max(g, 0.0f); b = max(b, 0.0f);
309			// Fall through to FORMAT_X16B16G16R16F.
310		case FORMAT_X16B16G16R16F:
311			((half*)element)[3] = 1.0f;
312			// Fall through to FORMAT_B16G16R16F.
313		case FORMAT_B16G16R16F:
314			((half*)element)[0] = (half)r;
315			((half*)element)[1] = (half)g;
316			((half*)element)[2] = (half)b;
317			break;
318		case FORMAT_A16B16G16R16F:
319			((half*)element)[0] = (half)r;
320			((half*)element)[1] = (half)g;
321			((half*)element)[2] = (half)b;
322			((half*)element)[3] = (half)a;
323			break;
324		case FORMAT_A32F:
325			*(float*)element = a;
326			break;
327		case FORMAT_R32F:
328			*(float*)element = r;
329			break;
330		case FORMAT_G32R32F:
331			((float*)element)[0] = r;
332			((float*)element)[1] = g;
333			break;
334		case FORMAT_X32B32G32R32F_UNSIGNED:
335			r = max(r, 0.0f); g = max(g, 0.0f); b = max(b, 0.0f);
336			// Fall through to FORMAT_X32B32G32R32F.
337		case FORMAT_X32B32G32R32F:
338			((float*)element)[3] = 1.0f;
339			// Fall through to FORMAT_B32G32R32F.
340		case FORMAT_B32G32R32F:
341			((float*)element)[0] = r;
342			((float*)element)[1] = g;
343			((float*)element)[2] = b;
344			break;
345		case FORMAT_A32B32G32R32F:
346			((float*)element)[0] = r;
347			((float*)element)[1] = g;
348			((float*)element)[2] = b;
349			((float*)element)[3] = a;
350			break;
351		case FORMAT_D32F:
352		case FORMAT_D32FS8:
353		case FORMAT_D32F_LOCKABLE:
354		case FORMAT_D32FS8_TEXTURE:
355		case FORMAT_D32F_SHADOW:
356		case FORMAT_D32FS8_SHADOW:
357			*((float*)element) = r;
358			break;
359		case FORMAT_D32F_COMPLEMENTARY:
360		case FORMAT_D32FS8_COMPLEMENTARY:
361			*((float*)element) = 1 - r;
362			break;
363		case FORMAT_S8:
364			*((unsigned char*)element) = unorm<8>(r);
365			break;
366		case FORMAT_L8:
367			*(unsigned char*)element = unorm<8>(r);
368			break;
369		case FORMAT_A4L4:
370			*(unsigned char*)element = (unorm<4>(a) << 4) | (unorm<4>(r) << 0);
371			break;
372		case FORMAT_L16:
373			*(unsigned short*)element = unorm<16>(r);
374			break;
375		case FORMAT_A8L8:
376			*(unsigned short*)element = (unorm<8>(a) << 8) | (unorm<8>(r) << 0);
377			break;
378		case FORMAT_L16F:
379			*(half*)element = (half)r;
380			break;
381		case FORMAT_A16L16F:
382			((half*)element)[0] = (half)r;
383			((half*)element)[1] = (half)a;
384			break;
385		case FORMAT_L32F:
386			*(float*)element = r;
387			break;
388		case FORMAT_A32L32F:
389			((float*)element)[0] = r;
390			((float*)element)[1] = a;
391			break;
392		default:
393			ASSERT(false);
394		}
395	}
396
397	Color<float> Surface::Buffer::read(int x, int y, int z) const
398	{
399		void *element = (unsigned char*)buffer + (x + border) * bytes + (y + border) * pitchB + z * samples * sliceB;
400
401		return read(element);
402	}
403
404	Color<float> Surface::Buffer::read(int x, int y) const
405	{
406		void *element = (unsigned char*)buffer + (x + border) * bytes + (y + border) * pitchB;
407
408		return read(element);
409	}
410
411	inline Color<float> Surface::Buffer::read(void *element) const
412	{
413		float r = 0.0f;
414		float g = 0.0f;
415		float b = 0.0f;
416		float a = 1.0f;
417
418		switch(format)
419		{
420		case FORMAT_P8:
421			{
422				ASSERT(palette);
423
424				unsigned int abgr = palette[*(unsigned char*)element];
425
426				r = (abgr & 0x000000FF) * (1.0f / 0x000000FF);
427				g = (abgr & 0x0000FF00) * (1.0f / 0x0000FF00);
428				b = (abgr & 0x00FF0000) * (1.0f / 0x00FF0000);
429				a = (abgr & 0xFF000000) * (1.0f / 0xFF000000);
430			}
431			break;
432		case FORMAT_A8P8:
433			{
434				ASSERT(palette);
435
436				unsigned int bgr = palette[((unsigned char*)element)[0]];
437
438				r = (bgr & 0x000000FF) * (1.0f / 0x000000FF);
439				g = (bgr & 0x0000FF00) * (1.0f / 0x0000FF00);
440				b = (bgr & 0x00FF0000) * (1.0f / 0x00FF0000);
441				a = ((unsigned char*)element)[1] * (1.0f / 0xFF);
442			}
443			break;
444		case FORMAT_A8:
445			r = 0;
446			g = 0;
447			b = 0;
448			a = *(unsigned char*)element * (1.0f / 0xFF);
449			break;
450		case FORMAT_R8_SNORM:
451			r = max((*(signed char*)element) * (1.0f / 0x7F), -1.0f);
452			break;
453		case FORMAT_R8:
454			r = *(unsigned char*)element * (1.0f / 0xFF);
455			break;
456		case FORMAT_R8I:
457			r = *(signed char*)element;
458			break;
459		case FORMAT_R8UI:
460			r = *(unsigned char*)element;
461			break;
462		case FORMAT_R3G3B2:
463			{
464				unsigned char rgb = *(unsigned char*)element;
465
466				r = (rgb & 0xE0) * (1.0f / 0xE0);
467				g = (rgb & 0x1C) * (1.0f / 0x1C);
468				b = (rgb & 0x03) * (1.0f / 0x03);
469			}
470			break;
471		case FORMAT_A8R3G3B2:
472			{
473				unsigned short argb = *(unsigned short*)element;
474
475				a = (argb & 0xFF00) * (1.0f / 0xFF00);
476				r = (argb & 0x00E0) * (1.0f / 0x00E0);
477				g = (argb & 0x001C) * (1.0f / 0x001C);
478				b = (argb & 0x0003) * (1.0f / 0x0003);
479			}
480			break;
481		case FORMAT_X4R4G4B4:
482			{
483				unsigned short rgb = *(unsigned short*)element;
484
485				r = (rgb & 0x0F00) * (1.0f / 0x0F00);
486				g = (rgb & 0x00F0) * (1.0f / 0x00F0);
487				b = (rgb & 0x000F) * (1.0f / 0x000F);
488			}
489			break;
490		case FORMAT_A4R4G4B4:
491			{
492				unsigned short argb = *(unsigned short*)element;
493
494				a = (argb & 0xF000) * (1.0f / 0xF000);
495				r = (argb & 0x0F00) * (1.0f / 0x0F00);
496				g = (argb & 0x00F0) * (1.0f / 0x00F0);
497				b = (argb & 0x000F) * (1.0f / 0x000F);
498			}
499			break;
500		case FORMAT_R4G4B4A4:
501			{
502				unsigned short rgba = *(unsigned short*)element;
503
504				r = (rgba & 0xF000) * (1.0f / 0xF000);
505				g = (rgba & 0x0F00) * (1.0f / 0x0F00);
506				b = (rgba & 0x00F0) * (1.0f / 0x00F0);
507				a = (rgba & 0x000F) * (1.0f / 0x000F);
508			}
509			break;
510		case FORMAT_R5G6B5:
511			{
512				unsigned short rgb = *(unsigned short*)element;
513
514				r = (rgb & 0xF800) * (1.0f / 0xF800);
515				g = (rgb & 0x07E0) * (1.0f / 0x07E0);
516				b = (rgb & 0x001F) * (1.0f / 0x001F);
517			}
518			break;
519		case FORMAT_A1R5G5B5:
520			{
521				unsigned short argb = *(unsigned short*)element;
522
523				a = (argb & 0x8000) * (1.0f / 0x8000);
524				r = (argb & 0x7C00) * (1.0f / 0x7C00);
525				g = (argb & 0x03E0) * (1.0f / 0x03E0);
526				b = (argb & 0x001F) * (1.0f / 0x001F);
527			}
528			break;
529		case FORMAT_R5G5B5A1:
530			{
531				unsigned short rgba = *(unsigned short*)element;
532
533				r = (rgba & 0xF800) * (1.0f / 0xF800);
534				g = (rgba & 0x07C0) * (1.0f / 0x07C0);
535				b = (rgba & 0x003E) * (1.0f / 0x003E);
536				a = (rgba & 0x0001) * (1.0f / 0x0001);
537			}
538			break;
539		case FORMAT_X1R5G5B5:
540			{
541				unsigned short xrgb = *(unsigned short*)element;
542
543				r = (xrgb & 0x7C00) * (1.0f / 0x7C00);
544				g = (xrgb & 0x03E0) * (1.0f / 0x03E0);
545				b = (xrgb & 0x001F) * (1.0f / 0x001F);
546			}
547			break;
548		case FORMAT_A8R8G8B8:
549			{
550				unsigned int argb = *(unsigned int*)element;
551
552				a = (argb & 0xFF000000) * (1.0f / 0xFF000000);
553				r = (argb & 0x00FF0000) * (1.0f / 0x00FF0000);
554				g = (argb & 0x0000FF00) * (1.0f / 0x0000FF00);
555				b = (argb & 0x000000FF) * (1.0f / 0x000000FF);
556			}
557			break;
558		case FORMAT_X8R8G8B8:
559			{
560				unsigned int xrgb = *(unsigned int*)element;
561
562				r = (xrgb & 0x00FF0000) * (1.0f / 0x00FF0000);
563				g = (xrgb & 0x0000FF00) * (1.0f / 0x0000FF00);
564				b = (xrgb & 0x000000FF) * (1.0f / 0x000000FF);
565			}
566			break;
567		case FORMAT_A8B8G8R8_SNORM:
568			{
569				signed char* abgr = (signed char*)element;
570
571				r = max(abgr[0] * (1.0f / 0x7F), -1.0f);
572				g = max(abgr[1] * (1.0f / 0x7F), -1.0f);
573				b = max(abgr[2] * (1.0f / 0x7F), -1.0f);
574				a = max(abgr[3] * (1.0f / 0x7F), -1.0f);
575			}
576			break;
577		case FORMAT_A8B8G8R8:
578		case FORMAT_SRGB8_A8:
579			{
580				unsigned int abgr = *(unsigned int*)element;
581
582				a = (abgr & 0xFF000000) * (1.0f / 0xFF000000);
583				b = (abgr & 0x00FF0000) * (1.0f / 0x00FF0000);
584				g = (abgr & 0x0000FF00) * (1.0f / 0x0000FF00);
585				r = (abgr & 0x000000FF) * (1.0f / 0x000000FF);
586			}
587			break;
588		case FORMAT_A8B8G8R8I:
589			{
590				signed char* abgr = (signed char*)element;
591
592				r = abgr[0];
593				g = abgr[1];
594				b = abgr[2];
595				a = abgr[3];
596			}
597			break;
598		case FORMAT_A8B8G8R8UI:
599			{
600				unsigned char* abgr = (unsigned char*)element;
601
602				r = abgr[0];
603				g = abgr[1];
604				b = abgr[2];
605				a = abgr[3];
606			}
607			break;
608		case FORMAT_X8B8G8R8_SNORM:
609			{
610				signed char* bgr = (signed char*)element;
611
612				r = max(bgr[0] * (1.0f / 0x7F), -1.0f);
613				g = max(bgr[1] * (1.0f / 0x7F), -1.0f);
614				b = max(bgr[2] * (1.0f / 0x7F), -1.0f);
615			}
616			break;
617		case FORMAT_X8B8G8R8:
618		case FORMAT_SRGB8_X8:
619			{
620				unsigned int xbgr = *(unsigned int*)element;
621
622				b = (xbgr & 0x00FF0000) * (1.0f / 0x00FF0000);
623				g = (xbgr & 0x0000FF00) * (1.0f / 0x0000FF00);
624				r = (xbgr & 0x000000FF) * (1.0f / 0x000000FF);
625			}
626			break;
627		case FORMAT_X8B8G8R8I:
628			{
629				signed char* bgr = (signed char*)element;
630
631				r = bgr[0];
632				g = bgr[1];
633				b = bgr[2];
634			}
635			break;
636		case FORMAT_X8B8G8R8UI:
637			{
638				unsigned char* bgr = (unsigned char*)element;
639
640				r = bgr[0];
641				g = bgr[1];
642				b = bgr[2];
643			}
644			break;
645		case FORMAT_G8R8_SNORM:
646			{
647				signed char* gr = (signed char*)element;
648
649				r = (gr[0] & 0xFF00) * (1.0f / 0xFF00);
650				g = (gr[1] & 0x00FF) * (1.0f / 0x00FF);
651			}
652			break;
653		case FORMAT_G8R8:
654			{
655				unsigned short gr = *(unsigned short*)element;
656
657				g = (gr & 0xFF00) * (1.0f / 0xFF00);
658				r = (gr & 0x00FF) * (1.0f / 0x00FF);
659			}
660			break;
661		case FORMAT_G8R8I:
662			{
663				signed char* gr = (signed char*)element;
664
665				r = gr[0];
666				g = gr[1];
667			}
668			break;
669		case FORMAT_G8R8UI:
670			{
671				unsigned char* gr = (unsigned char*)element;
672
673				r = gr[0];
674				g = gr[1];
675			}
676			break;
677		case FORMAT_R16I:
678			r = *((short*)element);
679			break;
680		case FORMAT_R16UI:
681			r = *((unsigned short*)element);
682			break;
683		case FORMAT_G16R16I:
684			{
685				short* gr = (short*)element;
686
687				r = gr[0];
688				g = gr[1];
689			}
690			break;
691		case FORMAT_G16R16:
692			{
693				unsigned int gr = *(unsigned int*)element;
694
695				g = (gr & 0xFFFF0000) * (1.0f / 0xFFFF0000);
696				r = (gr & 0x0000FFFF) * (1.0f / 0x0000FFFF);
697			}
698			break;
699		case FORMAT_G16R16UI:
700			{
701				unsigned short* gr = (unsigned short*)element;
702
703				r = gr[0];
704				g = gr[1];
705			}
706			break;
707		case FORMAT_A2R10G10B10:
708			{
709				unsigned int argb = *(unsigned int*)element;
710
711				a = (argb & 0xC0000000) * (1.0f / 0xC0000000);
712				r = (argb & 0x3FF00000) * (1.0f / 0x3FF00000);
713				g = (argb & 0x000FFC00) * (1.0f / 0x000FFC00);
714				b = (argb & 0x000003FF) * (1.0f / 0x000003FF);
715			}
716			break;
717		case FORMAT_A2B10G10R10:
718			{
719				unsigned int abgr = *(unsigned int*)element;
720
721				a = (abgr & 0xC0000000) * (1.0f / 0xC0000000);
722				b = (abgr & 0x3FF00000) * (1.0f / 0x3FF00000);
723				g = (abgr & 0x000FFC00) * (1.0f / 0x000FFC00);
724				r = (abgr & 0x000003FF) * (1.0f / 0x000003FF);
725			}
726			break;
727		case FORMAT_A2B10G10R10UI:
728			{
729				unsigned int abgr = *(unsigned int*)element;
730
731				a = static_cast<float>((abgr & 0xC0000000) >> 30);
732				b = static_cast<float>((abgr & 0x3FF00000) >> 20);
733				g = static_cast<float>((abgr & 0x000FFC00) >> 10);
734				r = static_cast<float>(abgr & 0x000003FF);
735			}
736			break;
737		case FORMAT_A16B16G16R16I:
738			{
739				short* abgr = (short*)element;
740
741				r = abgr[0];
742				g = abgr[1];
743				b = abgr[2];
744				a = abgr[3];
745			}
746			break;
747		case FORMAT_A16B16G16R16:
748			r = ((unsigned short*)element)[0] * (1.0f / 0xFFFF);
749			g = ((unsigned short*)element)[1] * (1.0f / 0xFFFF);
750			b = ((unsigned short*)element)[2] * (1.0f / 0xFFFF);
751			a = ((unsigned short*)element)[3] * (1.0f / 0xFFFF);
752			break;
753		case FORMAT_A16B16G16R16UI:
754			{
755				unsigned short* abgr = (unsigned short*)element;
756
757				r = abgr[0];
758				g = abgr[1];
759				b = abgr[2];
760				a = abgr[3];
761			}
762			break;
763		case FORMAT_X16B16G16R16I:
764			{
765				short* bgr = (short*)element;
766
767				r = bgr[0];
768				g = bgr[1];
769				b = bgr[2];
770			}
771			break;
772		case FORMAT_X16B16G16R16UI:
773			{
774				unsigned short* bgr = (unsigned short*)element;
775
776				r = bgr[0];
777				g = bgr[1];
778				b = bgr[2];
779			}
780			break;
781		case FORMAT_A32B32G32R32I:
782			{
783				int* abgr = (int*)element;
784
785				r = static_cast<float>(abgr[0]);
786				g = static_cast<float>(abgr[1]);
787				b = static_cast<float>(abgr[2]);
788				a = static_cast<float>(abgr[3]);
789			}
790			break;
791		case FORMAT_A32B32G32R32UI:
792			{
793				unsigned int* abgr = (unsigned int*)element;
794
795				r = static_cast<float>(abgr[0]);
796				g = static_cast<float>(abgr[1]);
797				b = static_cast<float>(abgr[2]);
798				a = static_cast<float>(abgr[3]);
799			}
800			break;
801		case FORMAT_X32B32G32R32I:
802			{
803				int* bgr = (int*)element;
804
805				r = static_cast<float>(bgr[0]);
806				g = static_cast<float>(bgr[1]);
807				b = static_cast<float>(bgr[2]);
808			}
809			break;
810		case FORMAT_X32B32G32R32UI:
811			{
812				unsigned int* bgr = (unsigned int*)element;
813
814				r = static_cast<float>(bgr[0]);
815				g = static_cast<float>(bgr[1]);
816				b = static_cast<float>(bgr[2]);
817			}
818			break;
819		case FORMAT_G32R32I:
820			{
821				int* gr = (int*)element;
822
823				r = static_cast<float>(gr[0]);
824				g = static_cast<float>(gr[1]);
825			}
826			break;
827		case FORMAT_G32R32UI:
828			{
829				unsigned int* gr = (unsigned int*)element;
830
831				r = static_cast<float>(gr[0]);
832				g = static_cast<float>(gr[1]);
833			}
834			break;
835		case FORMAT_R32I:
836			r = static_cast<float>(*((int*)element));
837			break;
838		case FORMAT_R32UI:
839			r = static_cast<float>(*((unsigned int*)element));
840			break;
841		case FORMAT_V8U8:
842			{
843				unsigned short vu = *(unsigned short*)element;
844
845				r = ((int)(vu & 0x00FF) << 24) * (1.0f / 0x7F000000);
846				g = ((int)(vu & 0xFF00) << 16) * (1.0f / 0x7F000000);
847			}
848			break;
849		case FORMAT_L6V5U5:
850			{
851				unsigned short lvu = *(unsigned short*)element;
852
853				r = ((int)(lvu & 0x001F) << 27) * (1.0f / 0x78000000);
854				g = ((int)(lvu & 0x03E0) << 22) * (1.0f / 0x78000000);
855				b = (lvu & 0xFC00) * (1.0f / 0xFC00);
856			}
857			break;
858		case FORMAT_Q8W8V8U8:
859			{
860				unsigned int qwvu = *(unsigned int*)element;
861
862				r = ((int)(qwvu & 0x000000FF) << 24) * (1.0f / 0x7F000000);
863				g = ((int)(qwvu & 0x0000FF00) << 16) * (1.0f / 0x7F000000);
864				b = ((int)(qwvu & 0x00FF0000) << 8)  * (1.0f / 0x7F000000);
865				a = ((int)(qwvu & 0xFF000000) << 0)  * (1.0f / 0x7F000000);
866			}
867			break;
868		case FORMAT_X8L8V8U8:
869			{
870				unsigned int xlvu = *(unsigned int*)element;
871
872				r = ((int)(xlvu & 0x000000FF) << 24) * (1.0f / 0x7F000000);
873				g = ((int)(xlvu & 0x0000FF00) << 16) * (1.0f / 0x7F000000);
874				b = (xlvu & 0x00FF0000) * (1.0f / 0x00FF0000);
875			}
876			break;
877		case FORMAT_R8G8B8:
878			r = ((unsigned char*)element)[2] * (1.0f / 0xFF);
879			g = ((unsigned char*)element)[1] * (1.0f / 0xFF);
880			b = ((unsigned char*)element)[0] * (1.0f / 0xFF);
881			break;
882		case FORMAT_B8G8R8:
883			r = ((unsigned char*)element)[0] * (1.0f / 0xFF);
884			g = ((unsigned char*)element)[1] * (1.0f / 0xFF);
885			b = ((unsigned char*)element)[2] * (1.0f / 0xFF);
886			break;
887		case FORMAT_V16U16:
888			{
889				unsigned int vu = *(unsigned int*)element;
890
891				r = ((int)(vu & 0x0000FFFF) << 16) * (1.0f / 0x7FFF0000);
892				g = ((int)(vu & 0xFFFF0000) << 0)  * (1.0f / 0x7FFF0000);
893			}
894			break;
895		case FORMAT_A2W10V10U10:
896			{
897				unsigned int awvu = *(unsigned int*)element;
898
899				r = ((int)(awvu & 0x000003FF) << 22) * (1.0f / 0x7FC00000);
900				g = ((int)(awvu & 0x000FFC00) << 12) * (1.0f / 0x7FC00000);
901				b = ((int)(awvu & 0x3FF00000) << 2)  * (1.0f / 0x7FC00000);
902				a = (awvu & 0xC0000000) * (1.0f / 0xC0000000);
903			}
904			break;
905		case FORMAT_A16W16V16U16:
906			r = ((signed short*)element)[0] * (1.0f / 0x7FFF);
907			g = ((signed short*)element)[1] * (1.0f / 0x7FFF);
908			b = ((signed short*)element)[2] * (1.0f / 0x7FFF);
909			a = ((unsigned short*)element)[3] * (1.0f / 0xFFFF);
910			break;
911		case FORMAT_Q16W16V16U16:
912			r = ((signed short*)element)[0] * (1.0f / 0x7FFF);
913			g = ((signed short*)element)[1] * (1.0f / 0x7FFF);
914			b = ((signed short*)element)[2] * (1.0f / 0x7FFF);
915			a = ((signed short*)element)[3] * (1.0f / 0x7FFF);
916			break;
917		case FORMAT_L8:
918			r =
919			g =
920			b = *(unsigned char*)element * (1.0f / 0xFF);
921			break;
922		case FORMAT_A4L4:
923			{
924				unsigned char al = *(unsigned char*)element;
925
926				r =
927				g =
928				b = (al & 0x0F) * (1.0f / 0x0F);
929				a = (al & 0xF0) * (1.0f / 0xF0);
930			}
931			break;
932		case FORMAT_L16:
933			r =
934			g =
935			b = *(unsigned short*)element * (1.0f / 0xFFFF);
936			break;
937		case FORMAT_A8L8:
938			r =
939			g =
940			b = ((unsigned char*)element)[0] * (1.0f / 0xFF);
941			a = ((unsigned char*)element)[1] * (1.0f / 0xFF);
942			break;
943		case FORMAT_L16F:
944			r =
945			g =
946			b = *(half*)element;
947			break;
948		case FORMAT_A16L16F:
949			r =
950			g =
951			b = ((half*)element)[0];
952			a = ((half*)element)[1];
953			break;
954		case FORMAT_L32F:
955			r =
956			g =
957			b = *(float*)element;
958			break;
959		case FORMAT_A32L32F:
960			r =
961			g =
962			b = ((float*)element)[0];
963			a = ((float*)element)[1];
964			break;
965		case FORMAT_A16F:
966			a = *(half*)element;
967			break;
968		case FORMAT_R16F:
969			r = *(half*)element;
970			break;
971		case FORMAT_G16R16F:
972			r = ((half*)element)[0];
973			g = ((half*)element)[1];
974			break;
975		case FORMAT_X16B16G16R16F:
976		case FORMAT_X16B16G16R16F_UNSIGNED:
977		case FORMAT_B16G16R16F:
978			r = ((half*)element)[0];
979			g = ((half*)element)[1];
980			b = ((half*)element)[2];
981			break;
982		case FORMAT_A16B16G16R16F:
983			r = ((half*)element)[0];
984			g = ((half*)element)[1];
985			b = ((half*)element)[2];
986			a = ((half*)element)[3];
987			break;
988		case FORMAT_A32F:
989			a = *(float*)element;
990			break;
991		case FORMAT_R32F:
992			r = *(float*)element;
993			break;
994		case FORMAT_G32R32F:
995			r = ((float*)element)[0];
996			g = ((float*)element)[1];
997			break;
998		case FORMAT_X32B32G32R32F:
999		case FORMAT_X32B32G32R32F_UNSIGNED:
1000		case FORMAT_B32G32R32F:
1001			r = ((float*)element)[0];
1002			g = ((float*)element)[1];
1003			b = ((float*)element)[2];
1004			break;
1005		case FORMAT_A32B32G32R32F:
1006			r = ((float*)element)[0];
1007			g = ((float*)element)[1];
1008			b = ((float*)element)[2];
1009			a = ((float*)element)[3];
1010			break;
1011		case FORMAT_D32F:
1012		case FORMAT_D32FS8:
1013		case FORMAT_D32F_LOCKABLE:
1014		case FORMAT_D32FS8_TEXTURE:
1015		case FORMAT_D32F_SHADOW:
1016		case FORMAT_D32FS8_SHADOW:
1017			r = *(float*)element;
1018			g = r;
1019			b = r;
1020			a = r;
1021			break;
1022		case FORMAT_D32F_COMPLEMENTARY:
1023		case FORMAT_D32FS8_COMPLEMENTARY:
1024			r = 1.0f - *(float*)element;
1025			g = r;
1026			b = r;
1027			a = r;
1028			break;
1029		case FORMAT_S8:
1030			r = *(unsigned char*)element * (1.0f / 0xFF);
1031			break;
1032		default:
1033			ASSERT(false);
1034		}
1035
1036		if(isSRGBformat(format))
1037		{
1038			r = sRGBtoLinear(r);
1039			g = sRGBtoLinear(g);
1040			b = sRGBtoLinear(b);
1041		}
1042
1043		return Color<float>(r, g, b, a);
1044	}
1045
1046	Color<float> Surface::Buffer::sample(float x, float y, float z) const
1047	{
1048		x -= 0.5f;
1049		y -= 0.5f;
1050		z -= 0.5f;
1051
1052		int x0 = clamp((int)x, 0, width - 1);
1053		int x1 = (x0 + 1 >= width) ? x0 : x0 + 1;
1054
1055		int y0 = clamp((int)y, 0, height - 1);
1056		int y1 = (y0 + 1 >= height) ? y0 : y0 + 1;
1057
1058		int z0 = clamp((int)z, 0, depth - 1);
1059		int z1 = (z0 + 1 >= depth) ? z0 : z0 + 1;
1060
1061		Color<float> c000 = read(x0, y0, z0);
1062		Color<float> c100 = read(x1, y0, z0);
1063		Color<float> c010 = read(x0, y1, z0);
1064		Color<float> c110 = read(x1, y1, z0);
1065		Color<float> c001 = read(x0, y0, z1);
1066		Color<float> c101 = read(x1, y0, z1);
1067		Color<float> c011 = read(x0, y1, z1);
1068		Color<float> c111 = read(x1, y1, z1);
1069
1070		float fx = x - x0;
1071		float fy = y - y0;
1072		float fz = z - z0;
1073
1074		c000 *= (1 - fx) * (1 - fy) * (1 - fz);
1075		c100 *= fx * (1 - fy) * (1 - fz);
1076		c010 *= (1 - fx) * fy * (1 - fz);
1077		c110 *= fx * fy * (1 - fz);
1078		c001 *= (1 - fx) * (1 - fy) * fz;
1079		c101 *= fx * (1 - fy) * fz;
1080		c011 *= (1 - fx) * fy * fz;
1081		c111 *= fx * fy * fz;
1082
1083		return c000 + c100 + c010 + c110 + c001 + c101 + c011 + c111;
1084	}
1085
1086	Color<float> Surface::Buffer::sample(float x, float y, int layer) const
1087	{
1088		x -= 0.5f;
1089		y -= 0.5f;
1090
1091		int x0 = clamp((int)x, 0, width - 1);
1092		int x1 = (x0 + 1 >= width) ? x0 : x0 + 1;
1093
1094		int y0 = clamp((int)y, 0, height - 1);
1095		int y1 = (y0 + 1 >= height) ? y0 : y0 + 1;
1096
1097		Color<float> c00 = read(x0, y0, layer);
1098		Color<float> c10 = read(x1, y0, layer);
1099		Color<float> c01 = read(x0, y1, layer);
1100		Color<float> c11 = read(x1, y1, layer);
1101
1102		float fx = x - x0;
1103		float fy = y - y0;
1104
1105		c00 *= (1 - fx) * (1 - fy);
1106		c10 *= fx * (1 - fy);
1107		c01 *= (1 - fx) * fy;
1108		c11 *= fx * fy;
1109
1110		return c00 + c10 + c01 + c11;
1111	}
1112
1113	void *Surface::Buffer::lockRect(int x, int y, int z, Lock lock)
1114	{
1115		this->lock = lock;
1116
1117		switch(lock)
1118		{
1119		case LOCK_UNLOCKED:
1120		case LOCK_READONLY:
1121		case LOCK_UPDATE:
1122			break;
1123		case LOCK_WRITEONLY:
1124		case LOCK_READWRITE:
1125		case LOCK_DISCARD:
1126			dirty = true;
1127			break;
1128		default:
1129			ASSERT(false);
1130		}
1131
1132		if(buffer)
1133		{
1134			x += border;
1135			y += border;
1136
1137			switch(format)
1138			{
1139			case FORMAT_DXT1:
1140			case FORMAT_ATI1:
1141			case FORMAT_ETC1:
1142			case FORMAT_R11_EAC:
1143			case FORMAT_SIGNED_R11_EAC:
1144			case FORMAT_RGB8_ETC2:
1145			case FORMAT_SRGB8_ETC2:
1146			case FORMAT_RGB8_PUNCHTHROUGH_ALPHA1_ETC2:
1147			case FORMAT_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2:
1148				return (unsigned char*)buffer + 8 * (x / 4) + (y / 4) * pitchB + z * sliceB;
1149			case FORMAT_RG11_EAC:
1150			case FORMAT_SIGNED_RG11_EAC:
1151			case FORMAT_RGBA8_ETC2_EAC:
1152			case FORMAT_SRGB8_ALPHA8_ETC2_EAC:
1153			case FORMAT_RGBA_ASTC_4x4_KHR:
1154			case FORMAT_SRGB8_ALPHA8_ASTC_4x4_KHR:
1155				return (unsigned char*)buffer + 16 * (x / 4) + (y / 4) * pitchB + z * sliceB;
1156			case FORMAT_RGBA_ASTC_5x4_KHR:
1157			case FORMAT_SRGB8_ALPHA8_ASTC_5x4_KHR:
1158				return (unsigned char*)buffer + 16 * (x / 5) + (y / 4) * pitchB + z * sliceB;
1159			case FORMAT_RGBA_ASTC_5x5_KHR:
1160			case FORMAT_SRGB8_ALPHA8_ASTC_5x5_KHR:
1161				return (unsigned char*)buffer + 16 * (x / 5) + (y / 5) * pitchB + z * sliceB;
1162			case FORMAT_RGBA_ASTC_6x5_KHR:
1163			case FORMAT_SRGB8_ALPHA8_ASTC_6x5_KHR:
1164				return (unsigned char*)buffer + 16 * (x / 6) + (y / 5) * pitchB + z * sliceB;
1165			case FORMAT_RGBA_ASTC_6x6_KHR:
1166			case FORMAT_SRGB8_ALPHA8_ASTC_6x6_KHR:
1167				return (unsigned char*)buffer + 16 * (x / 6) + (y / 6) * pitchB + z * sliceB;
1168			case FORMAT_RGBA_ASTC_8x5_KHR:
1169			case FORMAT_SRGB8_ALPHA8_ASTC_8x5_KHR:
1170				return (unsigned char*)buffer + 16 * (x / 8) + (y / 5) * pitchB + z * sliceB;
1171			case FORMAT_RGBA_ASTC_8x6_KHR:
1172			case FORMAT_SRGB8_ALPHA8_ASTC_8x6_KHR:
1173				return (unsigned char*)buffer + 16 * (x / 8) + (y / 6) * pitchB + z * sliceB;
1174			case FORMAT_RGBA_ASTC_8x8_KHR:
1175			case FORMAT_SRGB8_ALPHA8_ASTC_8x8_KHR:
1176				return (unsigned char*)buffer + 16 * (x / 8) + (y / 8) * pitchB + z * sliceB;
1177			case FORMAT_RGBA_ASTC_10x5_KHR:
1178			case FORMAT_SRGB8_ALPHA8_ASTC_10x5_KHR:
1179				return (unsigned char*)buffer + 16 * (x / 10) + (y / 5) * pitchB + z * sliceB;
1180			case FORMAT_RGBA_ASTC_10x6_KHR:
1181			case FORMAT_SRGB8_ALPHA8_ASTC_10x6_KHR:
1182				return (unsigned char*)buffer + 16 * (x / 10) + (y / 6) * pitchB + z * sliceB;
1183			case FORMAT_RGBA_ASTC_10x8_KHR:
1184			case FORMAT_SRGB8_ALPHA8_ASTC_10x8_KHR:
1185				return (unsigned char*)buffer + 16 * (x / 10) + (y / 8) * pitchB + z * sliceB;
1186			case FORMAT_RGBA_ASTC_10x10_KHR:
1187			case FORMAT_SRGB8_ALPHA8_ASTC_10x10_KHR:
1188				return (unsigned char*)buffer + 16 * (x / 10) + (y / 10) * pitchB + z * sliceB;
1189			case FORMAT_RGBA_ASTC_12x10_KHR:
1190			case FORMAT_SRGB8_ALPHA8_ASTC_12x10_KHR:
1191				return (unsigned char*)buffer + 16 * (x / 12) + (y / 10) * pitchB + z * sliceB;
1192			case FORMAT_RGBA_ASTC_12x12_KHR:
1193			case FORMAT_SRGB8_ALPHA8_ASTC_12x12_KHR:
1194				return (unsigned char*)buffer + 16 * (x / 12) + (y / 12) * pitchB + z * sliceB;
1195			case FORMAT_DXT3:
1196			case FORMAT_DXT5:
1197			case FORMAT_ATI2:
1198				return (unsigned char*)buffer + 16 * (x / 4) + (y / 4) * pitchB + z * sliceB;
1199			default:
1200				return (unsigned char*)buffer + x * bytes + y * pitchB + z * samples * sliceB;
1201			}
1202		}
1203
1204		return nullptr;
1205	}
1206
1207	void Surface::Buffer::unlockRect()
1208	{
1209		lock = LOCK_UNLOCKED;
1210	}
1211
1212	class SurfaceImplementation : public Surface
1213	{
1214	public:
1215		SurfaceImplementation(int width, int height, int depth, Format format, void *pixels, int pitch, int slice)
1216			: Surface(width, height, depth, format, pixels, pitch, slice) {}
1217		SurfaceImplementation(Resource *texture, int width, int height, int depth, int border, int samples, Format format, bool lockable, bool renderTarget, int pitchP = 0)
1218			: Surface(texture, width, height, depth, border, samples, format, lockable, renderTarget, pitchP) {}
1219		~SurfaceImplementation() override {};
1220
1221		void *lockInternal(int x, int y, int z, Lock lock, Accessor client) override
1222		{
1223			return Surface::lockInternal(x, y, z, lock, client);
1224		}
1225
1226		void unlockInternal() override
1227		{
1228			Surface::unlockInternal();
1229		}
1230	};
1231
1232	Surface *Surface::create(int width, int height, int depth, Format format, void *pixels, int pitch, int slice)
1233	{
1234		return new SurfaceImplementation(width, height, depth, format, pixels, pitch, slice);
1235	}
1236
1237	Surface *Surface::create(Resource *texture, int width, int height, int depth, int border, int samples, Format format, bool lockable, bool renderTarget, int pitchPprovided)
1238	{
1239		return new SurfaceImplementation(texture, width, height, depth, border, samples, format, lockable, renderTarget, pitchPprovided);
1240	}
1241
1242	Surface::Surface(int width, int height, int depth, Format format, void *pixels, int pitch, int slice) : lockable(true), renderTarget(false)
1243	{
1244		resource = new Resource(0);
1245		hasParent = false;
1246		ownExternal = false;
1247		depth = max(1, depth);
1248
1249		external.buffer = pixels;
1250		external.width = width;
1251		external.height = height;
1252		external.depth = depth;
1253		external.samples = 1;
1254		external.format = format;
1255		external.bytes = bytes(external.format);
1256		external.pitchB = pitch;
1257		external.pitchP = external.bytes ? pitch / external.bytes : 0;
1258		external.sliceB = slice;
1259		external.sliceP = external.bytes ? slice / external.bytes : 0;
1260		external.border = 0;
1261		external.lock = LOCK_UNLOCKED;
1262		external.dirty = true;
1263
1264		internal.buffer = nullptr;
1265		internal.width = width;
1266		internal.height = height;
1267		internal.depth = depth;
1268		internal.samples = 1;
1269		internal.format = selectInternalFormat(format);
1270		internal.bytes = bytes(internal.format);
1271		internal.pitchB = pitchB(internal.width, 0, internal.format, false);
1272		internal.pitchP = pitchP(internal.width, 0, internal.format, false);
1273		internal.sliceB = sliceB(internal.width, internal.height, 0, internal.format, false);
1274		internal.sliceP = sliceP(internal.width, internal.height, 0, internal.format, false);
1275		internal.border = 0;
1276		internal.lock = LOCK_UNLOCKED;
1277		internal.dirty = false;
1278
1279		stencil.buffer = nullptr;
1280		stencil.width = width;
1281		stencil.height = height;
1282		stencil.depth = depth;
1283		stencil.samples = 1;
1284		stencil.format = isStencil(format) ? FORMAT_S8 : FORMAT_NULL;
1285		stencil.bytes = bytes(stencil.format);
1286		stencil.pitchB = pitchB(stencil.width, 0, stencil.format, false);
1287		stencil.pitchP = pitchP(stencil.width, 0, stencil.format, false);
1288		stencil.sliceB = sliceB(stencil.width, stencil.height, 0, stencil.format, false);
1289		stencil.sliceP = sliceP(stencil.width, stencil.height, 0, stencil.format, false);
1290		stencil.border = 0;
1291		stencil.lock = LOCK_UNLOCKED;
1292		stencil.dirty = false;
1293
1294		dirtyContents = true;
1295		paletteUsed = 0;
1296	}
1297
1298	Surface::Surface(Resource *texture, int width, int height, int depth, int border, int samples, Format format, bool lockable, bool renderTarget, int pitchPprovided) : lockable(lockable), renderTarget(renderTarget)
1299	{
1300		resource = texture ? texture : new Resource(0);
1301		hasParent = texture != nullptr;
1302		ownExternal = true;
1303		depth = max(1, depth);
1304		samples = max(1, samples);
1305
1306		external.buffer = nullptr;
1307		external.width = width;
1308		external.height = height;
1309		external.depth = depth;
1310		external.samples = (short)samples;
1311		external.format = format;
1312		external.bytes = bytes(external.format);
1313		external.pitchB = pitchB(external.width, 0, external.format, renderTarget && !texture);
1314		external.pitchP = pitchP(external.width, 0, external.format, renderTarget && !texture);
1315		external.sliceB = sliceB(external.width, external.height, 0, external.format, renderTarget && !texture);
1316		external.sliceP = sliceP(external.width, external.height, 0, external.format, renderTarget && !texture);
1317		external.border = 0;
1318		external.lock = LOCK_UNLOCKED;
1319		external.dirty = false;
1320
1321		internal.buffer = nullptr;
1322		internal.width = width;
1323		internal.height = height;
1324		internal.depth = depth;
1325		internal.samples = (short)samples;
1326		internal.format = selectInternalFormat(format);
1327		internal.bytes = bytes(internal.format);
1328		internal.pitchB = !pitchPprovided ? pitchB(internal.width, border, internal.format, renderTarget) : pitchPprovided * internal.bytes;
1329		internal.pitchP = !pitchPprovided ? pitchP(internal.width, border, internal.format, renderTarget) : pitchPprovided;
1330		internal.sliceB = sliceB(internal.width, internal.height, border, internal.format, renderTarget);
1331		internal.sliceP = sliceP(internal.width, internal.height, border, internal.format, renderTarget);
1332		internal.border = (short)border;
1333		internal.lock = LOCK_UNLOCKED;
1334		internal.dirty = false;
1335
1336		stencil.buffer = nullptr;
1337		stencil.width = width;
1338		stencil.height = height;
1339		stencil.depth = depth;
1340		stencil.samples = (short)samples;
1341		stencil.format = isStencil(format) ? FORMAT_S8 : FORMAT_NULL;
1342		stencil.bytes = bytes(stencil.format);
1343		stencil.pitchB = pitchB(stencil.width, 0, stencil.format, renderTarget);
1344		stencil.pitchP = pitchP(stencil.width, 0, stencil.format, renderTarget);
1345		stencil.sliceB = sliceB(stencil.width, stencil.height, 0, stencil.format, renderTarget);
1346		stencil.sliceP = sliceP(stencil.width, stencil.height, 0, stencil.format, renderTarget);
1347		stencil.border = 0;
1348		stencil.lock = LOCK_UNLOCKED;
1349		stencil.dirty = false;
1350
1351		dirtyContents = true;
1352		paletteUsed = 0;
1353	}
1354
1355	Surface::~Surface()
1356	{
1357		// sync() must be called before this destructor to ensure all locks have been released.
1358		// We can't call it here because the parent resource may already have been destroyed.
1359		ASSERT(isUnlocked());
1360
1361		if(!hasParent)
1362		{
1363			resource->destruct();
1364		}
1365
1366		if(ownExternal)
1367		{
1368			deallocate(external.buffer);
1369		}
1370
1371		if(internal.buffer != external.buffer)
1372		{
1373			deallocate(internal.buffer);
1374		}
1375
1376		deallocate(stencil.buffer);
1377
1378		external.buffer = 0;
1379		internal.buffer = 0;
1380		stencil.buffer = 0;
1381	}
1382
1383	void *Surface::lockExternal(int x, int y, int z, Lock lock, Accessor client)
1384	{
1385		resource->lock(client);
1386
1387		if(!external.buffer)
1388		{
1389			if(internal.buffer && identicalFormats())
1390			{
1391				external.buffer = internal.buffer;
1392			}
1393			else
1394			{
1395				external.buffer = allocateBuffer(external.width, external.height, external.depth, external.border, external.samples, external.format);
1396			}
1397		}
1398
1399		if(internal.dirty)
1400		{
1401			if(lock != LOCK_DISCARD)
1402			{
1403				update(external, internal);
1404			}
1405
1406			internal.dirty = false;
1407		}
1408
1409		switch(lock)
1410		{
1411		case LOCK_READONLY:
1412			break;
1413		case LOCK_WRITEONLY:
1414		case LOCK_READWRITE:
1415		case LOCK_DISCARD:
1416			dirtyContents = true;
1417			break;
1418		default:
1419			ASSERT(false);
1420		}
1421
1422		return external.lockRect(x, y, z, lock);
1423	}
1424
1425	void Surface::unlockExternal()
1426	{
1427		external.unlockRect();
1428
1429		resource->unlock();
1430	}
1431
1432	void *Surface::lockInternal(int x, int y, int z, Lock lock, Accessor client)
1433	{
1434		if(lock != LOCK_UNLOCKED)
1435		{
1436			resource->lock(client);
1437		}
1438
1439		if(!internal.buffer)
1440		{
1441			if(external.buffer && identicalFormats())
1442			{
1443				internal.buffer = external.buffer;
1444			}
1445			else
1446			{
1447				internal.buffer = allocateBuffer(internal.width, internal.height, internal.depth, internal.border, internal.samples, internal.format);
1448			}
1449		}
1450
1451		// FIXME: WHQL requires conversion to lower external precision and back
1452		if(logPrecision >= WHQL)
1453		{
1454			if(internal.dirty && renderTarget && internal.format != external.format)
1455			{
1456				if(lock != LOCK_DISCARD)
1457				{
1458					switch(external.format)
1459					{
1460					case FORMAT_R3G3B2:
1461					case FORMAT_A8R3G3B2:
1462					case FORMAT_A1R5G5B5:
1463					case FORMAT_A2R10G10B10:
1464					case FORMAT_A2B10G10R10:
1465						lockExternal(0, 0, 0, LOCK_READWRITE, client);
1466						unlockExternal();
1467						break;
1468					default:
1469						// Difference passes WHQL
1470						break;
1471					}
1472				}
1473			}
1474		}
1475
1476		if(external.dirty || (isPalette(external.format) && paletteUsed != Surface::paletteID))
1477		{
1478			if(lock != LOCK_DISCARD)
1479			{
1480				update(internal, external);
1481			}
1482
1483			external.dirty = false;
1484			paletteUsed = Surface::paletteID;
1485		}
1486
1487		switch(lock)
1488		{
1489		case LOCK_UNLOCKED:
1490		case LOCK_READONLY:
1491			break;
1492		case LOCK_WRITEONLY:
1493		case LOCK_READWRITE:
1494		case LOCK_DISCARD:
1495			dirtyContents = true;
1496			break;
1497		default:
1498			ASSERT(false);
1499		}
1500
1501		if(lock == LOCK_READONLY && client == PUBLIC)
1502		{
1503			resolve();
1504		}
1505
1506		return internal.lockRect(x, y, z, lock);
1507	}
1508
1509	void Surface::unlockInternal()
1510	{
1511		internal.unlockRect();
1512
1513		resource->unlock();
1514	}
1515
1516	void *Surface::lockStencil(int x, int y, int front, Accessor client)
1517	{
1518		if(stencil.format == FORMAT_NULL)
1519		{
1520			return nullptr;
1521		}
1522
1523		resource->lock(client);
1524
1525		if(!stencil.buffer)
1526		{
1527			stencil.buffer = allocateBuffer(stencil.width, stencil.height, stencil.depth, stencil.border, stencil.samples, stencil.format);
1528		}
1529
1530		return stencil.lockRect(x, y, front, LOCK_READWRITE);   // FIXME
1531	}
1532
1533	void Surface::unlockStencil()
1534	{
1535		stencil.unlockRect();
1536
1537		resource->unlock();
1538	}
1539
1540	int Surface::bytes(Format format)
1541	{
1542		switch(format)
1543		{
1544		case FORMAT_NULL:				return 0;
1545		case FORMAT_P8:					return 1;
1546		case FORMAT_A8P8:				return 2;
1547		case FORMAT_A8:					return 1;
1548		case FORMAT_R8I:				return 1;
1549		case FORMAT_R8:					return 1;
1550		case FORMAT_R3G3B2:				return 1;
1551		case FORMAT_R16I:				return 2;
1552		case FORMAT_R16UI:				return 2;
1553		case FORMAT_A8R3G3B2:			return 2;
1554		case FORMAT_R5G6B5:				return 2;
1555		case FORMAT_A1R5G5B5:			return 2;
1556		case FORMAT_X1R5G5B5:			return 2;
1557		case FORMAT_R5G5B5A1:           return 2;
1558		case FORMAT_X4R4G4B4:			return 2;
1559		case FORMAT_A4R4G4B4:			return 2;
1560		case FORMAT_R4G4B4A4:           return 2;
1561		case FORMAT_R8G8B8:				return 3;
1562		case FORMAT_B8G8R8:             return 3;
1563		case FORMAT_R32I:				return 4;
1564		case FORMAT_R32UI:				return 4;
1565		case FORMAT_X8R8G8B8:			return 4;
1566	//	case FORMAT_X8G8R8B8Q:			return 4;
1567		case FORMAT_A8R8G8B8:			return 4;
1568	//	case FORMAT_A8G8R8B8Q:			return 4;
1569		case FORMAT_X8B8G8R8I:			return 4;
1570		case FORMAT_X8B8G8R8:			return 4;
1571		case FORMAT_SRGB8_X8:			return 4;
1572		case FORMAT_SRGB8_A8:			return 4;
1573		case FORMAT_A8B8G8R8I:			return 4;
1574		case FORMAT_R8UI:				return 1;
1575		case FORMAT_G8R8UI:				return 2;
1576		case FORMAT_X8B8G8R8UI:			return 4;
1577		case FORMAT_A8B8G8R8UI:			return 4;
1578		case FORMAT_A8B8G8R8:			return 4;
1579		case FORMAT_R8_SNORM:			return 1;
1580		case FORMAT_G8R8_SNORM:		return 2;
1581		case FORMAT_X8B8G8R8_SNORM:	return 4;
1582		case FORMAT_A8B8G8R8_SNORM:	return 4;
1583		case FORMAT_A2R10G10B10:		return 4;
1584		case FORMAT_A2B10G10R10:		return 4;
1585		case FORMAT_A2B10G10R10UI:		return 4;
1586		case FORMAT_G8R8I:				return 2;
1587		case FORMAT_G8R8:				return 2;
1588		case FORMAT_G16R16I:			return 4;
1589		case FORMAT_G16R16UI:			return 4;
1590		case FORMAT_G16R16:				return 4;
1591		case FORMAT_G32R32I:			return 8;
1592		case FORMAT_G32R32UI:			return 8;
1593		case FORMAT_X16B16G16R16I:		return 8;
1594		case FORMAT_X16B16G16R16UI:		return 8;
1595		case FORMAT_A16B16G16R16I:		return 8;
1596		case FORMAT_A16B16G16R16UI:		return 8;
1597		case FORMAT_A16B16G16R16:		return 8;
1598		case FORMAT_X32B32G32R32I:		return 16;
1599		case FORMAT_X32B32G32R32UI:		return 16;
1600		case FORMAT_A32B32G32R32I:		return 16;
1601		case FORMAT_A32B32G32R32UI:		return 16;
1602		// Compressed formats
1603		case FORMAT_DXT1:				return 2;   // Column of four pixels
1604		case FORMAT_DXT3:				return 4;   // Column of four pixels
1605		case FORMAT_DXT5:				return 4;   // Column of four pixels
1606		case FORMAT_ATI1:				return 2;   // Column of four pixels
1607		case FORMAT_ATI2:				return 4;   // Column of four pixels
1608		case FORMAT_ETC1:				return 2;   // Column of four pixels
1609		case FORMAT_R11_EAC:			return 2;
1610		case FORMAT_SIGNED_R11_EAC:		return 2;
1611		case FORMAT_RG11_EAC:			return 4;
1612		case FORMAT_SIGNED_RG11_EAC:	return 4;
1613		case FORMAT_RGB8_ETC2:			return 2;
1614		case FORMAT_SRGB8_ETC2:			return 2;
1615		case FORMAT_RGB8_PUNCHTHROUGH_ALPHA1_ETC2:	return 2;
1616		case FORMAT_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2:	return 2;
1617		case FORMAT_RGBA8_ETC2_EAC:			return 4;
1618		case FORMAT_SRGB8_ALPHA8_ETC2_EAC:	return 4;
1619		case FORMAT_RGBA_ASTC_4x4_KHR:
1620		case FORMAT_RGBA_ASTC_5x4_KHR:
1621		case FORMAT_RGBA_ASTC_5x5_KHR:
1622		case FORMAT_RGBA_ASTC_6x5_KHR:
1623		case FORMAT_RGBA_ASTC_6x6_KHR:
1624		case FORMAT_RGBA_ASTC_8x5_KHR:
1625		case FORMAT_RGBA_ASTC_8x6_KHR:
1626		case FORMAT_RGBA_ASTC_8x8_KHR:
1627		case FORMAT_RGBA_ASTC_10x5_KHR:
1628		case FORMAT_RGBA_ASTC_10x6_KHR:
1629		case FORMAT_RGBA_ASTC_10x8_KHR:
1630		case FORMAT_RGBA_ASTC_10x10_KHR:
1631		case FORMAT_RGBA_ASTC_12x10_KHR:
1632		case FORMAT_RGBA_ASTC_12x12_KHR:
1633		case FORMAT_SRGB8_ALPHA8_ASTC_4x4_KHR:
1634		case FORMAT_SRGB8_ALPHA8_ASTC_5x4_KHR:
1635		case FORMAT_SRGB8_ALPHA8_ASTC_5x5_KHR:
1636		case FORMAT_SRGB8_ALPHA8_ASTC_6x5_KHR:
1637		case FORMAT_SRGB8_ALPHA8_ASTC_6x6_KHR:
1638		case FORMAT_SRGB8_ALPHA8_ASTC_8x5_KHR:
1639		case FORMAT_SRGB8_ALPHA8_ASTC_8x6_KHR:
1640		case FORMAT_SRGB8_ALPHA8_ASTC_8x8_KHR:
1641		case FORMAT_SRGB8_ALPHA8_ASTC_10x5_KHR:
1642		case FORMAT_SRGB8_ALPHA8_ASTC_10x6_KHR:
1643		case FORMAT_SRGB8_ALPHA8_ASTC_10x8_KHR:
1644		case FORMAT_SRGB8_ALPHA8_ASTC_10x10_KHR:
1645		case FORMAT_SRGB8_ALPHA8_ASTC_12x10_KHR:
1646		case FORMAT_SRGB8_ALPHA8_ASTC_12x12_KHR: return 0; // FIXME
1647		// Bumpmap formats
1648		case FORMAT_V8U8:				return 2;
1649		case FORMAT_L6V5U5:				return 2;
1650		case FORMAT_Q8W8V8U8:			return 4;
1651		case FORMAT_X8L8V8U8:			return 4;
1652		case FORMAT_A2W10V10U10:		return 4;
1653		case FORMAT_V16U16:				return 4;
1654		case FORMAT_A16W16V16U16:		return 8;
1655		case FORMAT_Q16W16V16U16:		return 8;
1656		// Luminance formats
1657		case FORMAT_L8:					return 1;
1658		case FORMAT_A4L4:				return 1;
1659		case FORMAT_L16:				return 2;
1660		case FORMAT_A8L8:				return 2;
1661		case FORMAT_L16F:               return 2;
1662		case FORMAT_A16L16F:            return 4;
1663		case FORMAT_L32F:               return 4;
1664		case FORMAT_A32L32F:            return 8;
1665		// Floating-point formats
1666		case FORMAT_A16F:				return 2;
1667		case FORMAT_R16F:				return 2;
1668		case FORMAT_G16R16F:			return 4;
1669		case FORMAT_B16G16R16F:			return 6;
1670		case FORMAT_X16B16G16R16F:		return 8;
1671		case FORMAT_A16B16G16R16F:		return 8;
1672		case FORMAT_X16B16G16R16F_UNSIGNED: return 8;
1673		case FORMAT_A32F:				return 4;
1674		case FORMAT_R32F:				return 4;
1675		case FORMAT_G32R32F:			return 8;
1676		case FORMAT_B32G32R32F:			return 12;
1677		case FORMAT_X32B32G32R32F:		return 16;
1678		case FORMAT_A32B32G32R32F:		return 16;
1679		case FORMAT_X32B32G32R32F_UNSIGNED: return 16;
1680		// Depth/stencil formats
1681		case FORMAT_D16:				return 2;
1682		case FORMAT_D32:				return 4;
1683		case FORMAT_D24X8:				return 4;
1684		case FORMAT_D24S8:				return 4;
1685		case FORMAT_D24FS8:				return 4;
1686		case FORMAT_D32F:				return 4;
1687		case FORMAT_D32FS8:				return 4;
1688		case FORMAT_D32F_COMPLEMENTARY:	return 4;
1689		case FORMAT_D32FS8_COMPLEMENTARY: return 4;
1690		case FORMAT_D32F_LOCKABLE:		return 4;
1691		case FORMAT_D32FS8_TEXTURE:		return 4;
1692		case FORMAT_D32F_SHADOW:		return 4;
1693		case FORMAT_D32FS8_SHADOW:		return 4;
1694		case FORMAT_DF24S8:				return 4;
1695		case FORMAT_DF16S8:				return 2;
1696		case FORMAT_INTZ:				return 4;
1697		case FORMAT_S8:					return 1;
1698		case FORMAT_YV12_BT601:         return 1;   // Y plane only
1699		case FORMAT_YV12_BT709:         return 1;   // Y plane only
1700		case FORMAT_YV12_JFIF:          return 1;   // Y plane only
1701		default:
1702			ASSERT(false);
1703		}
1704
1705		return 0;
1706	}
1707
1708	int Surface::pitchB(int width, int border, Format format, bool target)
1709	{
1710		width += 2 * border;
1711
1712		if(target || isDepth(format) || isStencil(format))
1713		{
1714			width = align(width, 2);
1715		}
1716
1717		switch(format)
1718		{
1719		case FORMAT_DXT1:
1720		case FORMAT_ETC1:
1721		case FORMAT_R11_EAC:
1722		case FORMAT_SIGNED_R11_EAC:
1723		case FORMAT_RGB8_ETC2:
1724		case FORMAT_SRGB8_ETC2:
1725		case FORMAT_RGB8_PUNCHTHROUGH_ALPHA1_ETC2:
1726		case FORMAT_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2:
1727			return 8 * ((width + 3) / 4);    // 64 bit per 4x4 block, computed per 4 rows
1728		case FORMAT_RG11_EAC:
1729		case FORMAT_SIGNED_RG11_EAC:
1730		case FORMAT_RGBA8_ETC2_EAC:
1731		case FORMAT_SRGB8_ALPHA8_ETC2_EAC:
1732		case FORMAT_RGBA_ASTC_4x4_KHR:
1733		case FORMAT_SRGB8_ALPHA8_ASTC_4x4_KHR:
1734			return 16 * ((width + 3) / 4);    // 128 bit per 4x4 block, computed per 4 rows
1735		case FORMAT_RGBA_ASTC_5x4_KHR:
1736		case FORMAT_SRGB8_ALPHA8_ASTC_5x4_KHR:
1737		case FORMAT_RGBA_ASTC_5x5_KHR:
1738		case FORMAT_SRGB8_ALPHA8_ASTC_5x5_KHR:
1739			return 16 * ((width + 4) / 5);
1740		case FORMAT_RGBA_ASTC_6x5_KHR:
1741		case FORMAT_SRGB8_ALPHA8_ASTC_6x5_KHR:
1742		case FORMAT_RGBA_ASTC_6x6_KHR:
1743		case FORMAT_SRGB8_ALPHA8_ASTC_6x6_KHR:
1744			return 16 * ((width + 5) / 6);
1745		case FORMAT_RGBA_ASTC_8x5_KHR:
1746		case FORMAT_SRGB8_ALPHA8_ASTC_8x5_KHR:
1747		case FORMAT_RGBA_ASTC_8x6_KHR:
1748		case FORMAT_SRGB8_ALPHA8_ASTC_8x6_KHR:
1749		case FORMAT_RGBA_ASTC_8x8_KHR:
1750		case FORMAT_SRGB8_ALPHA8_ASTC_8x8_KHR:
1751			return 16 * ((width + 7) / 8);
1752		case FORMAT_RGBA_ASTC_10x5_KHR:
1753		case FORMAT_SRGB8_ALPHA8_ASTC_10x5_KHR:
1754		case FORMAT_RGBA_ASTC_10x6_KHR:
1755		case FORMAT_SRGB8_ALPHA8_ASTC_10x6_KHR:
1756		case FORMAT_RGBA_ASTC_10x8_KHR:
1757		case FORMAT_SRGB8_ALPHA8_ASTC_10x8_KHR:
1758		case FORMAT_RGBA_ASTC_10x10_KHR:
1759		case FORMAT_SRGB8_ALPHA8_ASTC_10x10_KHR:
1760			return 16 * ((width + 9) / 10);
1761		case FORMAT_RGBA_ASTC_12x10_KHR:
1762		case FORMAT_SRGB8_ALPHA8_ASTC_12x10_KHR:
1763		case FORMAT_RGBA_ASTC_12x12_KHR:
1764		case FORMAT_SRGB8_ALPHA8_ASTC_12x12_KHR:
1765			return 16 * ((width + 11) / 12);
1766		case FORMAT_DXT3:
1767		case FORMAT_DXT5:
1768			return 16 * ((width + 3) / 4);   // 128 bit per 4x4 block, computed per 4 rows
1769		case FORMAT_ATI1:
1770			return 2 * ((width + 3) / 4);    // 64 bit per 4x4 block, computed per row
1771		case FORMAT_ATI2:
1772			return 4 * ((width + 3) / 4);    // 128 bit per 4x4 block, computed per row
1773		case FORMAT_YV12_BT601:
1774		case FORMAT_YV12_BT709:
1775		case FORMAT_YV12_JFIF:
1776			return align(width, 16);
1777		default:
1778			return bytes(format) * width;
1779		}
1780	}
1781
1782	int Surface::pitchP(int width, int border, Format format, bool target)
1783	{
1784		int B = bytes(format);
1785
1786		return B > 0 ? pitchB(width, border, format, target) / B : 0;
1787	}
1788
1789	int Surface::sliceB(int width, int height, int border, Format format, bool target)
1790	{
1791		height += 2 * border;
1792
1793		if(target || isDepth(format) || isStencil(format))
1794		{
1795			height = ((height + 1) & ~1);
1796		}
1797
1798		switch(format)
1799		{
1800		case FORMAT_DXT1:
1801		case FORMAT_DXT3:
1802		case FORMAT_DXT5:
1803		case FORMAT_ETC1:
1804		case FORMAT_R11_EAC:
1805		case FORMAT_SIGNED_R11_EAC:
1806		case FORMAT_RG11_EAC:
1807		case FORMAT_SIGNED_RG11_EAC:
1808		case FORMAT_RGB8_ETC2:
1809		case FORMAT_SRGB8_ETC2:
1810		case FORMAT_RGB8_PUNCHTHROUGH_ALPHA1_ETC2:
1811		case FORMAT_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2:
1812		case FORMAT_RGBA8_ETC2_EAC:
1813		case FORMAT_SRGB8_ALPHA8_ETC2_EAC:
1814		case FORMAT_RGBA_ASTC_4x4_KHR:
1815		case FORMAT_SRGB8_ALPHA8_ASTC_4x4_KHR:
1816		case FORMAT_RGBA_ASTC_5x4_KHR:
1817		case FORMAT_SRGB8_ALPHA8_ASTC_5x4_KHR:
1818			return pitchB(width, border, format, target) * ((height + 3) / 4);   // Pitch computed per 4 rows
1819		case FORMAT_RGBA_ASTC_5x5_KHR:
1820		case FORMAT_SRGB8_ALPHA8_ASTC_5x5_KHR:
1821		case FORMAT_RGBA_ASTC_6x5_KHR:
1822		case FORMAT_SRGB8_ALPHA8_ASTC_6x5_KHR:
1823		case FORMAT_RGBA_ASTC_8x5_KHR:
1824		case FORMAT_SRGB8_ALPHA8_ASTC_8x5_KHR:
1825		case FORMAT_RGBA_ASTC_10x5_KHR:
1826		case FORMAT_SRGB8_ALPHA8_ASTC_10x5_KHR:
1827			return pitchB(width, border, format, target) * ((height + 4) / 5);   // Pitch computed per 5 rows
1828		case FORMAT_RGBA_ASTC_6x6_KHR:
1829		case FORMAT_SRGB8_ALPHA8_ASTC_6x6_KHR:
1830		case FORMAT_RGBA_ASTC_8x6_KHR:
1831		case FORMAT_SRGB8_ALPHA8_ASTC_8x6_KHR:
1832		case FORMAT_RGBA_ASTC_10x6_KHR:
1833		case FORMAT_SRGB8_ALPHA8_ASTC_10x6_KHR:
1834			return pitchB(width, border, format, target) * ((height + 5) / 6);   // Pitch computed per 6 rows
1835		case FORMAT_RGBA_ASTC_8x8_KHR:
1836		case FORMAT_SRGB8_ALPHA8_ASTC_8x8_KHR:
1837		case FORMAT_RGBA_ASTC_10x8_KHR:
1838		case FORMAT_SRGB8_ALPHA8_ASTC_10x8_KHR:
1839			return pitchB(width, border, format, target) * ((height + 7) / 8);   // Pitch computed per 8 rows
1840		case FORMAT_RGBA_ASTC_10x10_KHR:
1841		case FORMAT_SRGB8_ALPHA8_ASTC_10x10_KHR:
1842		case FORMAT_RGBA_ASTC_12x10_KHR:
1843		case FORMAT_SRGB8_ALPHA8_ASTC_12x10_KHR:
1844			return pitchB(width, border, format, target) * ((height + 9) / 10);   // Pitch computed per 10 rows
1845		case FORMAT_RGBA_ASTC_12x12_KHR:
1846		case FORMAT_SRGB8_ALPHA8_ASTC_12x12_KHR:
1847			return pitchB(width, border, format, target) * ((height + 11) / 12);   // Pitch computed per 12 rows
1848		case FORMAT_ATI1:
1849		case FORMAT_ATI2:
1850		default:
1851			return pitchB(width, border, format, target) * height;   // Pitch computed per row
1852		}
1853	}
1854
1855	int Surface::sliceP(int width, int height, int border, Format format, bool target)
1856	{
1857		int B = bytes(format);
1858
1859		return B > 0 ? sliceB(width, height, border, format, target) / B : 0;
1860	}
1861
1862	void Surface::update(Buffer &destination, Buffer &source)
1863	{
1864	//	ASSERT(source.lock != LOCK_UNLOCKED);
1865	//	ASSERT(destination.lock != LOCK_UNLOCKED);
1866
1867		if(destination.buffer != source.buffer)
1868		{
1869			ASSERT(source.dirty && !destination.dirty);
1870
1871			switch(source.format)
1872			{
1873			case FORMAT_R8G8B8:		decodeR8G8B8(destination, source);		break;   // FIXME: Check destination format
1874			case FORMAT_X1R5G5B5:	decodeX1R5G5B5(destination, source);	break;   // FIXME: Check destination format
1875			case FORMAT_A1R5G5B5:	decodeA1R5G5B5(destination, source);	break;   // FIXME: Check destination format
1876			case FORMAT_X4R4G4B4:	decodeX4R4G4B4(destination, source);	break;   // FIXME: Check destination format
1877			case FORMAT_A4R4G4B4:	decodeA4R4G4B4(destination, source);	break;   // FIXME: Check destination format
1878			case FORMAT_P8:			decodeP8(destination, source);			break;   // FIXME: Check destination format
1879			case FORMAT_DXT1:		decodeDXT1(destination, source);		break;   // FIXME: Check destination format
1880			case FORMAT_DXT3:		decodeDXT3(destination, source);		break;   // FIXME: Check destination format
1881			case FORMAT_DXT5:		decodeDXT5(destination, source);		break;   // FIXME: Check destination format
1882			case FORMAT_ATI1:		decodeATI1(destination, source);		break;   // FIXME: Check destination format
1883			case FORMAT_ATI2:		decodeATI2(destination, source);		break;   // FIXME: Check destination format
1884			case FORMAT_R11_EAC:         decodeEAC(destination, source, 1, false); break; // FIXME: Check destination format
1885			case FORMAT_SIGNED_R11_EAC:  decodeEAC(destination, source, 1, true);  break; // FIXME: Check destination format
1886			case FORMAT_RG11_EAC:        decodeEAC(destination, source, 2, false); break; // FIXME: Check destination format
1887			case FORMAT_SIGNED_RG11_EAC: decodeEAC(destination, source, 2, true);  break; // FIXME: Check destination format
1888			case FORMAT_ETC1:
1889			case FORMAT_RGB8_ETC2:                      decodeETC2(destination, source, 0, false); break; // FIXME: Check destination format
1890			case FORMAT_SRGB8_ETC2:                     decodeETC2(destination, source, 0, true);  break; // FIXME: Check destination format
1891			case FORMAT_RGB8_PUNCHTHROUGH_ALPHA1_ETC2:  decodeETC2(destination, source, 1, false); break; // FIXME: Check destination format
1892			case FORMAT_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2: decodeETC2(destination, source, 1, true);  break; // FIXME: Check destination format
1893			case FORMAT_RGBA8_ETC2_EAC:                 decodeETC2(destination, source, 8, false); break; // FIXME: Check destination format
1894			case FORMAT_SRGB8_ALPHA8_ETC2_EAC:          decodeETC2(destination, source, 8, true);  break; // FIXME: Check destination format
1895			case FORMAT_RGBA_ASTC_4x4_KHR:           decodeASTC(destination, source, 4,  4,  1, false); break; // FIXME: Check destination format
1896			case FORMAT_RGBA_ASTC_5x4_KHR:           decodeASTC(destination, source, 5,  4,  1, false); break; // FIXME: Check destination format
1897			case FORMAT_RGBA_ASTC_5x5_KHR:           decodeASTC(destination, source, 5,  5,  1, false); break; // FIXME: Check destination format
1898			case FORMAT_RGBA_ASTC_6x5_KHR:           decodeASTC(destination, source, 6,  5,  1, false); break; // FIXME: Check destination format
1899			case FORMAT_RGBA_ASTC_6x6_KHR:           decodeASTC(destination, source, 6,  6,  1, false); break; // FIXME: Check destination format
1900			case FORMAT_RGBA_ASTC_8x5_KHR:           decodeASTC(destination, source, 8,  5,  1, false); break; // FIXME: Check destination format
1901			case FORMAT_RGBA_ASTC_8x6_KHR:           decodeASTC(destination, source, 8,  6,  1, false); break; // FIXME: Check destination format
1902			case FORMAT_RGBA_ASTC_8x8_KHR:           decodeASTC(destination, source, 8,  8,  1, false); break; // FIXME: Check destination format
1903			case FORMAT_RGBA_ASTC_10x5_KHR:          decodeASTC(destination, source, 10, 5,  1, false); break; // FIXME: Check destination format
1904			case FORMAT_RGBA_ASTC_10x6_KHR:          decodeASTC(destination, source, 10, 6,  1, false); break; // FIXME: Check destination format
1905			case FORMAT_RGBA_ASTC_10x8_KHR:          decodeASTC(destination, source, 10, 8,  1, false); break; // FIXME: Check destination format
1906			case FORMAT_RGBA_ASTC_10x10_KHR:         decodeASTC(destination, source, 10, 10, 1, false); break; // FIXME: Check destination format
1907			case FORMAT_RGBA_ASTC_12x10_KHR:         decodeASTC(destination, source, 12, 10, 1, false); break; // FIXME: Check destination format
1908			case FORMAT_RGBA_ASTC_12x12_KHR:         decodeASTC(destination, source, 12, 12, 1, false); break; // FIXME: Check destination format
1909			case FORMAT_SRGB8_ALPHA8_ASTC_4x4_KHR:   decodeASTC(destination, source, 4,  4,  1, true);  break; // FIXME: Check destination format
1910			case FORMAT_SRGB8_ALPHA8_ASTC_5x4_KHR:   decodeASTC(destination, source, 5,  4,  1, true);  break; // FIXME: Check destination format
1911			case FORMAT_SRGB8_ALPHA8_ASTC_5x5_KHR:   decodeASTC(destination, source, 5,  5,  1, true);  break; // FIXME: Check destination format
1912			case FORMAT_SRGB8_ALPHA8_ASTC_6x5_KHR:   decodeASTC(destination, source, 6,  5,  1, true);  break; // FIXME: Check destination format
1913			case FORMAT_SRGB8_ALPHA8_ASTC_6x6_KHR:   decodeASTC(destination, source, 6,  6,  1, true);  break; // FIXME: Check destination format
1914			case FORMAT_SRGB8_ALPHA8_ASTC_8x5_KHR:   decodeASTC(destination, source, 8,  5,  1, true);  break; // FIXME: Check destination format
1915			case FORMAT_SRGB8_ALPHA8_ASTC_8x6_KHR:   decodeASTC(destination, source, 8,  6,  1, true);  break; // FIXME: Check destination format
1916			case FORMAT_SRGB8_ALPHA8_ASTC_8x8_KHR:   decodeASTC(destination, source, 8,  8,  1, true);  break; // FIXME: Check destination format
1917			case FORMAT_SRGB8_ALPHA8_ASTC_10x5_KHR:  decodeASTC(destination, source, 10, 5,  1, true);  break; // FIXME: Check destination format
1918			case FORMAT_SRGB8_ALPHA8_ASTC_10x6_KHR:  decodeASTC(destination, source, 10, 6,  1, true);  break; // FIXME: Check destination format
1919			case FORMAT_SRGB8_ALPHA8_ASTC_10x8_KHR:  decodeASTC(destination, source, 10, 8,  1, true);  break; // FIXME: Check destination format
1920			case FORMAT_SRGB8_ALPHA8_ASTC_10x10_KHR: decodeASTC(destination, source, 10, 10, 1, true);  break; // FIXME: Check destination format
1921			case FORMAT_SRGB8_ALPHA8_ASTC_12x10_KHR: decodeASTC(destination, source, 12, 10, 1, true);  break; // FIXME: Check destination format
1922			case FORMAT_SRGB8_ALPHA8_ASTC_12x12_KHR: decodeASTC(destination, source, 12, 12, 1, true);  break; // FIXME: Check destination format
1923			default:				genericUpdate(destination, source);		break;
1924			}
1925		}
1926	}
1927
1928	void Surface::genericUpdate(Buffer &destination, Buffer &source)
1929	{
1930		unsigned char *sourceSlice = (unsigned char*)source.lockRect(0, 0, 0, sw::LOCK_READONLY);
1931		unsigned char *destinationSlice = (unsigned char*)destination.lockRect(0, 0, 0, sw::LOCK_UPDATE);
1932
1933		int depth = min(destination.depth, source.depth);
1934		int height = min(destination.height, source.height);
1935		int width = min(destination.width, source.width);
1936		int rowBytes = width * source.bytes;
1937
1938		for(int z = 0; z < depth; z++)
1939		{
1940			unsigned char *sourceRow = sourceSlice;
1941			unsigned char *destinationRow = destinationSlice;
1942
1943			for(int y = 0; y < height; y++)
1944			{
1945				if(source.format == destination.format)
1946				{
1947					memcpy(destinationRow, sourceRow, rowBytes);
1948				}
1949				else
1950				{
1951					unsigned char *sourceElement = sourceRow;
1952					unsigned char *destinationElement = destinationRow;
1953
1954					for(int x = 0; x < width; x++)
1955					{
1956						Color<float> color = source.read(sourceElement);
1957						destination.write(destinationElement, color);
1958
1959						sourceElement += source.bytes;
1960						destinationElement += destination.bytes;
1961					}
1962				}
1963
1964				sourceRow += source.pitchB;
1965				destinationRow += destination.pitchB;
1966			}
1967
1968			sourceSlice += source.sliceB;
1969			destinationSlice += destination.sliceB;
1970		}
1971
1972		source.unlockRect();
1973		destination.unlockRect();
1974	}
1975
1976	void Surface::decodeR8G8B8(Buffer &destination, Buffer &source)
1977	{
1978		unsigned char *sourceSlice = (unsigned char*)source.lockRect(0, 0, 0, sw::LOCK_READONLY);
1979		unsigned char *destinationSlice = (unsigned char*)destination.lockRect(0, 0, 0, sw::LOCK_UPDATE);
1980
1981		int depth = min(destination.depth, source.depth);
1982		int height = min(destination.height, source.height);
1983		int width = min(destination.width, source.width);
1984
1985		for(int z = 0; z < depth; z++)
1986		{
1987			unsigned char *sourceRow = sourceSlice;
1988			unsigned char *destinationRow = destinationSlice;
1989
1990			for(int y = 0; y < height; y++)
1991			{
1992				unsigned char *sourceElement = sourceRow;
1993				unsigned char *destinationElement = destinationRow;
1994
1995				for(int x = 0; x < width; x++)
1996				{
1997					unsigned int b = sourceElement[0];
1998					unsigned int g = sourceElement[1];
1999					unsigned int r = sourceElement[2];
2000
2001					*(unsigned int*)destinationElement = 0xFF000000 | (r << 16) | (g << 8) | (b << 0);
2002
2003					sourceElement += source.bytes;
2004					destinationElement += destination.bytes;
2005				}
2006
2007				sourceRow += source.pitchB;
2008				destinationRow += destination.pitchB;
2009			}
2010
2011			sourceSlice += source.sliceB;
2012			destinationSlice += destination.sliceB;
2013		}
2014
2015		source.unlockRect();
2016		destination.unlockRect();
2017	}
2018
2019	void Surface::decodeX1R5G5B5(Buffer &destination, Buffer &source)
2020	{
2021		unsigned char *sourceSlice = (unsigned char*)source.lockRect(0, 0, 0, sw::LOCK_READONLY);
2022		unsigned char *destinationSlice = (unsigned char*)destination.lockRect(0, 0, 0, sw::LOCK_UPDATE);
2023
2024		int depth = min(destination.depth, source.depth);
2025		int height = min(destination.height, source.height);
2026		int width = min(destination.width, source.width);
2027
2028		for(int z = 0; z < depth; z++)
2029		{
2030			unsigned char *sourceRow = sourceSlice;
2031			unsigned char *destinationRow = destinationSlice;
2032
2033			for(int y = 0; y < height; y++)
2034			{
2035				unsigned char *sourceElement = sourceRow;
2036				unsigned char *destinationElement = destinationRow;
2037
2038				for(int x = 0; x < width; x++)
2039				{
2040					unsigned int xrgb = *(unsigned short*)sourceElement;
2041
2042					unsigned int r = (((xrgb & 0x7C00) * 134771 + 0x800000) >> 8) & 0x00FF0000;
2043					unsigned int g = (((xrgb & 0x03E0) * 16846 + 0x8000) >> 8) & 0x0000FF00;
2044					unsigned int b = (((xrgb & 0x001F) * 2106  + 0x80) >> 8);
2045
2046					*(unsigned int*)destinationElement = 0xFF000000 | r | g | b;
2047
2048					sourceElement += source.bytes;
2049					destinationElement += destination.bytes;
2050				}
2051
2052				sourceRow += source.pitchB;
2053				destinationRow += destination.pitchB;
2054			}
2055
2056			sourceSlice += source.sliceB;
2057			destinationSlice += destination.sliceB;
2058		}
2059
2060		source.unlockRect();
2061		destination.unlockRect();
2062	}
2063
2064	void Surface::decodeA1R5G5B5(Buffer &destination, Buffer &source)
2065	{
2066		unsigned char *sourceSlice = (unsigned char*)source.lockRect(0, 0, 0, sw::LOCK_READONLY);
2067		unsigned char *destinationSlice = (unsigned char*)destination.lockRect(0, 0, 0, sw::LOCK_UPDATE);
2068
2069		int depth = min(destination.depth, source.depth);
2070		int height = min(destination.height, source.height);
2071		int width = min(destination.width, source.width);
2072
2073		for(int z = 0; z < depth; z++)
2074		{
2075			unsigned char *sourceRow = sourceSlice;
2076			unsigned char *destinationRow = destinationSlice;
2077
2078			for(int y = 0; y < height; y++)
2079			{
2080				unsigned char *sourceElement = sourceRow;
2081				unsigned char *destinationElement = destinationRow;
2082
2083				for(int x = 0; x < width; x++)
2084				{
2085					unsigned int argb = *(unsigned short*)sourceElement;
2086
2087					unsigned int a =   (argb & 0x8000) * 130560;
2088					unsigned int r = (((argb & 0x7C00) * 134771 + 0x800000) >> 8) & 0x00FF0000;
2089					unsigned int g = (((argb & 0x03E0) * 16846  + 0x8000) >> 8) & 0x0000FF00;
2090					unsigned int b = (((argb & 0x001F) * 2106   + 0x80) >> 8);
2091
2092					*(unsigned int*)destinationElement = a | r | g | b;
2093
2094					sourceElement += source.bytes;
2095					destinationElement += destination.bytes;
2096				}
2097
2098				sourceRow += source.pitchB;
2099				destinationRow += destination.pitchB;
2100			}
2101
2102			sourceSlice += source.sliceB;
2103			destinationSlice += destination.sliceB;
2104		}
2105
2106		source.unlockRect();
2107		destination.unlockRect();
2108	}
2109
2110	void Surface::decodeX4R4G4B4(Buffer &destination, Buffer &source)
2111	{
2112		unsigned char *sourceSlice = (unsigned char*)source.lockRect(0, 0, 0, sw::LOCK_READONLY);
2113		unsigned char *destinationSlice = (unsigned char*)destination.lockRect(0, 0, 0, sw::LOCK_UPDATE);
2114
2115		int depth = min(destination.depth, source.depth);
2116		int height = min(destination.height, source.height);
2117		int width = min(destination.width, source.width);
2118
2119		for(int z = 0; z < depth; z++)
2120		{
2121			unsigned char *sourceRow = sourceSlice;
2122			unsigned char *destinationRow = destinationSlice;
2123
2124			for(int y = 0; y < height; y++)
2125			{
2126				unsigned char *sourceElement = sourceRow;
2127				unsigned char *destinationElement = destinationRow;
2128
2129				for(int x = 0; x < width; x++)
2130				{
2131					unsigned int xrgb = *(unsigned short*)sourceElement;
2132
2133					unsigned int r = ((xrgb & 0x0F00) * 0x00001100) & 0x00FF0000;
2134					unsigned int g = ((xrgb & 0x00F0) * 0x00000110) & 0x0000FF00;
2135					unsigned int b =  (xrgb & 0x000F) * 0x00000011;
2136
2137					*(unsigned int*)destinationElement = 0xFF000000 | r | g | b;
2138
2139					sourceElement += source.bytes;
2140					destinationElement += destination.bytes;
2141				}
2142
2143				sourceRow += source.pitchB;
2144				destinationRow += destination.pitchB;
2145			}
2146
2147			sourceSlice += source.sliceB;
2148			destinationSlice += destination.sliceB;
2149		}
2150
2151		source.unlockRect();
2152		destination.unlockRect();
2153	}
2154
2155	void Surface::decodeA4R4G4B4(Buffer &destination, Buffer &source)
2156	{
2157		unsigned char *sourceSlice = (unsigned char*)source.lockRect(0, 0, 0, sw::LOCK_READONLY);
2158		unsigned char *destinationSlice = (unsigned char*)destination.lockRect(0, 0, 0, sw::LOCK_UPDATE);
2159
2160		int depth = min(destination.depth, source.depth);
2161		int height = min(destination.height, source.height);
2162		int width = min(destination.width, source.width);
2163
2164		for(int z = 0; z < depth; z++)
2165		{
2166			unsigned char *sourceRow = sourceSlice;
2167			unsigned char *destinationRow = destinationSlice;
2168
2169			for(int y = 0; y < height; y++)
2170			{
2171				unsigned char *sourceElement = sourceRow;
2172				unsigned char *destinationElement = destinationRow;
2173
2174				for(int x = 0; x < width; x++)
2175				{
2176					unsigned int argb = *(unsigned short*)sourceElement;
2177
2178					unsigned int a = ((argb & 0xF000) * 0x00011000) & 0xFF000000;
2179					unsigned int r = ((argb & 0x0F00) * 0x00001100) & 0x00FF0000;
2180					unsigned int g = ((argb & 0x00F0) * 0x00000110) & 0x0000FF00;
2181					unsigned int b =  (argb & 0x000F) * 0x00000011;
2182
2183					*(unsigned int*)destinationElement = a | r | g | b;
2184
2185					sourceElement += source.bytes;
2186					destinationElement += destination.bytes;
2187				}
2188
2189				sourceRow += source.pitchB;
2190				destinationRow += destination.pitchB;
2191			}
2192
2193			sourceSlice += source.sliceB;
2194			destinationSlice += destination.sliceB;
2195		}
2196
2197		source.unlockRect();
2198		destination.unlockRect();
2199	}
2200
2201	void Surface::decodeP8(Buffer &destination, Buffer &source)
2202	{
2203		unsigned char *sourceSlice = (unsigned char*)source.lockRect(0, 0, 0, sw::LOCK_READONLY);
2204		unsigned char *destinationSlice = (unsigned char*)destination.lockRect(0, 0, 0, sw::LOCK_UPDATE);
2205
2206		int depth = min(destination.depth, source.depth);
2207		int height = min(destination.height, source.height);
2208		int width = min(destination.width, source.width);
2209
2210		for(int z = 0; z < depth; z++)
2211		{
2212			unsigned char *sourceRow = sourceSlice;
2213			unsigned char *destinationRow = destinationSlice;
2214
2215			for(int y = 0; y < height; y++)
2216			{
2217				unsigned char *sourceElement = sourceRow;
2218				unsigned char *destinationElement = destinationRow;
2219
2220				for(int x = 0; x < width; x++)
2221				{
2222					unsigned int abgr = palette[*(unsigned char*)sourceElement];
2223
2224					unsigned int r = (abgr & 0x000000FF) << 16;
2225					unsigned int g = (abgr & 0x0000FF00) << 0;
2226					unsigned int b = (abgr & 0x00FF0000) >> 16;
2227					unsigned int a = (abgr & 0xFF000000) >> 0;
2228
2229					*(unsigned int*)destinationElement = a | r | g | b;
2230
2231					sourceElement += source.bytes;
2232					destinationElement += destination.bytes;
2233				}
2234
2235				sourceRow += source.pitchB;
2236				destinationRow += destination.pitchB;
2237			}
2238
2239			sourceSlice += source.sliceB;
2240			destinationSlice += destination.sliceB;
2241		}
2242
2243		source.unlockRect();
2244		destination.unlockRect();
2245	}
2246
2247	void Surface::decodeDXT1(Buffer &internal, Buffer &external)
2248	{
2249		unsigned int *destSlice = (unsigned int*)internal.lockRect(0, 0, 0, LOCK_UPDATE);
2250		const DXT1 *source = (const DXT1*)external.lockRect(0, 0, 0, LOCK_READONLY);
2251
2252		for(int z = 0; z < external.depth; z++)
2253		{
2254			unsigned int *dest = destSlice;
2255
2256			for(int y = 0; y < external.height; y += 4)
2257			{
2258				for(int x = 0; x < external.width; x += 4)
2259				{
2260					Color<byte> c[4];
2261
2262					c[0] = source->c0;
2263					c[1] = source->c1;
2264
2265					if(source->c0 > source->c1)   // No transparency
2266					{
2267						// c2 = 2 / 3 * c0 + 1 / 3 * c1
2268						c[2].r = (byte)((2 * (word)c[0].r + (word)c[1].r + 1) / 3);
2269						c[2].g = (byte)((2 * (word)c[0].g + (word)c[1].g + 1) / 3);
2270						c[2].b = (byte)((2 * (word)c[0].b + (word)c[1].b + 1) / 3);
2271						c[2].a = 0xFF;
2272
2273						// c3 = 1 / 3 * c0 + 2 / 3 * c1
2274						c[3].r = (byte)(((word)c[0].r + 2 * (word)c[1].r + 1) / 3);
2275						c[3].g = (byte)(((word)c[0].g + 2 * (word)c[1].g + 1) / 3);
2276						c[3].b = (byte)(((word)c[0].b + 2 * (word)c[1].b + 1) / 3);
2277						c[3].a = 0xFF;
2278					}
2279					else   // c3 transparent
2280					{
2281						// c2 = 1 / 2 * c0 + 1 / 2 * c1
2282						c[2].r = (byte)(((word)c[0].r + (word)c[1].r) / 2);
2283						c[2].g = (byte)(((word)c[0].g + (word)c[1].g) / 2);
2284						c[2].b = (byte)(((word)c[0].b + (word)c[1].b) / 2);
2285						c[2].a = 0xFF;
2286
2287						c[3].r = 0;
2288						c[3].g = 0;
2289						c[3].b = 0;
2290						c[3].a = 0;
2291					}
2292
2293					for(int j = 0; j < 4 && (y + j) < internal.height; j++)
2294					{
2295						for(int i = 0; i < 4 && (x + i) < internal.width; i++)
2296						{
2297							dest[(x + i) + (y + j) * internal.width] = c[(unsigned int)(source->lut >> 2 * (i + j * 4)) % 4];
2298						}
2299					}
2300
2301					source++;
2302				}
2303			}
2304
2305			(byte*&)destSlice += internal.sliceB;
2306		}
2307
2308		external.unlockRect();
2309		internal.unlockRect();
2310	}
2311
2312	void Surface::decodeDXT3(Buffer &internal, Buffer &external)
2313	{
2314		unsigned int *destSlice = (unsigned int*)internal.lockRect(0, 0, 0, LOCK_UPDATE);
2315		const DXT3 *source = (const DXT3*)external.lockRect(0, 0, 0, LOCK_READONLY);
2316
2317		for(int z = 0; z < external.depth; z++)
2318		{
2319			unsigned int *dest = destSlice;
2320
2321			for(int y = 0; y < external.height; y += 4)
2322			{
2323				for(int x = 0; x < external.width; x += 4)
2324				{
2325					Color<byte> c[4];
2326
2327					c[0] = source->c0;
2328					c[1] = source->c1;
2329
2330					// c2 = 2 / 3 * c0 + 1 / 3 * c1
2331					c[2].r = (byte)((2 * (word)c[0].r + (word)c[1].r + 1) / 3);
2332					c[2].g = (byte)((2 * (word)c[0].g + (word)c[1].g + 1) / 3);
2333					c[2].b = (byte)((2 * (word)c[0].b + (word)c[1].b + 1) / 3);
2334
2335					// c3 = 1 / 3 * c0 + 2 / 3 * c1
2336					c[3].r = (byte)(((word)c[0].r + 2 * (word)c[1].r + 1) / 3);
2337					c[3].g = (byte)(((word)c[0].g + 2 * (word)c[1].g + 1) / 3);
2338					c[3].b = (byte)(((word)c[0].b + 2 * (word)c[1].b + 1) / 3);
2339
2340					for(int j = 0; j < 4 && (y + j) < internal.height; j++)
2341					{
2342						for(int i = 0; i < 4 && (x + i) < internal.width; i++)
2343						{
2344							unsigned int a = (unsigned int)(source->a >> 4 * (i + j * 4)) & 0x0F;
2345							unsigned int color = (c[(unsigned int)(source->lut >> 2 * (i + j * 4)) % 4] & 0x00FFFFFF) | ((a << 28) + (a << 24));
2346
2347							dest[(x + i) + (y + j) * internal.width] = color;
2348						}
2349					}
2350
2351					source++;
2352				}
2353			}
2354
2355			(byte*&)destSlice += internal.sliceB;
2356		}
2357
2358		external.unlockRect();
2359		internal.unlockRect();
2360	}
2361
2362	void Surface::decodeDXT5(Buffer &internal, Buffer &external)
2363	{
2364		unsigned int *destSlice = (unsigned int*)internal.lockRect(0, 0, 0, LOCK_UPDATE);
2365		const DXT5 *source = (const DXT5*)external.lockRect(0, 0, 0, LOCK_READONLY);
2366
2367		for(int z = 0; z < external.depth; z++)
2368		{
2369			unsigned int *dest = destSlice;
2370
2371			for(int y = 0; y < external.height; y += 4)
2372			{
2373				for(int x = 0; x < external.width; x += 4)
2374				{
2375					Color<byte> c[4];
2376
2377					c[0] = source->c0;
2378					c[1] = source->c1;
2379
2380					// c2 = 2 / 3 * c0 + 1 / 3 * c1
2381					c[2].r = (byte)((2 * (word)c[0].r + (word)c[1].r + 1) / 3);
2382					c[2].g = (byte)((2 * (word)c[0].g + (word)c[1].g + 1) / 3);
2383					c[2].b = (byte)((2 * (word)c[0].b + (word)c[1].b + 1) / 3);
2384
2385					// c3 = 1 / 3 * c0 + 2 / 3 * c1
2386					c[3].r = (byte)(((word)c[0].r + 2 * (word)c[1].r + 1) / 3);
2387					c[3].g = (byte)(((word)c[0].g + 2 * (word)c[1].g + 1) / 3);
2388					c[3].b = (byte)(((word)c[0].b + 2 * (word)c[1].b + 1) / 3);
2389
2390					byte a[8];
2391
2392					a[0] = source->a0;
2393					a[1] = source->a1;
2394
2395					if(a[0] > a[1])
2396					{
2397						a[2] = (byte)((6 * (word)a[0] + 1 * (word)a[1] + 3) / 7);
2398						a[3] = (byte)((5 * (word)a[0] + 2 * (word)a[1] + 3) / 7);
2399						a[4] = (byte)((4 * (word)a[0] + 3 * (word)a[1] + 3) / 7);
2400						a[5] = (byte)((3 * (word)a[0] + 4 * (word)a[1] + 3) / 7);
2401						a[6] = (byte)((2 * (word)a[0] + 5 * (word)a[1] + 3) / 7);
2402						a[7] = (byte)((1 * (word)a[0] + 6 * (word)a[1] + 3) / 7);
2403					}
2404					else
2405					{
2406						a[2] = (byte)((4 * (word)a[0] + 1 * (word)a[1] + 2) / 5);
2407						a[3] = (byte)((3 * (word)a[0] + 2 * (word)a[1] + 2) / 5);
2408						a[4] = (byte)((2 * (word)a[0] + 3 * (word)a[1] + 2) / 5);
2409						a[5] = (byte)((1 * (word)a[0] + 4 * (word)a[1] + 2) / 5);
2410						a[6] = 0;
2411						a[7] = 0xFF;
2412					}
2413
2414					for(int j = 0; j < 4 && (y + j) < internal.height; j++)
2415					{
2416						for(int i = 0; i < 4 && (x + i) < internal.width; i++)
2417						{
2418							unsigned int alpha = (unsigned int)a[(unsigned int)(source->alut >> (16 + 3 * (i + j * 4))) % 8] << 24;
2419							unsigned int color = (c[(source->clut >> 2 * (i + j * 4)) % 4] & 0x00FFFFFF) | alpha;
2420
2421							dest[(x + i) + (y + j) * internal.width] = color;
2422						}
2423					}
2424
2425					source++;
2426				}
2427			}
2428
2429			(byte*&)destSlice += internal.sliceB;
2430		}
2431
2432		external.unlockRect();
2433		internal.unlockRect();
2434	}
2435
2436	void Surface::decodeATI1(Buffer &internal, Buffer &external)
2437	{
2438		byte *destSlice = (byte*)internal.lockRect(0, 0, 0, LOCK_UPDATE);
2439		const ATI1 *source = (const ATI1*)external.lockRect(0, 0, 0, LOCK_READONLY);
2440
2441		for(int z = 0; z < external.depth; z++)
2442		{
2443			byte *dest = destSlice;
2444
2445			for(int y = 0; y < external.height; y += 4)
2446			{
2447				for(int x = 0; x < external.width; x += 4)
2448				{
2449					byte r[8];
2450
2451					r[0] = source->r0;
2452					r[1] = source->r1;
2453
2454					if(r[0] > r[1])
2455					{
2456						r[2] = (byte)((6 * (word)r[0] + 1 * (word)r[1] + 3) / 7);
2457						r[3] = (byte)((5 * (word)r[0] + 2 * (word)r[1] + 3) / 7);
2458						r[4] = (byte)((4 * (word)r[0] + 3 * (word)r[1] + 3) / 7);
2459						r[5] = (byte)((3 * (word)r[0] + 4 * (word)r[1] + 3) / 7);
2460						r[6] = (byte)((2 * (word)r[0] + 5 * (word)r[1] + 3) / 7);
2461						r[7] = (byte)((1 * (word)r[0] + 6 * (word)r[1] + 3) / 7);
2462					}
2463					else
2464					{
2465						r[2] = (byte)((4 * (word)r[0] + 1 * (word)r[1] + 2) / 5);
2466						r[3] = (byte)((3 * (word)r[0] + 2 * (word)r[1] + 2) / 5);
2467						r[4] = (byte)((2 * (word)r[0] + 3 * (word)r[1] + 2) / 5);
2468						r[5] = (byte)((1 * (word)r[0] + 4 * (word)r[1] + 2) / 5);
2469						r[6] = 0;
2470						r[7] = 0xFF;
2471					}
2472
2473					for(int j = 0; j < 4 && (y + j) < internal.height; j++)
2474					{
2475						for(int i = 0; i < 4 && (x + i) < internal.width; i++)
2476						{
2477							dest[(x + i) + (y + j) * internal.width] = r[(unsigned int)(source->rlut >> (16 + 3 * (i + j * 4))) % 8];
2478						}
2479					}
2480
2481					source++;
2482				}
2483			}
2484
2485			destSlice += internal.sliceB;
2486		}
2487
2488		external.unlockRect();
2489		internal.unlockRect();
2490	}
2491
2492	void Surface::decodeATI2(Buffer &internal, Buffer &external)
2493	{
2494		word *destSlice = (word*)internal.lockRect(0, 0, 0, LOCK_UPDATE);
2495		const ATI2 *source = (const ATI2*)external.lockRect(0, 0, 0, LOCK_READONLY);
2496
2497		for(int z = 0; z < external.depth; z++)
2498		{
2499			word *dest = destSlice;
2500
2501			for(int y = 0; y < external.height; y += 4)
2502			{
2503				for(int x = 0; x < external.width; x += 4)
2504				{
2505					byte X[8];
2506
2507					X[0] = source->x0;
2508					X[1] = source->x1;
2509
2510					if(X[0] > X[1])
2511					{
2512						X[2] = (byte)((6 * (word)X[0] + 1 * (word)X[1] + 3) / 7);
2513						X[3] = (byte)((5 * (word)X[0] + 2 * (word)X[1] + 3) / 7);
2514						X[4] = (byte)((4 * (word)X[0] + 3 * (word)X[1] + 3) / 7);
2515						X[5] = (byte)((3 * (word)X[0] + 4 * (word)X[1] + 3) / 7);
2516						X[6] = (byte)((2 * (word)X[0] + 5 * (word)X[1] + 3) / 7);
2517						X[7] = (byte)((1 * (word)X[0] + 6 * (word)X[1] + 3) / 7);
2518					}
2519					else
2520					{
2521						X[2] = (byte)((4 * (word)X[0] + 1 * (word)X[1] + 2) / 5);
2522						X[3] = (byte)((3 * (word)X[0] + 2 * (word)X[1] + 2) / 5);
2523						X[4] = (byte)((2 * (word)X[0] + 3 * (word)X[1] + 2) / 5);
2524						X[5] = (byte)((1 * (word)X[0] + 4 * (word)X[1] + 2) / 5);
2525						X[6] = 0;
2526						X[7] = 0xFF;
2527					}
2528
2529					byte Y[8];
2530
2531					Y[0] = source->y0;
2532					Y[1] = source->y1;
2533
2534					if(Y[0] > Y[1])
2535					{
2536						Y[2] = (byte)((6 * (word)Y[0] + 1 * (word)Y[1] + 3) / 7);
2537						Y[3] = (byte)((5 * (word)Y[0] + 2 * (word)Y[1] + 3) / 7);
2538						Y[4] = (byte)((4 * (word)Y[0] + 3 * (word)Y[1] + 3) / 7);
2539						Y[5] = (byte)((3 * (word)Y[0] + 4 * (word)Y[1] + 3) / 7);
2540						Y[6] = (byte)((2 * (word)Y[0] + 5 * (word)Y[1] + 3) / 7);
2541						Y[7] = (byte)((1 * (word)Y[0] + 6 * (word)Y[1] + 3) / 7);
2542					}
2543					else
2544					{
2545						Y[2] = (byte)((4 * (word)Y[0] + 1 * (word)Y[1] + 2) / 5);
2546						Y[3] = (byte)((3 * (word)Y[0] + 2 * (word)Y[1] + 2) / 5);
2547						Y[4] = (byte)((2 * (word)Y[0] + 3 * (word)Y[1] + 2) / 5);
2548						Y[5] = (byte)((1 * (word)Y[0] + 4 * (word)Y[1] + 2) / 5);
2549						Y[6] = 0;
2550						Y[7] = 0xFF;
2551					}
2552
2553					for(int j = 0; j < 4 && (y + j) < internal.height; j++)
2554					{
2555						for(int i = 0; i < 4 && (x + i) < internal.width; i++)
2556						{
2557							word r = X[(unsigned int)(source->xlut >> (16 + 3 * (i + j * 4))) % 8];
2558							word g = Y[(unsigned int)(source->ylut >> (16 + 3 * (i + j * 4))) % 8];
2559
2560							dest[(x + i) + (y + j) * internal.width] = (g << 8) + r;
2561						}
2562					}
2563
2564					source++;
2565				}
2566			}
2567
2568			(byte*&)destSlice += internal.sliceB;
2569		}
2570
2571		external.unlockRect();
2572		internal.unlockRect();
2573	}
2574
2575	void Surface::decodeETC2(Buffer &internal, Buffer &external, int nbAlphaBits, bool isSRGB)
2576	{
2577		ETC_Decoder::Decode((const byte*)external.lockRect(0, 0, 0, LOCK_READONLY), (byte*)internal.lockRect(0, 0, 0, LOCK_UPDATE), external.width, external.height, internal.width, internal.height, internal.pitchB, internal.bytes,
2578		                    (nbAlphaBits == 8) ? ETC_Decoder::ETC_RGBA : ((nbAlphaBits == 1) ? ETC_Decoder::ETC_RGB_PUNCHTHROUGH_ALPHA : ETC_Decoder::ETC_RGB));
2579		external.unlockRect();
2580		internal.unlockRect();
2581
2582		if(isSRGB)
2583		{
2584			static byte sRGBtoLinearTable[256];
2585			static bool sRGBtoLinearTableDirty = true;
2586			if(sRGBtoLinearTableDirty)
2587			{
2588				for(int i = 0; i < 256; i++)
2589				{
2590					sRGBtoLinearTable[i] = static_cast<byte>(sRGBtoLinear(static_cast<float>(i) / 255.0f) * 255.0f + 0.5f);
2591				}
2592				sRGBtoLinearTableDirty = false;
2593			}
2594
2595			// Perform sRGB conversion in place after decoding
2596			byte *src = (byte*)internal.lockRect(0, 0, 0, LOCK_READWRITE);
2597			for(int y = 0; y < internal.height; y++)
2598			{
2599				byte *srcRow = src + y * internal.pitchB;
2600				for(int x = 0; x <  internal.width; x++)
2601				{
2602					byte *srcPix = srcRow + x * internal.bytes;
2603					for(int i = 0; i < 3; i++)
2604					{
2605						srcPix[i] = sRGBtoLinearTable[srcPix[i]];
2606					}
2607				}
2608			}
2609			internal.unlockRect();
2610		}
2611	}
2612
2613	void Surface::decodeEAC(Buffer &internal, Buffer &external, int nbChannels, bool isSigned)
2614	{
2615		ASSERT(nbChannels == 1 || nbChannels == 2);
2616
2617		byte *src = (byte*)internal.lockRect(0, 0, 0, LOCK_READWRITE);
2618		ETC_Decoder::Decode((const byte*)external.lockRect(0, 0, 0, LOCK_READONLY), src, external.width, external.height, internal.width, internal.height, internal.pitchB, internal.bytes,
2619		                    (nbChannels == 1) ? (isSigned ? ETC_Decoder::ETC_R_SIGNED : ETC_Decoder::ETC_R_UNSIGNED) : (isSigned ? ETC_Decoder::ETC_RG_SIGNED : ETC_Decoder::ETC_RG_UNSIGNED));
2620		external.unlockRect();
2621
2622		// FIXME: We convert EAC data to float, until signed short internal formats are supported
2623		//        This code can be removed if ETC2 images are decoded to internal 16 bit signed R/RG formats
2624		const float normalization = isSigned ? (1.0f / (8.0f * 127.875f)) : (1.0f / (8.0f * 255.875f));
2625		for(int y = 0; y < internal.height; y++)
2626		{
2627			byte* srcRow = src + y * internal.pitchB;
2628			for(int x = internal.width - 1; x >= 0; x--)
2629			{
2630				int* srcPix = reinterpret_cast<int*>(srcRow + x * internal.bytes);
2631				float* dstPix = reinterpret_cast<float*>(srcPix);
2632				for(int c = nbChannels - 1; c >= 0; c--)
2633				{
2634					dstPix[c] = clamp(static_cast<float>(srcPix[c]) * normalization, -1.0f, 1.0f);
2635				}
2636			}
2637		}
2638
2639		internal.unlockRect();
2640	}
2641
2642	void Surface::decodeASTC(Buffer &internal, Buffer &external, int xBlockSize, int yBlockSize, int zBlockSize, bool isSRGB)
2643	{
2644	}
2645
2646	unsigned int Surface::size(int width, int height, int depth, int border, int samples, Format format)
2647	{
2648		width += 2 * border;
2649		height += 2 * border;
2650
2651		// Dimensions rounded up to multiples of 4, used for compressed formats
2652		int width4 = align(width, 4);
2653		int height4 = align(height, 4);
2654
2655		switch(format)
2656		{
2657		case FORMAT_DXT1:
2658		case FORMAT_ATI1:
2659		case FORMAT_ETC1:
2660		case FORMAT_R11_EAC:
2661		case FORMAT_SIGNED_R11_EAC:
2662		case FORMAT_RGB8_ETC2:
2663		case FORMAT_SRGB8_ETC2:
2664		case FORMAT_RGB8_PUNCHTHROUGH_ALPHA1_ETC2:
2665		case FORMAT_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2:
2666			return width4 * height4 * depth / 2;
2667		case FORMAT_DXT3:
2668		case FORMAT_DXT5:
2669		case FORMAT_ATI2:
2670		case FORMAT_RG11_EAC:
2671		case FORMAT_SIGNED_RG11_EAC:
2672		case FORMAT_RGBA8_ETC2_EAC:
2673		case FORMAT_SRGB8_ALPHA8_ETC2_EAC:
2674		case FORMAT_RGBA_ASTC_4x4_KHR:
2675		case FORMAT_SRGB8_ALPHA8_ASTC_4x4_KHR:
2676			return width4 * height4 * depth;
2677		case FORMAT_RGBA_ASTC_5x4_KHR:
2678		case FORMAT_SRGB8_ALPHA8_ASTC_5x4_KHR:
2679			return align(width, 5) * height4 * depth;
2680		case FORMAT_RGBA_ASTC_5x5_KHR:
2681		case FORMAT_SRGB8_ALPHA8_ASTC_5x5_KHR:
2682			return align(width, 5) * align(height, 5) * depth;
2683		case FORMAT_RGBA_ASTC_6x5_KHR:
2684		case FORMAT_SRGB8_ALPHA8_ASTC_6x5_KHR:
2685			return align(width, 6) * align(height, 5) * depth;
2686		case FORMAT_RGBA_ASTC_6x6_KHR:
2687		case FORMAT_SRGB8_ALPHA8_ASTC_6x6_KHR:
2688			return align(width, 6) * align(height, 6) * depth;
2689		case FORMAT_RGBA_ASTC_8x5_KHR:
2690		case FORMAT_SRGB8_ALPHA8_ASTC_8x5_KHR:
2691			return align(width, 8) * align(height, 5) * depth;
2692		case FORMAT_RGBA_ASTC_8x6_KHR:
2693		case FORMAT_SRGB8_ALPHA8_ASTC_8x6_KHR:
2694			return align(width, 8) * align(height, 6) * depth;
2695		case FORMAT_RGBA_ASTC_8x8_KHR:
2696		case FORMAT_SRGB8_ALPHA8_ASTC_8x8_KHR:
2697			return align(width, 8) * align(height, 8) * depth;
2698		case FORMAT_RGBA_ASTC_10x5_KHR:
2699		case FORMAT_SRGB8_ALPHA8_ASTC_10x5_KHR:
2700			return align(width, 10) * align(height, 5) * depth;
2701		case FORMAT_RGBA_ASTC_10x6_KHR:
2702		case FORMAT_SRGB8_ALPHA8_ASTC_10x6_KHR:
2703			return align(width, 10) * align(height, 6) * depth;
2704		case FORMAT_RGBA_ASTC_10x8_KHR:
2705		case FORMAT_SRGB8_ALPHA8_ASTC_10x8_KHR:
2706			return align(width, 10) * align(height, 8) * depth;
2707		case FORMAT_RGBA_ASTC_10x10_KHR:
2708		case FORMAT_SRGB8_ALPHA8_ASTC_10x10_KHR:
2709			return align(width, 10) * align(height, 10) * depth;
2710		case FORMAT_RGBA_ASTC_12x10_KHR:
2711		case FORMAT_SRGB8_ALPHA8_ASTC_12x10_KHR:
2712			return align(width, 12) * align(height, 10) * depth;
2713		case FORMAT_RGBA_ASTC_12x12_KHR:
2714		case FORMAT_SRGB8_ALPHA8_ASTC_12x12_KHR:
2715			return align(width, 12) * align(height, 12) * depth;
2716		case FORMAT_YV12_BT601:
2717		case FORMAT_YV12_BT709:
2718		case FORMAT_YV12_JFIF:
2719			{
2720				unsigned int YStride = align(width, 16);
2721				unsigned int YSize = YStride * height;
2722				unsigned int CStride = align(YStride / 2, 16);
2723				unsigned int CSize = CStride * height / 2;
2724
2725				return YSize + 2 * CSize;
2726			}
2727		default:
2728			return bytes(format) * width * height * depth * samples;
2729		}
2730	}
2731
2732	bool Surface::isStencil(Format format)
2733	{
2734		switch(format)
2735		{
2736		case FORMAT_D32:
2737		case FORMAT_D16:
2738		case FORMAT_D24X8:
2739		case FORMAT_D32F:
2740		case FORMAT_D32F_COMPLEMENTARY:
2741		case FORMAT_D32F_LOCKABLE:
2742		case FORMAT_D32F_SHADOW:
2743			return false;
2744		case FORMAT_D24S8:
2745		case FORMAT_D24FS8:
2746		case FORMAT_S8:
2747		case FORMAT_DF24S8:
2748		case FORMAT_DF16S8:
2749		case FORMAT_D32FS8_TEXTURE:
2750		case FORMAT_D32FS8_SHADOW:
2751		case FORMAT_D32FS8:
2752		case FORMAT_D32FS8_COMPLEMENTARY:
2753		case FORMAT_INTZ:
2754			return true;
2755		default:
2756			return false;
2757		}
2758	}
2759
2760	bool Surface::isDepth(Format format)
2761	{
2762		switch(format)
2763		{
2764		case FORMAT_D32:
2765		case FORMAT_D16:
2766		case FORMAT_D24X8:
2767		case FORMAT_D24S8:
2768		case FORMAT_D24FS8:
2769		case FORMAT_D32F:
2770		case FORMAT_D32FS8:
2771		case FORMAT_D32F_COMPLEMENTARY:
2772		case FORMAT_D32FS8_COMPLEMENTARY:
2773		case FORMAT_D32F_LOCKABLE:
2774		case FORMAT_DF24S8:
2775		case FORMAT_DF16S8:
2776		case FORMAT_D32FS8_TEXTURE:
2777		case FORMAT_D32F_SHADOW:
2778		case FORMAT_D32FS8_SHADOW:
2779		case FORMAT_INTZ:
2780			return true;
2781		case FORMAT_S8:
2782			return false;
2783		default:
2784			return false;
2785		}
2786	}
2787
2788	bool Surface::hasQuadLayout(Format format)
2789	{
2790		switch(format)
2791		{
2792		case FORMAT_D32:
2793		case FORMAT_D16:
2794		case FORMAT_D24X8:
2795		case FORMAT_D24S8:
2796		case FORMAT_D24FS8:
2797		case FORMAT_D32F:
2798		case FORMAT_D32FS8:
2799		case FORMAT_D32F_COMPLEMENTARY:
2800		case FORMAT_D32FS8_COMPLEMENTARY:
2801		case FORMAT_DF24S8:
2802		case FORMAT_DF16S8:
2803		case FORMAT_INTZ:
2804		case FORMAT_S8:
2805		case FORMAT_A8G8R8B8Q:
2806		case FORMAT_X8G8R8B8Q:
2807			return true;
2808		case FORMAT_D32F_LOCKABLE:
2809		case FORMAT_D32FS8_TEXTURE:
2810		case FORMAT_D32F_SHADOW:
2811		case FORMAT_D32FS8_SHADOW:
2812		default:
2813			break;
2814		}
2815
2816		return false;
2817	}
2818
2819	bool Surface::isPalette(Format format)
2820	{
2821		switch(format)
2822		{
2823		case FORMAT_P8:
2824		case FORMAT_A8P8:
2825			return true;
2826		default:
2827			return false;
2828		}
2829	}
2830
2831	bool Surface::isFloatFormat(Format format)
2832	{
2833		switch(format)
2834		{
2835		case FORMAT_R5G6B5:
2836		case FORMAT_R8G8B8:
2837		case FORMAT_B8G8R8:
2838		case FORMAT_X8R8G8B8:
2839		case FORMAT_X8B8G8R8I:
2840		case FORMAT_X8B8G8R8:
2841		case FORMAT_A8R8G8B8:
2842		case FORMAT_SRGB8_X8:
2843		case FORMAT_SRGB8_A8:
2844		case FORMAT_A8B8G8R8I:
2845		case FORMAT_R8UI:
2846		case FORMAT_G8R8UI:
2847		case FORMAT_X8B8G8R8UI:
2848		case FORMAT_A8B8G8R8UI:
2849		case FORMAT_A8B8G8R8:
2850		case FORMAT_G8R8I:
2851		case FORMAT_G8R8:
2852		case FORMAT_A2B10G10R10:
2853		case FORMAT_A2B10G10R10UI:
2854		case FORMAT_R8_SNORM:
2855		case FORMAT_G8R8_SNORM:
2856		case FORMAT_X8B8G8R8_SNORM:
2857		case FORMAT_A8B8G8R8_SNORM:
2858		case FORMAT_R16I:
2859		case FORMAT_R16UI:
2860		case FORMAT_G16R16I:
2861		case FORMAT_G16R16UI:
2862		case FORMAT_G16R16:
2863		case FORMAT_X16B16G16R16I:
2864		case FORMAT_X16B16G16R16UI:
2865		case FORMAT_A16B16G16R16I:
2866		case FORMAT_A16B16G16R16UI:
2867		case FORMAT_A16B16G16R16:
2868		case FORMAT_V8U8:
2869		case FORMAT_Q8W8V8U8:
2870		case FORMAT_X8L8V8U8:
2871		case FORMAT_V16U16:
2872		case FORMAT_A16W16V16U16:
2873		case FORMAT_Q16W16V16U16:
2874		case FORMAT_A8:
2875		case FORMAT_R8I:
2876		case FORMAT_R8:
2877		case FORMAT_S8:
2878		case FORMAT_L8:
2879		case FORMAT_L16:
2880		case FORMAT_A8L8:
2881		case FORMAT_YV12_BT601:
2882		case FORMAT_YV12_BT709:
2883		case FORMAT_YV12_JFIF:
2884		case FORMAT_R32I:
2885		case FORMAT_R32UI:
2886		case FORMAT_G32R32I:
2887		case FORMAT_G32R32UI:
2888		case FORMAT_X32B32G32R32I:
2889		case FORMAT_X32B32G32R32UI:
2890		case FORMAT_A32B32G32R32I:
2891		case FORMAT_A32B32G32R32UI:
2892			return false;
2893		case FORMAT_R16F:
2894		case FORMAT_G16R16F:
2895		case FORMAT_B16G16R16F:
2896		case FORMAT_X16B16G16R16F:
2897		case FORMAT_A16B16G16R16F:
2898		case FORMAT_X16B16G16R16F_UNSIGNED:
2899		case FORMAT_R32F:
2900		case FORMAT_G32R32F:
2901		case FORMAT_B32G32R32F:
2902		case FORMAT_X32B32G32R32F:
2903		case FORMAT_A32B32G32R32F:
2904		case FORMAT_X32B32G32R32F_UNSIGNED:
2905		case FORMAT_D32F:
2906		case FORMAT_D32FS8:
2907		case FORMAT_D32F_COMPLEMENTARY:
2908		case FORMAT_D32FS8_COMPLEMENTARY:
2909		case FORMAT_D32F_LOCKABLE:
2910		case FORMAT_D32FS8_TEXTURE:
2911		case FORMAT_D32F_SHADOW:
2912		case FORMAT_D32FS8_SHADOW:
2913		case FORMAT_L16F:
2914		case FORMAT_A16L16F:
2915		case FORMAT_L32F:
2916		case FORMAT_A32L32F:
2917			return true;
2918		default:
2919			ASSERT(false);
2920		}
2921
2922		return false;
2923	}
2924
2925	bool Surface::isUnsignedComponent(Format format, int component)
2926	{
2927		switch(format)
2928		{
2929		case FORMAT_NULL:
2930		case FORMAT_R5G6B5:
2931		case FORMAT_R8G8B8:
2932		case FORMAT_B8G8R8:
2933		case FORMAT_X8R8G8B8:
2934		case FORMAT_X8B8G8R8:
2935		case FORMAT_A8R8G8B8:
2936		case FORMAT_A8B8G8R8:
2937		case FORMAT_SRGB8_X8:
2938		case FORMAT_SRGB8_A8:
2939		case FORMAT_G8R8:
2940		case FORMAT_A2B10G10R10:
2941		case FORMAT_A2B10G10R10UI:
2942		case FORMAT_R16UI:
2943		case FORMAT_G16R16:
2944		case FORMAT_G16R16UI:
2945		case FORMAT_X16B16G16R16UI:
2946		case FORMAT_A16B16G16R16:
2947		case FORMAT_A16B16G16R16UI:
2948		case FORMAT_R32UI:
2949		case FORMAT_G32R32UI:
2950		case FORMAT_X32B32G32R32UI:
2951		case FORMAT_A32B32G32R32UI:
2952		case FORMAT_X32B32G32R32F_UNSIGNED:
2953		case FORMAT_R8UI:
2954		case FORMAT_G8R8UI:
2955		case FORMAT_X8B8G8R8UI:
2956		case FORMAT_A8B8G8R8UI:
2957		case FORMAT_D32F:
2958		case FORMAT_D32FS8:
2959		case FORMAT_D32F_COMPLEMENTARY:
2960		case FORMAT_D32FS8_COMPLEMENTARY:
2961		case FORMAT_D32F_LOCKABLE:
2962		case FORMAT_D32FS8_TEXTURE:
2963		case FORMAT_D32F_SHADOW:
2964		case FORMAT_D32FS8_SHADOW:
2965		case FORMAT_A8:
2966		case FORMAT_R8:
2967		case FORMAT_L8:
2968		case FORMAT_L16:
2969		case FORMAT_A8L8:
2970		case FORMAT_YV12_BT601:
2971		case FORMAT_YV12_BT709:
2972		case FORMAT_YV12_JFIF:
2973			return true;
2974		case FORMAT_A8B8G8R8I:
2975		case FORMAT_A16B16G16R16I:
2976		case FORMAT_A32B32G32R32I:
2977		case FORMAT_A8B8G8R8_SNORM:
2978		case FORMAT_Q8W8V8U8:
2979		case FORMAT_Q16W16V16U16:
2980		case FORMAT_A32B32G32R32F:
2981			return false;
2982		case FORMAT_R32F:
2983		case FORMAT_R8I:
2984		case FORMAT_R16I:
2985		case FORMAT_R32I:
2986		case FORMAT_R8_SNORM:
2987			return component >= 1;
2988		case FORMAT_V8U8:
2989		case FORMAT_X8L8V8U8:
2990		case FORMAT_V16U16:
2991		case FORMAT_G32R32F:
2992		case FORMAT_G8R8I:
2993		case FORMAT_G16R16I:
2994		case FORMAT_G32R32I:
2995		case FORMAT_G8R8_SNORM:
2996			return component >= 2;
2997		case FORMAT_A16W16V16U16:
2998		case FORMAT_B32G32R32F:
2999		case FORMAT_X32B32G32R32F:
3000		case FORMAT_X8B8G8R8I:
3001		case FORMAT_X16B16G16R16I:
3002		case FORMAT_X32B32G32R32I:
3003		case FORMAT_X8B8G8R8_SNORM:
3004			return component >= 3;
3005		default:
3006			ASSERT(false);
3007		}
3008
3009		return false;
3010	}
3011
3012	bool Surface::isSRGBreadable(Format format)
3013	{
3014		// Keep in sync with Capabilities::isSRGBreadable
3015		switch(format)
3016		{
3017		case FORMAT_L8:
3018		case FORMAT_A8L8:
3019		case FORMAT_R8G8B8:
3020		case FORMAT_A8R8G8B8:
3021		case FORMAT_X8R8G8B8:
3022		case FORMAT_A8B8G8R8:
3023		case FORMAT_X8B8G8R8:
3024		case FORMAT_SRGB8_X8:
3025		case FORMAT_SRGB8_A8:
3026		case FORMAT_R5G6B5:
3027		case FORMAT_X1R5G5B5:
3028		case FORMAT_A1R5G5B5:
3029		case FORMAT_A4R4G4B4:
3030		case FORMAT_DXT1:
3031		case FORMAT_DXT3:
3032		case FORMAT_DXT5:
3033		case FORMAT_ATI1:
3034		case FORMAT_ATI2:
3035			return true;
3036		default:
3037			return false;
3038		}
3039	}
3040
3041	bool Surface::isSRGBwritable(Format format)
3042	{
3043		// Keep in sync with Capabilities::isSRGBwritable
3044		switch(format)
3045		{
3046		case FORMAT_NULL:
3047		case FORMAT_A8R8G8B8:
3048		case FORMAT_X8R8G8B8:
3049		case FORMAT_A8B8G8R8:
3050		case FORMAT_X8B8G8R8:
3051		case FORMAT_SRGB8_X8:
3052		case FORMAT_SRGB8_A8:
3053		case FORMAT_R5G6B5:
3054			return true;
3055		default:
3056			return false;
3057		}
3058	}
3059
3060	bool Surface::isSRGBformat(Format format)
3061	{
3062		switch(format)
3063		{
3064		case FORMAT_SRGB8_X8:
3065		case FORMAT_SRGB8_A8:
3066			return true;
3067		default:
3068			return false;
3069		}
3070	}
3071
3072	bool Surface::isCompressed(Format format)
3073	{
3074		switch(format)
3075		{
3076		case FORMAT_DXT1:
3077		case FORMAT_DXT3:
3078		case FORMAT_DXT5:
3079		case FORMAT_ATI1:
3080		case FORMAT_ATI2:
3081		case FORMAT_ETC1:
3082		case FORMAT_R11_EAC:
3083		case FORMAT_SIGNED_R11_EAC:
3084		case FORMAT_RG11_EAC:
3085		case FORMAT_SIGNED_RG11_EAC:
3086		case FORMAT_RGB8_ETC2:
3087		case FORMAT_SRGB8_ETC2:
3088		case FORMAT_RGB8_PUNCHTHROUGH_ALPHA1_ETC2:
3089		case FORMAT_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2:
3090		case FORMAT_RGBA8_ETC2_EAC:
3091		case FORMAT_SRGB8_ALPHA8_ETC2_EAC:
3092		case FORMAT_RGBA_ASTC_4x4_KHR:
3093		case FORMAT_RGBA_ASTC_5x4_KHR:
3094		case FORMAT_RGBA_ASTC_5x5_KHR:
3095		case FORMAT_RGBA_ASTC_6x5_KHR:
3096		case FORMAT_RGBA_ASTC_6x6_KHR:
3097		case FORMAT_RGBA_ASTC_8x5_KHR:
3098		case FORMAT_RGBA_ASTC_8x6_KHR:
3099		case FORMAT_RGBA_ASTC_8x8_KHR:
3100		case FORMAT_RGBA_ASTC_10x5_KHR:
3101		case FORMAT_RGBA_ASTC_10x6_KHR:
3102		case FORMAT_RGBA_ASTC_10x8_KHR:
3103		case FORMAT_RGBA_ASTC_10x10_KHR:
3104		case FORMAT_RGBA_ASTC_12x10_KHR:
3105		case FORMAT_RGBA_ASTC_12x12_KHR:
3106		case FORMAT_SRGB8_ALPHA8_ASTC_4x4_KHR:
3107		case FORMAT_SRGB8_ALPHA8_ASTC_5x4_KHR:
3108		case FORMAT_SRGB8_ALPHA8_ASTC_5x5_KHR:
3109		case FORMAT_SRGB8_ALPHA8_ASTC_6x5_KHR:
3110		case FORMAT_SRGB8_ALPHA8_ASTC_6x6_KHR:
3111		case FORMAT_SRGB8_ALPHA8_ASTC_8x5_KHR:
3112		case FORMAT_SRGB8_ALPHA8_ASTC_8x6_KHR:
3113		case FORMAT_SRGB8_ALPHA8_ASTC_8x8_KHR:
3114		case FORMAT_SRGB8_ALPHA8_ASTC_10x5_KHR:
3115		case FORMAT_SRGB8_ALPHA8_ASTC_10x6_KHR:
3116		case FORMAT_SRGB8_ALPHA8_ASTC_10x8_KHR:
3117		case FORMAT_SRGB8_ALPHA8_ASTC_10x10_KHR:
3118		case FORMAT_SRGB8_ALPHA8_ASTC_12x10_KHR:
3119		case FORMAT_SRGB8_ALPHA8_ASTC_12x12_KHR:
3120			return true;
3121		default:
3122			return false;
3123		}
3124	}
3125
3126	bool Surface::isSignedNonNormalizedInteger(Format format)
3127	{
3128		switch(format)
3129		{
3130		case FORMAT_A8B8G8R8I:
3131		case FORMAT_X8B8G8R8I:
3132		case FORMAT_G8R8I:
3133		case FORMAT_R8I:
3134		case FORMAT_A16B16G16R16I:
3135		case FORMAT_X16B16G16R16I:
3136		case FORMAT_G16R16I:
3137		case FORMAT_R16I:
3138		case FORMAT_A32B32G32R32I:
3139		case FORMAT_X32B32G32R32I:
3140		case FORMAT_G32R32I:
3141		case FORMAT_R32I:
3142			return true;
3143		default:
3144			return false;
3145		}
3146	}
3147
3148	bool Surface::isUnsignedNonNormalizedInteger(Format format)
3149	{
3150		switch(format)
3151		{
3152		case FORMAT_A8B8G8R8UI:
3153		case FORMAT_X8B8G8R8UI:
3154		case FORMAT_G8R8UI:
3155		case FORMAT_R8UI:
3156		case FORMAT_A16B16G16R16UI:
3157		case FORMAT_X16B16G16R16UI:
3158		case FORMAT_G16R16UI:
3159		case FORMAT_R16UI:
3160		case FORMAT_A32B32G32R32UI:
3161		case FORMAT_X32B32G32R32UI:
3162		case FORMAT_G32R32UI:
3163		case FORMAT_R32UI:
3164			return true;
3165		default:
3166			return false;
3167		}
3168	}
3169
3170	bool Surface::isNonNormalizedInteger(Format format)
3171	{
3172		return isSignedNonNormalizedInteger(format) ||
3173		       isUnsignedNonNormalizedInteger(format);
3174	}
3175
3176	bool Surface::isNormalizedInteger(Format format)
3177	{
3178		return !isFloatFormat(format) &&
3179		       !isNonNormalizedInteger(format) &&
3180		       !isCompressed(format) &&
3181		       !isDepth(format) &&
3182		       !isStencil(format);
3183	}
3184
3185	int Surface::componentCount(Format format)
3186	{
3187		switch(format)
3188		{
3189		case FORMAT_R5G6B5:         return 3;
3190		case FORMAT_X8R8G8B8:       return 3;
3191		case FORMAT_X8B8G8R8I:      return 3;
3192		case FORMAT_X8B8G8R8:       return 3;
3193		case FORMAT_A8R8G8B8:       return 4;
3194		case FORMAT_SRGB8_X8:       return 3;
3195		case FORMAT_SRGB8_A8:       return 4;
3196		case FORMAT_A8B8G8R8I:      return 4;
3197		case FORMAT_A8B8G8R8:       return 4;
3198		case FORMAT_G8R8I:          return 2;
3199		case FORMAT_G8R8:           return 2;
3200		case FORMAT_R8_SNORM:      return 1;
3201		case FORMAT_G8R8_SNORM:    return 2;
3202		case FORMAT_X8B8G8R8_SNORM:return 3;
3203		case FORMAT_A8B8G8R8_SNORM:return 4;
3204		case FORMAT_R8UI:           return 1;
3205		case FORMAT_G8R8UI:         return 2;
3206		case FORMAT_X8B8G8R8UI:     return 3;
3207		case FORMAT_A8B8G8R8UI:     return 4;
3208		case FORMAT_A2B10G10R10:    return 4;
3209		case FORMAT_A2B10G10R10UI:  return 4;
3210		case FORMAT_G16R16I:        return 2;
3211		case FORMAT_G16R16UI:       return 2;
3212		case FORMAT_G16R16:         return 2;
3213		case FORMAT_G32R32I:        return 2;
3214		case FORMAT_G32R32UI:       return 2;
3215		case FORMAT_X16B16G16R16I:  return 3;
3216		case FORMAT_X16B16G16R16UI: return 3;
3217		case FORMAT_A16B16G16R16I:  return 4;
3218		case FORMAT_A16B16G16R16UI: return 4;
3219		case FORMAT_A16B16G16R16:   return 4;
3220		case FORMAT_X32B32G32R32I:  return 3;
3221		case FORMAT_X32B32G32R32UI: return 3;
3222		case FORMAT_A32B32G32R32I:  return 4;
3223		case FORMAT_A32B32G32R32UI: return 4;
3224		case FORMAT_V8U8:           return 2;
3225		case FORMAT_Q8W8V8U8:       return 4;
3226		case FORMAT_X8L8V8U8:       return 3;
3227		case FORMAT_V16U16:         return 2;
3228		case FORMAT_A16W16V16U16:   return 4;
3229		case FORMAT_Q16W16V16U16:   return 4;
3230		case FORMAT_R32F:           return 1;
3231		case FORMAT_G32R32F:        return 2;
3232		case FORMAT_X32B32G32R32F:  return 3;
3233		case FORMAT_A32B32G32R32F:  return 4;
3234		case FORMAT_X32B32G32R32F_UNSIGNED: return 3;
3235		case FORMAT_D32F:           return 1;
3236		case FORMAT_D32FS8:         return 1;
3237		case FORMAT_D32F_LOCKABLE:  return 1;
3238		case FORMAT_D32FS8_TEXTURE: return 1;
3239		case FORMAT_D32F_SHADOW:    return 1;
3240		case FORMAT_D32FS8_SHADOW:  return 1;
3241		case FORMAT_A8:             return 1;
3242		case FORMAT_R8I:            return 1;
3243		case FORMAT_R8:             return 1;
3244		case FORMAT_R16I:           return 1;
3245		case FORMAT_R16UI:          return 1;
3246		case FORMAT_R32I:           return 1;
3247		case FORMAT_R32UI:          return 1;
3248		case FORMAT_L8:             return 1;
3249		case FORMAT_L16:            return 1;
3250		case FORMAT_A8L8:           return 2;
3251		case FORMAT_YV12_BT601:     return 3;
3252		case FORMAT_YV12_BT709:     return 3;
3253		case FORMAT_YV12_JFIF:      return 3;
3254		default:
3255			ASSERT(false);
3256		}
3257
3258		return 1;
3259	}
3260
3261	void *Surface::allocateBuffer(int width, int height, int depth, int border, int samples, Format format)
3262	{
3263		// Render targets require 2x2 quads
3264		int width2 = (width + 1) & ~1;
3265		int height2 = (height + 1) & ~1;
3266
3267		// FIXME: Unpacking byte4 to short4 in the sampler currently involves reading 8 bytes,
3268		// and stencil operations also read 8 bytes per four 8-bit stencil values,
3269		// so we have to allocate 4 extra bytes to avoid buffer overruns.
3270		return allocate(size(width2, height2, depth, border, samples, format) + 4);
3271	}
3272
3273	void Surface::memfill4(void *buffer, int pattern, int bytes)
3274	{
3275		while((size_t)buffer & 0x1 && bytes >= 1)
3276		{
3277			*(char*)buffer = (char)pattern;
3278			(char*&)buffer += 1;
3279			bytes -= 1;
3280		}
3281
3282		while((size_t)buffer & 0x3 && bytes >= 2)
3283		{
3284			*(short*)buffer = (short)pattern;
3285			(short*&)buffer += 1;
3286			bytes -= 2;
3287		}
3288
3289		#if defined(__i386__) || defined(__x86_64__)
3290			if(CPUID::supportsSSE())
3291			{
3292				while((size_t)buffer & 0xF && bytes >= 4)
3293				{
3294					*(int*)buffer = pattern;
3295					(int*&)buffer += 1;
3296					bytes -= 4;
3297				}
3298
3299				__m128 quad = _mm_set_ps1((float&)pattern);
3300
3301				float *pointer = (float*)buffer;
3302				int qxwords = bytes / 64;
3303				bytes -= qxwords * 64;
3304
3305				while(qxwords--)
3306				{
3307					_mm_stream_ps(pointer + 0, quad);
3308					_mm_stream_ps(pointer + 4, quad);
3309					_mm_stream_ps(pointer + 8, quad);
3310					_mm_stream_ps(pointer + 12, quad);
3311
3312					pointer += 16;
3313				}
3314
3315				buffer = pointer;
3316			}
3317		#endif
3318
3319		while(bytes >= 4)
3320		{
3321			*(int*)buffer = (int)pattern;
3322			(int*&)buffer += 1;
3323			bytes -= 4;
3324		}
3325
3326		while(bytes >= 2)
3327		{
3328			*(short*)buffer = (short)pattern;
3329			(short*&)buffer += 1;
3330			bytes -= 2;
3331		}
3332
3333		while(bytes >= 1)
3334		{
3335			*(char*)buffer = (char)pattern;
3336			(char*&)buffer += 1;
3337			bytes -= 1;
3338		}
3339	}
3340
3341	void Surface::sync()
3342	{
3343		resource->lock(EXCLUSIVE);
3344		resource->unlock();
3345	}
3346
3347	bool Surface::isEntire(const Rect& rect) const
3348	{
3349		return (rect.x0 == 0 && rect.y0 == 0 && rect.x1 == internal.width && rect.y1 == internal.height && internal.depth == 1);
3350	}
3351
3352	Rect Surface::getRect() const
3353	{
3354		return Rect(0, 0, internal.width, internal.height);
3355	}
3356
3357	void Surface::clearDepth(float depth, int x0, int y0, int width, int height)
3358	{
3359		if(width == 0 || height == 0) return;
3360
3361		// Not overlapping
3362		if(x0 > internal.width) return;
3363		if(y0 > internal.height) return;
3364		if(x0 + width < 0) return;
3365		if(y0 + height < 0) return;
3366
3367		// Clip against dimensions
3368		if(x0 < 0) {width += x0; x0 = 0;}
3369		if(x0 + width > internal.width) width = internal.width - x0;
3370		if(y0 < 0) {height += y0; y0 = 0;}
3371		if(y0 + height > internal.height) height = internal.height - y0;
3372
3373		const bool entire = x0 == 0 && y0 == 0 && width == internal.width && height == internal.height;
3374		const Lock lock = entire ? LOCK_DISCARD : LOCK_WRITEONLY;
3375
3376		int x1 = x0 + width;
3377		int y1 = y0 + height;
3378
3379		if(!hasQuadLayout(internal.format))
3380		{
3381			float *target = (float*)lockInternal(x0, y0, 0, lock, PUBLIC);
3382
3383			for(int z = 0; z < internal.samples; z++)
3384			{
3385				float *row = target;
3386				for(int y = y0; y < y1; y++)
3387				{
3388					memfill4(row, (int&)depth, width * sizeof(float));
3389					row += internal.pitchP;
3390				}
3391				target += internal.sliceP;
3392			}
3393
3394			unlockInternal();
3395		}
3396		else   // Quad layout
3397		{
3398			if(complementaryDepthBuffer)
3399			{
3400				depth = 1 - depth;
3401			}
3402
3403			float *buffer = (float*)lockInternal(0, 0, 0, lock, PUBLIC);
3404
3405			int oddX0 = (x0 & ~1) * 2 + (x0 & 1);
3406			int oddX1 = (x1 & ~1) * 2;
3407			int evenX0 = ((x0 + 1) & ~1) * 2;
3408			int evenBytes = (oddX1 - evenX0) * sizeof(float);
3409
3410			for(int z = 0; z < internal.samples; z++)
3411			{
3412				for(int y = y0; y < y1; y++)
3413				{
3414					float *target = buffer + (y & ~1) * internal.pitchP + (y & 1) * 2;
3415
3416					if((y & 1) == 0 && y + 1 < y1)   // Fill quad line at once
3417					{
3418						if((x0 & 1) != 0)
3419						{
3420							target[oddX0 + 0] = depth;
3421							target[oddX0 + 2] = depth;
3422						}
3423
3424					//	for(int x2 = evenX0; x2 < x1 * 2; x2 += 4)
3425					//	{
3426					//		target[x2 + 0] = depth;
3427					//		target[x2 + 1] = depth;
3428					//		target[x2 + 2] = depth;
3429					//		target[x2 + 3] = depth;
3430					//	}
3431
3432					//	__asm
3433					//	{
3434					//		movss xmm0, depth
3435					//		shufps xmm0, xmm0, 0x00
3436					//
3437					//		mov eax, x0
3438					//		add eax, 1
3439					//		and eax, 0xFFFFFFFE
3440					//		cmp eax, x1
3441					//		jge qEnd
3442					//
3443					//		mov edi, target
3444					//
3445					//	qLoop:
3446					//		movntps [edi+8*eax], xmm0
3447					//
3448					//		add eax, 2
3449					//		cmp eax, x1
3450					//		jl qLoop
3451					//	qEnd:
3452					//	}
3453
3454						memfill4(&target[evenX0], (int&)depth, evenBytes);
3455
3456						if((x1 & 1) != 0)
3457						{
3458							target[oddX1 + 0] = depth;
3459							target[oddX1 + 2] = depth;
3460						}
3461
3462						y++;
3463					}
3464					else
3465					{
3466						for(int x = x0, i = oddX0; x < x1; x++, i = (x & ~1) * 2 + (x & 1))
3467						{
3468							target[i] = depth;
3469						}
3470					}
3471				}
3472
3473				buffer += internal.sliceP;
3474			}
3475
3476			unlockInternal();
3477		}
3478	}
3479
3480	void Surface::clearStencil(unsigned char s, unsigned char mask, int x0, int y0, int width, int height)
3481	{
3482		if(mask == 0 || width == 0 || height == 0) return;
3483
3484		// Not overlapping
3485		if(x0 > internal.width) return;
3486		if(y0 > internal.height) return;
3487		if(x0 + width < 0) return;
3488		if(y0 + height < 0) return;
3489
3490		// Clip against dimensions
3491		if(x0 < 0) {width += x0; x0 = 0;}
3492		if(x0 + width > internal.width) width = internal.width - x0;
3493		if(y0 < 0) {height += y0; y0 = 0;}
3494		if(y0 + height > internal.height) height = internal.height - y0;
3495
3496		int x1 = x0 + width;
3497		int y1 = y0 + height;
3498
3499		int oddX0 = (x0 & ~1) * 2 + (x0 & 1);
3500		int oddX1 = (x1 & ~1) * 2;
3501		int evenX0 = ((x0 + 1) & ~1) * 2;
3502		int evenBytes = oddX1 - evenX0;
3503
3504		unsigned char maskedS = s & mask;
3505		unsigned char invMask = ~mask;
3506		unsigned int fill = maskedS;
3507		fill = fill | (fill << 8) | (fill << 16) | (fill << 24);
3508
3509		char *buffer = (char*)lockStencil(0, 0, 0, PUBLIC);
3510
3511		// Stencil buffers are assumed to use quad layout
3512		for(int z = 0; z < stencil.samples; z++)
3513		{
3514			for(int y = y0; y < y1; y++)
3515			{
3516				char *target = buffer + (y & ~1) * stencil.pitchP + (y & 1) * 2;
3517
3518				if((y & 1) == 0 && y + 1 < y1 && mask == 0xFF)   // Fill quad line at once
3519				{
3520					if((x0 & 1) != 0)
3521					{
3522						target[oddX0 + 0] = fill;
3523						target[oddX0 + 2] = fill;
3524					}
3525
3526					memfill4(&target[evenX0], fill, evenBytes);
3527
3528					if((x1 & 1) != 0)
3529					{
3530						target[oddX1 + 0] = fill;
3531						target[oddX1 + 2] = fill;
3532					}
3533
3534					y++;
3535				}
3536				else
3537				{
3538					for(int x = x0; x < x1; x++)
3539					{
3540						int i = (x & ~1) * 2 + (x & 1);
3541						target[i] = maskedS | (target[i] & invMask);
3542					}
3543				}
3544			}
3545
3546			buffer += stencil.sliceP;
3547		}
3548
3549		unlockStencil();
3550	}
3551
3552	void Surface::fill(const Color<float> &color, int x0, int y0, int width, int height)
3553	{
3554		unsigned char *row;
3555		Buffer *buffer;
3556
3557		if(internal.dirty)
3558		{
3559			row = (unsigned char*)lockInternal(x0, y0, 0, LOCK_WRITEONLY, PUBLIC);
3560			buffer = &internal;
3561		}
3562		else
3563		{
3564			row = (unsigned char*)lockExternal(x0, y0, 0, LOCK_WRITEONLY, PUBLIC);
3565			buffer = &external;
3566		}
3567
3568		if(buffer->bytes <= 4)
3569		{
3570			int c;
3571			buffer->write(&c, color);
3572
3573			if(buffer->bytes <= 1) c = (c << 8)  | c;
3574			if(buffer->bytes <= 2) c = (c << 16) | c;
3575
3576			for(int y = 0; y < height; y++)
3577			{
3578				memfill4(row, c, width * buffer->bytes);
3579
3580				row += buffer->pitchB;
3581			}
3582		}
3583		else   // Generic
3584		{
3585			for(int y = 0; y < height; y++)
3586			{
3587				unsigned char *element = row;
3588
3589				for(int x = 0; x < width; x++)
3590				{
3591					buffer->write(element, color);
3592
3593					element += buffer->bytes;
3594				}
3595
3596				row += buffer->pitchB;
3597			}
3598		}
3599
3600		if(buffer == &internal)
3601		{
3602			unlockInternal();
3603		}
3604		else
3605		{
3606			unlockExternal();
3607		}
3608	}
3609
3610	void Surface::copyInternal(const Surface *source, int x, int y, float srcX, float srcY, bool filter)
3611	{
3612		ASSERT(internal.lock != LOCK_UNLOCKED && source && source->internal.lock != LOCK_UNLOCKED);
3613
3614		sw::Color<float> color;
3615
3616		if(!filter)
3617		{
3618			color = source->internal.read((int)srcX, (int)srcY, 0);
3619		}
3620		else   // Bilinear filtering
3621		{
3622			color = source->internal.sample(srcX, srcY, 0);
3623		}
3624
3625		internal.write(x, y, color);
3626	}
3627
3628	void Surface::copyInternal(const Surface *source, int x, int y, int z, float srcX, float srcY, float srcZ, bool filter)
3629	{
3630		ASSERT(internal.lock != LOCK_UNLOCKED && source && source->internal.lock != LOCK_UNLOCKED);
3631
3632		sw::Color<float> color;
3633
3634		if(!filter)
3635		{
3636			color = source->internal.read((int)srcX, (int)srcY, int(srcZ));
3637		}
3638		else   // Bilinear filtering
3639		{
3640			color = source->internal.sample(srcX, srcY, srcZ);
3641		}
3642
3643		internal.write(x, y, z, color);
3644	}
3645
3646	void Surface::copyCubeEdge(Edge dstEdge, Surface *src, Edge srcEdge)
3647	{
3648		Surface *dst = this;
3649
3650		// Figure out if the edges to be copied in reverse order respectively from one another
3651		// The copy should be reversed whenever the same edges are contiguous or if we're
3652		// copying top <-> right or bottom <-> left. This is explained by the layout, which is:
3653		//
3654		//      | +y |
3655		// | -x | +z | +x | -z |
3656		//      | -y |
3657
3658		bool reverse = (srcEdge == dstEdge) ||
3659		               ((srcEdge == TOP) && (dstEdge == RIGHT)) ||
3660		               ((srcEdge == RIGHT) && (dstEdge == TOP)) ||
3661		               ((srcEdge == BOTTOM) && (dstEdge == LEFT)) ||
3662		               ((srcEdge == LEFT) && (dstEdge == BOTTOM));
3663
3664		int srcBytes = src->bytes(src->Surface::getInternalFormat());
3665		int srcPitch = src->getInternalPitchB();
3666		int dstBytes = dst->bytes(dst->Surface::getInternalFormat());
3667		int dstPitch = dst->getInternalPitchB();
3668
3669		int srcW = src->getWidth();
3670		int srcH = src->getHeight();
3671		int dstW = dst->getWidth();
3672		int dstH = dst->getHeight();
3673
3674		ASSERT(srcW == srcH && dstW == dstH && srcW == dstW && srcBytes == dstBytes);
3675
3676		// Src is expressed in the regular [0, width-1], [0, height-1] space
3677		int srcDelta = ((srcEdge == TOP) || (srcEdge == BOTTOM)) ? srcBytes : srcPitch;
3678		int srcStart = ((srcEdge == BOTTOM) ? srcPitch * (srcH - 1) : ((srcEdge == RIGHT) ? srcBytes * (srcW - 1) : 0));
3679
3680		// Dst contains borders, so it is expressed in the [-1, width+1], [-1, height+1] space
3681		int dstDelta = (((dstEdge == TOP) || (dstEdge == BOTTOM)) ? dstBytes : dstPitch) * (reverse ? -1 : 1);
3682		int dstStart = ((dstEdge == BOTTOM) ? dstPitch * (dstH + 1) : ((dstEdge == RIGHT) ? dstBytes * (dstW + 1) : 0)) + (reverse ? dstW * -dstDelta : dstDelta);
3683
3684		char *srcBuf = (char*)src->lockInternal(0, 0, 0, sw::LOCK_READONLY, sw::PRIVATE) + srcStart;
3685		char *dstBuf = (char*)dst->lockInternal(-1, -1, 0, sw::LOCK_READWRITE, sw::PRIVATE) + dstStart;
3686
3687		for(int i = 0; i < srcW; ++i, dstBuf += dstDelta, srcBuf += srcDelta)
3688		{
3689			memcpy(dstBuf, srcBuf, srcBytes);
3690		}
3691
3692		if(dstEdge == LEFT || dstEdge == RIGHT)
3693		{
3694			// TOP and BOTTOM are already set, let's average out the corners
3695			int x0 = (dstEdge == RIGHT) ? dstW : -1;
3696			int y0 = -1;
3697			int x1 = (dstEdge == RIGHT) ? dstW - 1 : 0;
3698			int y1 = 0;
3699			dst->computeCubeCorner(x0, y0, x1, y1);
3700			y0 = dstH;
3701			y1 = dstH - 1;
3702			dst->computeCubeCorner(x0, y0, x1, y1);
3703		}
3704
3705		src->unlockInternal();
3706		dst->unlockInternal();
3707	}
3708
3709	void Surface::computeCubeCorner(int x0, int y0, int x1, int y1)
3710	{
3711		ASSERT(internal.lock != LOCK_UNLOCKED);
3712
3713		sw::Color<float> color = internal.read(x0, y1);
3714		color += internal.read(x1, y0);
3715		color += internal.read(x1, y1);
3716		color *= (1.0f / 3.0f);
3717
3718		internal.write(x0, y0, color);
3719	}
3720
3721	bool Surface::hasStencil() const
3722	{
3723		return isStencil(external.format);
3724	}
3725
3726	bool Surface::hasDepth() const
3727	{
3728		return isDepth(external.format);
3729	}
3730
3731	bool Surface::hasPalette() const
3732	{
3733		return isPalette(external.format);
3734	}
3735
3736	bool Surface::isRenderTarget() const
3737	{
3738		return renderTarget;
3739	}
3740
3741	bool Surface::hasDirtyContents() const
3742	{
3743		return dirtyContents;
3744	}
3745
3746	void Surface::markContentsClean()
3747	{
3748		dirtyContents = false;
3749	}
3750
3751	Resource *Surface::getResource()
3752	{
3753		return resource;
3754	}
3755
3756	bool Surface::identicalFormats() const
3757	{
3758		return external.format == internal.format &&
3759		       external.width  == internal.width &&
3760		       external.height == internal.height &&
3761		       external.depth  == internal.depth &&
3762		       external.pitchB == internal.pitchB &&
3763		       external.sliceB == internal.sliceB &&
3764		       external.border == internal.border &&
3765		       external.samples == internal.samples;
3766	}
3767
3768	Format Surface::selectInternalFormat(Format format) const
3769	{
3770		switch(format)
3771		{
3772		case FORMAT_NULL:
3773			return FORMAT_NULL;
3774		case FORMAT_P8:
3775		case FORMAT_A8P8:
3776		case FORMAT_A4R4G4B4:
3777		case FORMAT_A1R5G5B5:
3778		case FORMAT_A8R3G3B2:
3779			return FORMAT_A8R8G8B8;
3780		case FORMAT_A8:
3781			return FORMAT_A8;
3782		case FORMAT_R8I:
3783			return FORMAT_R8I;
3784		case FORMAT_R8UI:
3785			return FORMAT_R8UI;
3786		case FORMAT_R8_SNORM:
3787			return FORMAT_R8_SNORM;
3788		case FORMAT_R8:
3789			return FORMAT_R8;
3790		case FORMAT_R16I:
3791			return FORMAT_R16I;
3792		case FORMAT_R16UI:
3793			return FORMAT_R16UI;
3794		case FORMAT_R32I:
3795			return FORMAT_R32I;
3796		case FORMAT_R32UI:
3797			return FORMAT_R32UI;
3798		case FORMAT_X16B16G16R16I:
3799			return FORMAT_X16B16G16R16I;
3800		case FORMAT_A16B16G16R16I:
3801			return FORMAT_A16B16G16R16I;
3802		case FORMAT_X16B16G16R16UI:
3803			return FORMAT_X16B16G16R16UI;
3804		case FORMAT_A16B16G16R16UI:
3805			return FORMAT_A16B16G16R16UI;
3806		case FORMAT_A2R10G10B10:
3807		case FORMAT_A2B10G10R10:
3808		case FORMAT_A16B16G16R16:
3809			return FORMAT_A16B16G16R16;
3810		case FORMAT_A2B10G10R10UI:
3811			return FORMAT_A16B16G16R16UI;
3812		case FORMAT_X32B32G32R32I:
3813			return FORMAT_X32B32G32R32I;
3814		case FORMAT_A32B32G32R32I:
3815			return FORMAT_A32B32G32R32I;
3816		case FORMAT_X32B32G32R32UI:
3817			return FORMAT_X32B32G32R32UI;
3818		case FORMAT_A32B32G32R32UI:
3819			return FORMAT_A32B32G32R32UI;
3820		case FORMAT_G8R8I:
3821			return FORMAT_G8R8I;
3822		case FORMAT_G8R8UI:
3823			return FORMAT_G8R8UI;
3824		case FORMAT_G8R8_SNORM:
3825			return FORMAT_G8R8_SNORM;
3826		case FORMAT_G8R8:
3827			return FORMAT_G8R8;
3828		case FORMAT_G16R16I:
3829			return FORMAT_G16R16I;
3830		case FORMAT_G16R16UI:
3831			return FORMAT_G16R16UI;
3832		case FORMAT_G16R16:
3833			return FORMAT_G16R16;
3834		case FORMAT_G32R32I:
3835			return FORMAT_G32R32I;
3836		case FORMAT_G32R32UI:
3837			return FORMAT_G32R32UI;
3838		case FORMAT_A8R8G8B8:
3839			if(lockable || !quadLayoutEnabled)
3840			{
3841				return FORMAT_A8R8G8B8;
3842			}
3843			else
3844			{
3845				return FORMAT_A8G8R8B8Q;
3846			}
3847		case FORMAT_A8B8G8R8I:
3848			return FORMAT_A8B8G8R8I;
3849		case FORMAT_A8B8G8R8UI:
3850			return FORMAT_A8B8G8R8UI;
3851		case FORMAT_A8B8G8R8_SNORM:
3852			return FORMAT_A8B8G8R8_SNORM;
3853		case FORMAT_R5G5B5A1:
3854		case FORMAT_R4G4B4A4:
3855		case FORMAT_A8B8G8R8:
3856			return FORMAT_A8B8G8R8;
3857		case FORMAT_R5G6B5:
3858			return FORMAT_R5G6B5;
3859		case FORMAT_R3G3B2:
3860		case FORMAT_R8G8B8:
3861		case FORMAT_X4R4G4B4:
3862		case FORMAT_X1R5G5B5:
3863		case FORMAT_X8R8G8B8:
3864			if(lockable || !quadLayoutEnabled)
3865			{
3866				return FORMAT_X8R8G8B8;
3867			}
3868			else
3869			{
3870				return FORMAT_X8G8R8B8Q;
3871			}
3872		case FORMAT_X8B8G8R8I:
3873			return FORMAT_X8B8G8R8I;
3874		case FORMAT_X8B8G8R8UI:
3875			return FORMAT_X8B8G8R8UI;
3876		case FORMAT_X8B8G8R8_SNORM:
3877			return FORMAT_X8B8G8R8_SNORM;
3878		case FORMAT_B8G8R8:
3879		case FORMAT_X8B8G8R8:
3880			return FORMAT_X8B8G8R8;
3881		case FORMAT_SRGB8_X8:
3882			return FORMAT_SRGB8_X8;
3883		case FORMAT_SRGB8_A8:
3884			return FORMAT_SRGB8_A8;
3885		// Compressed formats
3886		case FORMAT_DXT1:
3887		case FORMAT_DXT3:
3888		case FORMAT_DXT5:
3889		case FORMAT_RGB8_PUNCHTHROUGH_ALPHA1_ETC2:
3890		case FORMAT_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2:
3891		case FORMAT_RGBA8_ETC2_EAC:
3892		case FORMAT_SRGB8_ALPHA8_ETC2_EAC:
3893		case FORMAT_SRGB8_ALPHA8_ASTC_4x4_KHR:
3894		case FORMAT_SRGB8_ALPHA8_ASTC_5x4_KHR:
3895		case FORMAT_SRGB8_ALPHA8_ASTC_5x5_KHR:
3896		case FORMAT_SRGB8_ALPHA8_ASTC_6x5_KHR:
3897		case FORMAT_SRGB8_ALPHA8_ASTC_6x6_KHR:
3898		case FORMAT_SRGB8_ALPHA8_ASTC_8x5_KHR:
3899		case FORMAT_SRGB8_ALPHA8_ASTC_8x6_KHR:
3900		case FORMAT_SRGB8_ALPHA8_ASTC_8x8_KHR:
3901		case FORMAT_SRGB8_ALPHA8_ASTC_10x5_KHR:
3902		case FORMAT_SRGB8_ALPHA8_ASTC_10x6_KHR:
3903		case FORMAT_SRGB8_ALPHA8_ASTC_10x8_KHR:
3904		case FORMAT_SRGB8_ALPHA8_ASTC_10x10_KHR:
3905		case FORMAT_SRGB8_ALPHA8_ASTC_12x10_KHR:
3906		case FORMAT_SRGB8_ALPHA8_ASTC_12x12_KHR:
3907			return FORMAT_A8R8G8B8;
3908		case FORMAT_RGBA_ASTC_4x4_KHR:
3909		case FORMAT_RGBA_ASTC_5x4_KHR:
3910		case FORMAT_RGBA_ASTC_5x5_KHR:
3911		case FORMAT_RGBA_ASTC_6x5_KHR:
3912		case FORMAT_RGBA_ASTC_6x6_KHR:
3913		case FORMAT_RGBA_ASTC_8x5_KHR:
3914		case FORMAT_RGBA_ASTC_8x6_KHR:
3915		case FORMAT_RGBA_ASTC_8x8_KHR:
3916		case FORMAT_RGBA_ASTC_10x5_KHR:
3917		case FORMAT_RGBA_ASTC_10x6_KHR:
3918		case FORMAT_RGBA_ASTC_10x8_KHR:
3919		case FORMAT_RGBA_ASTC_10x10_KHR:
3920		case FORMAT_RGBA_ASTC_12x10_KHR:
3921		case FORMAT_RGBA_ASTC_12x12_KHR:
3922			// ASTC supports HDR, so a floating point format is required to represent it properly
3923			return FORMAT_A32B32G32R32F; // FIXME: 16FP is probably sufficient, but it's currently unsupported
3924		case FORMAT_ATI1:
3925			return FORMAT_R8;
3926		case FORMAT_R11_EAC:
3927		case FORMAT_SIGNED_R11_EAC:
3928			return FORMAT_R32F; // FIXME: Signed 8bit format would be sufficient
3929		case FORMAT_ATI2:
3930			return FORMAT_G8R8;
3931		case FORMAT_RG11_EAC:
3932		case FORMAT_SIGNED_RG11_EAC:
3933			return FORMAT_G32R32F; // FIXME: Signed 8bit format would be sufficient
3934		case FORMAT_ETC1:
3935		case FORMAT_RGB8_ETC2:
3936		case FORMAT_SRGB8_ETC2:
3937			return FORMAT_X8R8G8B8;
3938		// Bumpmap formats
3939		case FORMAT_V8U8:			return FORMAT_V8U8;
3940		case FORMAT_L6V5U5:			return FORMAT_X8L8V8U8;
3941		case FORMAT_Q8W8V8U8:		return FORMAT_Q8W8V8U8;
3942		case FORMAT_X8L8V8U8:		return FORMAT_X8L8V8U8;
3943		case FORMAT_V16U16:			return FORMAT_V16U16;
3944		case FORMAT_A2W10V10U10:	return FORMAT_A16W16V16U16;
3945		case FORMAT_Q16W16V16U16:	return FORMAT_Q16W16V16U16;
3946		// Floating-point formats
3947		case FORMAT_A16F:			return FORMAT_A32B32G32R32F;
3948		case FORMAT_R16F:			return FORMAT_R32F;
3949		case FORMAT_G16R16F:		return FORMAT_G32R32F;
3950		case FORMAT_B16G16R16F:     return FORMAT_X32B32G32R32F;
3951		case FORMAT_X16B16G16R16F:	return FORMAT_X32B32G32R32F;
3952		case FORMAT_A16B16G16R16F:	return FORMAT_A32B32G32R32F;
3953		case FORMAT_X16B16G16R16F_UNSIGNED: return FORMAT_X32B32G32R32F_UNSIGNED;
3954		case FORMAT_A32F:			return FORMAT_A32B32G32R32F;
3955		case FORMAT_R32F:			return FORMAT_R32F;
3956		case FORMAT_G32R32F:		return FORMAT_G32R32F;
3957		case FORMAT_B32G32R32F:     return FORMAT_X32B32G32R32F;
3958		case FORMAT_X32B32G32R32F:  return FORMAT_X32B32G32R32F;
3959		case FORMAT_A32B32G32R32F:	return FORMAT_A32B32G32R32F;
3960		case FORMAT_X32B32G32R32F_UNSIGNED: return FORMAT_X32B32G32R32F_UNSIGNED;
3961		// Luminance formats
3962		case FORMAT_L8:				return FORMAT_L8;
3963		case FORMAT_A4L4:			return FORMAT_A8L8;
3964		case FORMAT_L16:			return FORMAT_L16;
3965		case FORMAT_A8L8:			return FORMAT_A8L8;
3966		case FORMAT_L16F:           return FORMAT_X32B32G32R32F;
3967		case FORMAT_A16L16F:        return FORMAT_A32B32G32R32F;
3968		case FORMAT_L32F:           return FORMAT_X32B32G32R32F;
3969		case FORMAT_A32L32F:        return FORMAT_A32B32G32R32F;
3970		// Depth/stencil formats
3971		case FORMAT_D16:
3972		case FORMAT_D32:
3973		case FORMAT_D24X8:
3974			if(hasParent)   // Texture
3975			{
3976				return FORMAT_D32F_SHADOW;
3977			}
3978			else if(complementaryDepthBuffer)
3979			{
3980				return FORMAT_D32F_COMPLEMENTARY;
3981			}
3982			else
3983			{
3984				return FORMAT_D32F;
3985			}
3986		case FORMAT_D24S8:
3987		case FORMAT_D24FS8:
3988			if(hasParent)   // Texture
3989			{
3990				return FORMAT_D32FS8_SHADOW;
3991			}
3992			else if(complementaryDepthBuffer)
3993			{
3994				return FORMAT_D32FS8_COMPLEMENTARY;
3995			}
3996			else
3997			{
3998				return FORMAT_D32FS8;
3999			}
4000		case FORMAT_D32F:           return FORMAT_D32F;
4001		case FORMAT_D32FS8:         return FORMAT_D32FS8;
4002		case FORMAT_D32F_LOCKABLE:  return FORMAT_D32F_LOCKABLE;
4003		case FORMAT_D32FS8_TEXTURE: return FORMAT_D32FS8_TEXTURE;
4004		case FORMAT_INTZ:           return FORMAT_D32FS8_TEXTURE;
4005		case FORMAT_DF24S8:         return FORMAT_D32FS8_SHADOW;
4006		case FORMAT_DF16S8:         return FORMAT_D32FS8_SHADOW;
4007		case FORMAT_S8:             return FORMAT_S8;
4008		// YUV formats
4009		case FORMAT_YV12_BT601:     return FORMAT_YV12_BT601;
4010		case FORMAT_YV12_BT709:     return FORMAT_YV12_BT709;
4011		case FORMAT_YV12_JFIF:      return FORMAT_YV12_JFIF;
4012		default:
4013			ASSERT(false);
4014		}
4015
4016		return FORMAT_NULL;
4017	}
4018
4019	void Surface::setTexturePalette(unsigned int *palette)
4020	{
4021		Surface::palette = palette;
4022		Surface::paletteID++;
4023	}
4024
4025	void Surface::resolve()
4026	{
4027		if(internal.samples <= 1 || !internal.dirty || !renderTarget || internal.format == FORMAT_NULL)
4028		{
4029			return;
4030		}
4031
4032		ASSERT(internal.depth == 1);  // Unimplemented
4033
4034		void *source = internal.lockRect(0, 0, 0, LOCK_READWRITE);
4035
4036		int width = internal.width;
4037		int height = internal.height;
4038		int pitch = internal.pitchB;
4039		int slice = internal.sliceB;
4040
4041		unsigned char *source0 = (unsigned char*)source;
4042		unsigned char *source1 = source0 + slice;
4043		unsigned char *source2 = source1 + slice;
4044		unsigned char *source3 = source2 + slice;
4045		unsigned char *source4 = source3 + slice;
4046		unsigned char *source5 = source4 + slice;
4047		unsigned char *source6 = source5 + slice;
4048		unsigned char *source7 = source6 + slice;
4049		unsigned char *source8 = source7 + slice;
4050		unsigned char *source9 = source8 + slice;
4051		unsigned char *sourceA = source9 + slice;
4052		unsigned char *sourceB = sourceA + slice;
4053		unsigned char *sourceC = sourceB + slice;
4054		unsigned char *sourceD = sourceC + slice;
4055		unsigned char *sourceE = sourceD + slice;
4056		unsigned char *sourceF = sourceE + slice;
4057
4058		if(internal.format == FORMAT_X8R8G8B8 || internal.format == FORMAT_A8R8G8B8 ||
4059		   internal.format == FORMAT_X8B8G8R8 || internal.format == FORMAT_A8B8G8R8 ||
4060		   internal.format == FORMAT_SRGB8_X8 || internal.format == FORMAT_SRGB8_A8)
4061		{
4062			#if defined(__i386__) || defined(__x86_64__)
4063				if(CPUID::supportsSSE2() && (width % 4) == 0)
4064				{
4065					if(internal.samples == 2)
4066					{
4067						for(int y = 0; y < height; y++)
4068						{
4069							for(int x = 0; x < width; x += 4)
4070							{
4071								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
4072								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
4073
4074								c0 = _mm_avg_epu8(c0, c1);
4075
4076								_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
4077							}
4078
4079							source0 += pitch;
4080							source1 += pitch;
4081						}
4082					}
4083					else if(internal.samples == 4)
4084					{
4085						for(int y = 0; y < height; y++)
4086						{
4087							for(int x = 0; x < width; x += 4)
4088							{
4089								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
4090								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
4091								__m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x));
4092								__m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x));
4093
4094								c0 = _mm_avg_epu8(c0, c1);
4095								c2 = _mm_avg_epu8(c2, c3);
4096								c0 = _mm_avg_epu8(c0, c2);
4097
4098								_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
4099							}
4100
4101							source0 += pitch;
4102							source1 += pitch;
4103							source2 += pitch;
4104							source3 += pitch;
4105						}
4106					}
4107					else if(internal.samples == 8)
4108					{
4109						for(int y = 0; y < height; y++)
4110						{
4111							for(int x = 0; x < width; x += 4)
4112							{
4113								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
4114								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
4115								__m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x));
4116								__m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x));
4117								__m128i c4 = _mm_load_si128((__m128i*)(source4 + 4 * x));
4118								__m128i c5 = _mm_load_si128((__m128i*)(source5 + 4 * x));
4119								__m128i c6 = _mm_load_si128((__m128i*)(source6 + 4 * x));
4120								__m128i c7 = _mm_load_si128((__m128i*)(source7 + 4 * x));
4121
4122								c0 = _mm_avg_epu8(c0, c1);
4123								c2 = _mm_avg_epu8(c2, c3);
4124								c4 = _mm_avg_epu8(c4, c5);
4125								c6 = _mm_avg_epu8(c6, c7);
4126								c0 = _mm_avg_epu8(c0, c2);
4127								c4 = _mm_avg_epu8(c4, c6);
4128								c0 = _mm_avg_epu8(c0, c4);
4129
4130								_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
4131							}
4132
4133							source0 += pitch;
4134							source1 += pitch;
4135							source2 += pitch;
4136							source3 += pitch;
4137							source4 += pitch;
4138							source5 += pitch;
4139							source6 += pitch;
4140							source7 += pitch;
4141						}
4142					}
4143					else if(internal.samples == 16)
4144					{
4145						for(int y = 0; y < height; y++)
4146						{
4147							for(int x = 0; x < width; x += 4)
4148							{
4149								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
4150								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
4151								__m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x));
4152								__m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x));
4153								__m128i c4 = _mm_load_si128((__m128i*)(source4 + 4 * x));
4154								__m128i c5 = _mm_load_si128((__m128i*)(source5 + 4 * x));
4155								__m128i c6 = _mm_load_si128((__m128i*)(source6 + 4 * x));
4156								__m128i c7 = _mm_load_si128((__m128i*)(source7 + 4 * x));
4157								__m128i c8 = _mm_load_si128((__m128i*)(source8 + 4 * x));
4158								__m128i c9 = _mm_load_si128((__m128i*)(source9 + 4 * x));
4159								__m128i cA = _mm_load_si128((__m128i*)(sourceA + 4 * x));
4160								__m128i cB = _mm_load_si128((__m128i*)(sourceB + 4 * x));
4161								__m128i cC = _mm_load_si128((__m128i*)(sourceC + 4 * x));
4162								__m128i cD = _mm_load_si128((__m128i*)(sourceD + 4 * x));
4163								__m128i cE = _mm_load_si128((__m128i*)(sourceE + 4 * x));
4164								__m128i cF = _mm_load_si128((__m128i*)(sourceF + 4 * x));
4165
4166								c0 = _mm_avg_epu8(c0, c1);
4167								c2 = _mm_avg_epu8(c2, c3);
4168								c4 = _mm_avg_epu8(c4, c5);
4169								c6 = _mm_avg_epu8(c6, c7);
4170								c8 = _mm_avg_epu8(c8, c9);
4171								cA = _mm_avg_epu8(cA, cB);
4172								cC = _mm_avg_epu8(cC, cD);
4173								cE = _mm_avg_epu8(cE, cF);
4174								c0 = _mm_avg_epu8(c0, c2);
4175								c4 = _mm_avg_epu8(c4, c6);
4176								c8 = _mm_avg_epu8(c8, cA);
4177								cC = _mm_avg_epu8(cC, cE);
4178								c0 = _mm_avg_epu8(c0, c4);
4179								c8 = _mm_avg_epu8(c8, cC);
4180								c0 = _mm_avg_epu8(c0, c8);
4181
4182								_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
4183							}
4184
4185							source0 += pitch;
4186							source1 += pitch;
4187							source2 += pitch;
4188							source3 += pitch;
4189							source4 += pitch;
4190							source5 += pitch;
4191							source6 += pitch;
4192							source7 += pitch;
4193							source8 += pitch;
4194							source9 += pitch;
4195							sourceA += pitch;
4196							sourceB += pitch;
4197							sourceC += pitch;
4198							sourceD += pitch;
4199							sourceE += pitch;
4200							sourceF += pitch;
4201						}
4202					}
4203					else ASSERT(false);
4204				}
4205				else
4206			#endif
4207			{
4208				#define AVERAGE(x, y) (((x) & (y)) + ((((x) ^ (y)) >> 1) & 0x7F7F7F7F) + (((x) ^ (y)) & 0x01010101))
4209
4210				if(internal.samples == 2)
4211				{
4212					for(int y = 0; y < height; y++)
4213					{
4214						for(int x = 0; x < width; x++)
4215						{
4216							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
4217							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
4218
4219							c0 = AVERAGE(c0, c1);
4220
4221							*(unsigned int*)(source0 + 4 * x) = c0;
4222						}
4223
4224						source0 += pitch;
4225						source1 += pitch;
4226					}
4227				}
4228				else if(internal.samples == 4)
4229				{
4230					for(int y = 0; y < height; y++)
4231					{
4232						for(int x = 0; x < width; x++)
4233						{
4234							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
4235							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
4236							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
4237							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
4238
4239							c0 = AVERAGE(c0, c1);
4240							c2 = AVERAGE(c2, c3);
4241							c0 = AVERAGE(c0, c2);
4242
4243							*(unsigned int*)(source0 + 4 * x) = c0;
4244						}
4245
4246						source0 += pitch;
4247						source1 += pitch;
4248						source2 += pitch;
4249						source3 += pitch;
4250					}
4251				}
4252				else if(internal.samples == 8)
4253				{
4254					for(int y = 0; y < height; y++)
4255					{
4256						for(int x = 0; x < width; x++)
4257						{
4258							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
4259							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
4260							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
4261							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
4262							unsigned int c4 = *(unsigned int*)(source4 + 4 * x);
4263							unsigned int c5 = *(unsigned int*)(source5 + 4 * x);
4264							unsigned int c6 = *(unsigned int*)(source6 + 4 * x);
4265							unsigned int c7 = *(unsigned int*)(source7 + 4 * x);
4266
4267							c0 = AVERAGE(c0, c1);
4268							c2 = AVERAGE(c2, c3);
4269							c4 = AVERAGE(c4, c5);
4270							c6 = AVERAGE(c6, c7);
4271							c0 = AVERAGE(c0, c2);
4272							c4 = AVERAGE(c4, c6);
4273							c0 = AVERAGE(c0, c4);
4274
4275							*(unsigned int*)(source0 + 4 * x) = c0;
4276						}
4277
4278						source0 += pitch;
4279						source1 += pitch;
4280						source2 += pitch;
4281						source3 += pitch;
4282						source4 += pitch;
4283						source5 += pitch;
4284						source6 += pitch;
4285						source7 += pitch;
4286					}
4287				}
4288				else if(internal.samples == 16)
4289				{
4290					for(int y = 0; y < height; y++)
4291					{
4292						for(int x = 0; x < width; x++)
4293						{
4294							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
4295							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
4296							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
4297							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
4298							unsigned int c4 = *(unsigned int*)(source4 + 4 * x);
4299							unsigned int c5 = *(unsigned int*)(source5 + 4 * x);
4300							unsigned int c6 = *(unsigned int*)(source6 + 4 * x);
4301							unsigned int c7 = *(unsigned int*)(source7 + 4 * x);
4302							unsigned int c8 = *(unsigned int*)(source8 + 4 * x);
4303							unsigned int c9 = *(unsigned int*)(source9 + 4 * x);
4304							unsigned int cA = *(unsigned int*)(sourceA + 4 * x);
4305							unsigned int cB = *(unsigned int*)(sourceB + 4 * x);
4306							unsigned int cC = *(unsigned int*)(sourceC + 4 * x);
4307							unsigned int cD = *(unsigned int*)(sourceD + 4 * x);
4308							unsigned int cE = *(unsigned int*)(sourceE + 4 * x);
4309							unsigned int cF = *(unsigned int*)(sourceF + 4 * x);
4310
4311							c0 = AVERAGE(c0, c1);
4312							c2 = AVERAGE(c2, c3);
4313							c4 = AVERAGE(c4, c5);
4314							c6 = AVERAGE(c6, c7);
4315							c8 = AVERAGE(c8, c9);
4316							cA = AVERAGE(cA, cB);
4317							cC = AVERAGE(cC, cD);
4318							cE = AVERAGE(cE, cF);
4319							c0 = AVERAGE(c0, c2);
4320							c4 = AVERAGE(c4, c6);
4321							c8 = AVERAGE(c8, cA);
4322							cC = AVERAGE(cC, cE);
4323							c0 = AVERAGE(c0, c4);
4324							c8 = AVERAGE(c8, cC);
4325							c0 = AVERAGE(c0, c8);
4326
4327							*(unsigned int*)(source0 + 4 * x) = c0;
4328						}
4329
4330						source0 += pitch;
4331						source1 += pitch;
4332						source2 += pitch;
4333						source3 += pitch;
4334						source4 += pitch;
4335						source5 += pitch;
4336						source6 += pitch;
4337						source7 += pitch;
4338						source8 += pitch;
4339						source9 += pitch;
4340						sourceA += pitch;
4341						sourceB += pitch;
4342						sourceC += pitch;
4343						sourceD += pitch;
4344						sourceE += pitch;
4345						sourceF += pitch;
4346					}
4347				}
4348				else ASSERT(false);
4349
4350				#undef AVERAGE
4351			}
4352		}
4353		else if(internal.format == FORMAT_G16R16)
4354		{
4355
4356			#if defined(__i386__) || defined(__x86_64__)
4357				if(CPUID::supportsSSE2() && (width % 4) == 0)
4358				{
4359					if(internal.samples == 2)
4360					{
4361						for(int y = 0; y < height; y++)
4362						{
4363							for(int x = 0; x < width; x += 4)
4364							{
4365								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
4366								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
4367
4368								c0 = _mm_avg_epu16(c0, c1);
4369
4370								_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
4371							}
4372
4373							source0 += pitch;
4374							source1 += pitch;
4375						}
4376					}
4377					else if(internal.samples == 4)
4378					{
4379						for(int y = 0; y < height; y++)
4380						{
4381							for(int x = 0; x < width; x += 4)
4382							{
4383								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
4384								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
4385								__m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x));
4386								__m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x));
4387
4388								c0 = _mm_avg_epu16(c0, c1);
4389								c2 = _mm_avg_epu16(c2, c3);
4390								c0 = _mm_avg_epu16(c0, c2);
4391
4392								_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
4393							}
4394
4395							source0 += pitch;
4396							source1 += pitch;
4397							source2 += pitch;
4398							source3 += pitch;
4399						}
4400					}
4401					else if(internal.samples == 8)
4402					{
4403						for(int y = 0; y < height; y++)
4404						{
4405							for(int x = 0; x < width; x += 4)
4406							{
4407								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
4408								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
4409								__m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x));
4410								__m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x));
4411								__m128i c4 = _mm_load_si128((__m128i*)(source4 + 4 * x));
4412								__m128i c5 = _mm_load_si128((__m128i*)(source5 + 4 * x));
4413								__m128i c6 = _mm_load_si128((__m128i*)(source6 + 4 * x));
4414								__m128i c7 = _mm_load_si128((__m128i*)(source7 + 4 * x));
4415
4416								c0 = _mm_avg_epu16(c0, c1);
4417								c2 = _mm_avg_epu16(c2, c3);
4418								c4 = _mm_avg_epu16(c4, c5);
4419								c6 = _mm_avg_epu16(c6, c7);
4420								c0 = _mm_avg_epu16(c0, c2);
4421								c4 = _mm_avg_epu16(c4, c6);
4422								c0 = _mm_avg_epu16(c0, c4);
4423
4424								_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
4425							}
4426
4427							source0 += pitch;
4428							source1 += pitch;
4429							source2 += pitch;
4430							source3 += pitch;
4431							source4 += pitch;
4432							source5 += pitch;
4433							source6 += pitch;
4434							source7 += pitch;
4435						}
4436					}
4437					else if(internal.samples == 16)
4438					{
4439						for(int y = 0; y < height; y++)
4440						{
4441							for(int x = 0; x < width; x += 4)
4442							{
4443								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
4444								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
4445								__m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x));
4446								__m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x));
4447								__m128i c4 = _mm_load_si128((__m128i*)(source4 + 4 * x));
4448								__m128i c5 = _mm_load_si128((__m128i*)(source5 + 4 * x));
4449								__m128i c6 = _mm_load_si128((__m128i*)(source6 + 4 * x));
4450								__m128i c7 = _mm_load_si128((__m128i*)(source7 + 4 * x));
4451								__m128i c8 = _mm_load_si128((__m128i*)(source8 + 4 * x));
4452								__m128i c9 = _mm_load_si128((__m128i*)(source9 + 4 * x));
4453								__m128i cA = _mm_load_si128((__m128i*)(sourceA + 4 * x));
4454								__m128i cB = _mm_load_si128((__m128i*)(sourceB + 4 * x));
4455								__m128i cC = _mm_load_si128((__m128i*)(sourceC + 4 * x));
4456								__m128i cD = _mm_load_si128((__m128i*)(sourceD + 4 * x));
4457								__m128i cE = _mm_load_si128((__m128i*)(sourceE + 4 * x));
4458								__m128i cF = _mm_load_si128((__m128i*)(sourceF + 4 * x));
4459
4460								c0 = _mm_avg_epu16(c0, c1);
4461								c2 = _mm_avg_epu16(c2, c3);
4462								c4 = _mm_avg_epu16(c4, c5);
4463								c6 = _mm_avg_epu16(c6, c7);
4464								c8 = _mm_avg_epu16(c8, c9);
4465								cA = _mm_avg_epu16(cA, cB);
4466								cC = _mm_avg_epu16(cC, cD);
4467								cE = _mm_avg_epu16(cE, cF);
4468								c0 = _mm_avg_epu16(c0, c2);
4469								c4 = _mm_avg_epu16(c4, c6);
4470								c8 = _mm_avg_epu16(c8, cA);
4471								cC = _mm_avg_epu16(cC, cE);
4472								c0 = _mm_avg_epu16(c0, c4);
4473								c8 = _mm_avg_epu16(c8, cC);
4474								c0 = _mm_avg_epu16(c0, c8);
4475
4476								_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
4477							}
4478
4479							source0 += pitch;
4480							source1 += pitch;
4481							source2 += pitch;
4482							source3 += pitch;
4483							source4 += pitch;
4484							source5 += pitch;
4485							source6 += pitch;
4486							source7 += pitch;
4487							source8 += pitch;
4488							source9 += pitch;
4489							sourceA += pitch;
4490							sourceB += pitch;
4491							sourceC += pitch;
4492							sourceD += pitch;
4493							sourceE += pitch;
4494							sourceF += pitch;
4495						}
4496					}
4497					else ASSERT(false);
4498				}
4499				else
4500			#endif
4501			{
4502				#define AVERAGE(x, y) (((x) & (y)) + ((((x) ^ (y)) >> 1) & 0x7FFF7FFF) + (((x) ^ (y)) & 0x00010001))
4503
4504				if(internal.samples == 2)
4505				{
4506					for(int y = 0; y < height; y++)
4507					{
4508						for(int x = 0; x < width; x++)
4509						{
4510							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
4511							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
4512
4513							c0 = AVERAGE(c0, c1);
4514
4515							*(unsigned int*)(source0 + 4 * x) = c0;
4516						}
4517
4518						source0 += pitch;
4519						source1 += pitch;
4520					}
4521				}
4522				else if(internal.samples == 4)
4523				{
4524					for(int y = 0; y < height; y++)
4525					{
4526						for(int x = 0; x < width; x++)
4527						{
4528							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
4529							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
4530							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
4531							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
4532
4533							c0 = AVERAGE(c0, c1);
4534							c2 = AVERAGE(c2, c3);
4535							c0 = AVERAGE(c0, c2);
4536
4537							*(unsigned int*)(source0 + 4 * x) = c0;
4538						}
4539
4540						source0 += pitch;
4541						source1 += pitch;
4542						source2 += pitch;
4543						source3 += pitch;
4544					}
4545				}
4546				else if(internal.samples == 8)
4547				{
4548					for(int y = 0; y < height; y++)
4549					{
4550						for(int x = 0; x < width; x++)
4551						{
4552							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
4553							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
4554							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
4555							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
4556							unsigned int c4 = *(unsigned int*)(source4 + 4 * x);
4557							unsigned int c5 = *(unsigned int*)(source5 + 4 * x);
4558							unsigned int c6 = *(unsigned int*)(source6 + 4 * x);
4559							unsigned int c7 = *(unsigned int*)(source7 + 4 * x);
4560
4561							c0 = AVERAGE(c0, c1);
4562							c2 = AVERAGE(c2, c3);
4563							c4 = AVERAGE(c4, c5);
4564							c6 = AVERAGE(c6, c7);
4565							c0 = AVERAGE(c0, c2);
4566							c4 = AVERAGE(c4, c6);
4567							c0 = AVERAGE(c0, c4);
4568
4569							*(unsigned int*)(source0 + 4 * x) = c0;
4570						}
4571
4572						source0 += pitch;
4573						source1 += pitch;
4574						source2 += pitch;
4575						source3 += pitch;
4576						source4 += pitch;
4577						source5 += pitch;
4578						source6 += pitch;
4579						source7 += pitch;
4580					}
4581				}
4582				else if(internal.samples == 16)
4583				{
4584					for(int y = 0; y < height; y++)
4585					{
4586						for(int x = 0; x < width; x++)
4587						{
4588							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
4589							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
4590							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
4591							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
4592							unsigned int c4 = *(unsigned int*)(source4 + 4 * x);
4593							unsigned int c5 = *(unsigned int*)(source5 + 4 * x);
4594							unsigned int c6 = *(unsigned int*)(source6 + 4 * x);
4595							unsigned int c7 = *(unsigned int*)(source7 + 4 * x);
4596							unsigned int c8 = *(unsigned int*)(source8 + 4 * x);
4597							unsigned int c9 = *(unsigned int*)(source9 + 4 * x);
4598							unsigned int cA = *(unsigned int*)(sourceA + 4 * x);
4599							unsigned int cB = *(unsigned int*)(sourceB + 4 * x);
4600							unsigned int cC = *(unsigned int*)(sourceC + 4 * x);
4601							unsigned int cD = *(unsigned int*)(sourceD + 4 * x);
4602							unsigned int cE = *(unsigned int*)(sourceE + 4 * x);
4603							unsigned int cF = *(unsigned int*)(sourceF + 4 * x);
4604
4605							c0 = AVERAGE(c0, c1);
4606							c2 = AVERAGE(c2, c3);
4607							c4 = AVERAGE(c4, c5);
4608							c6 = AVERAGE(c6, c7);
4609							c8 = AVERAGE(c8, c9);
4610							cA = AVERAGE(cA, cB);
4611							cC = AVERAGE(cC, cD);
4612							cE = AVERAGE(cE, cF);
4613							c0 = AVERAGE(c0, c2);
4614							c4 = AVERAGE(c4, c6);
4615							c8 = AVERAGE(c8, cA);
4616							cC = AVERAGE(cC, cE);
4617							c0 = AVERAGE(c0, c4);
4618							c8 = AVERAGE(c8, cC);
4619							c0 = AVERAGE(c0, c8);
4620
4621							*(unsigned int*)(source0 + 4 * x) = c0;
4622						}
4623
4624						source0 += pitch;
4625						source1 += pitch;
4626						source2 += pitch;
4627						source3 += pitch;
4628						source4 += pitch;
4629						source5 += pitch;
4630						source6 += pitch;
4631						source7 += pitch;
4632						source8 += pitch;
4633						source9 += pitch;
4634						sourceA += pitch;
4635						sourceB += pitch;
4636						sourceC += pitch;
4637						sourceD += pitch;
4638						sourceE += pitch;
4639						sourceF += pitch;
4640					}
4641				}
4642				else ASSERT(false);
4643
4644				#undef AVERAGE
4645			}
4646		}
4647		else if(internal.format == FORMAT_A16B16G16R16)
4648		{
4649			#if defined(__i386__) || defined(__x86_64__)
4650				if(CPUID::supportsSSE2() && (width % 2) == 0)
4651				{
4652					if(internal.samples == 2)
4653					{
4654						for(int y = 0; y < height; y++)
4655						{
4656							for(int x = 0; x < width; x += 2)
4657							{
4658								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 8 * x));
4659								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 8 * x));
4660
4661								c0 = _mm_avg_epu16(c0, c1);
4662
4663								_mm_store_si128((__m128i*)(source0 + 8 * x), c0);
4664							}
4665
4666							source0 += pitch;
4667							source1 += pitch;
4668						}
4669					}
4670					else if(internal.samples == 4)
4671					{
4672						for(int y = 0; y < height; y++)
4673						{
4674							for(int x = 0; x < width; x += 2)
4675							{
4676								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 8 * x));
4677								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 8 * x));
4678								__m128i c2 = _mm_load_si128((__m128i*)(source2 + 8 * x));
4679								__m128i c3 = _mm_load_si128((__m128i*)(source3 + 8 * x));
4680
4681								c0 = _mm_avg_epu16(c0, c1);
4682								c2 = _mm_avg_epu16(c2, c3);
4683								c0 = _mm_avg_epu16(c0, c2);
4684
4685								_mm_store_si128((__m128i*)(source0 + 8 * x), c0);
4686							}
4687
4688							source0 += pitch;
4689							source1 += pitch;
4690							source2 += pitch;
4691							source3 += pitch;
4692						}
4693					}
4694					else if(internal.samples == 8)
4695					{
4696						for(int y = 0; y < height; y++)
4697						{
4698							for(int x = 0; x < width; x += 2)
4699							{
4700								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 8 * x));
4701								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 8 * x));
4702								__m128i c2 = _mm_load_si128((__m128i*)(source2 + 8 * x));
4703								__m128i c3 = _mm_load_si128((__m128i*)(source3 + 8 * x));
4704								__m128i c4 = _mm_load_si128((__m128i*)(source4 + 8 * x));
4705								__m128i c5 = _mm_load_si128((__m128i*)(source5 + 8 * x));
4706								__m128i c6 = _mm_load_si128((__m128i*)(source6 + 8 * x));
4707								__m128i c7 = _mm_load_si128((__m128i*)(source7 + 8 * x));
4708
4709								c0 = _mm_avg_epu16(c0, c1);
4710								c2 = _mm_avg_epu16(c2, c3);
4711								c4 = _mm_avg_epu16(c4, c5);
4712								c6 = _mm_avg_epu16(c6, c7);
4713								c0 = _mm_avg_epu16(c0, c2);
4714								c4 = _mm_avg_epu16(c4, c6);
4715								c0 = _mm_avg_epu16(c0, c4);
4716
4717								_mm_store_si128((__m128i*)(source0 + 8 * x), c0);
4718							}
4719
4720							source0 += pitch;
4721							source1 += pitch;
4722							source2 += pitch;
4723							source3 += pitch;
4724							source4 += pitch;
4725							source5 += pitch;
4726							source6 += pitch;
4727							source7 += pitch;
4728						}
4729					}
4730					else if(internal.samples == 16)
4731					{
4732						for(int y = 0; y < height; y++)
4733						{
4734							for(int x = 0; x < width; x += 2)
4735							{
4736								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 8 * x));
4737								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 8 * x));
4738								__m128i c2 = _mm_load_si128((__m128i*)(source2 + 8 * x));
4739								__m128i c3 = _mm_load_si128((__m128i*)(source3 + 8 * x));
4740								__m128i c4 = _mm_load_si128((__m128i*)(source4 + 8 * x));
4741								__m128i c5 = _mm_load_si128((__m128i*)(source5 + 8 * x));
4742								__m128i c6 = _mm_load_si128((__m128i*)(source6 + 8 * x));
4743								__m128i c7 = _mm_load_si128((__m128i*)(source7 + 8 * x));
4744								__m128i c8 = _mm_load_si128((__m128i*)(source8 + 8 * x));
4745								__m128i c9 = _mm_load_si128((__m128i*)(source9 + 8 * x));
4746								__m128i cA = _mm_load_si128((__m128i*)(sourceA + 8 * x));
4747								__m128i cB = _mm_load_si128((__m128i*)(sourceB + 8 * x));
4748								__m128i cC = _mm_load_si128((__m128i*)(sourceC + 8 * x));
4749								__m128i cD = _mm_load_si128((__m128i*)(sourceD + 8 * x));
4750								__m128i cE = _mm_load_si128((__m128i*)(sourceE + 8 * x));
4751								__m128i cF = _mm_load_si128((__m128i*)(sourceF + 8 * x));
4752
4753								c0 = _mm_avg_epu16(c0, c1);
4754								c2 = _mm_avg_epu16(c2, c3);
4755								c4 = _mm_avg_epu16(c4, c5);
4756								c6 = _mm_avg_epu16(c6, c7);
4757								c8 = _mm_avg_epu16(c8, c9);
4758								cA = _mm_avg_epu16(cA, cB);
4759								cC = _mm_avg_epu16(cC, cD);
4760								cE = _mm_avg_epu16(cE, cF);
4761								c0 = _mm_avg_epu16(c0, c2);
4762								c4 = _mm_avg_epu16(c4, c6);
4763								c8 = _mm_avg_epu16(c8, cA);
4764								cC = _mm_avg_epu16(cC, cE);
4765								c0 = _mm_avg_epu16(c0, c4);
4766								c8 = _mm_avg_epu16(c8, cC);
4767								c0 = _mm_avg_epu16(c0, c8);
4768
4769								_mm_store_si128((__m128i*)(source0 + 8 * x), c0);
4770							}
4771
4772							source0 += pitch;
4773							source1 += pitch;
4774							source2 += pitch;
4775							source3 += pitch;
4776							source4 += pitch;
4777							source5 += pitch;
4778							source6 += pitch;
4779							source7 += pitch;
4780							source8 += pitch;
4781							source9 += pitch;
4782							sourceA += pitch;
4783							sourceB += pitch;
4784							sourceC += pitch;
4785							sourceD += pitch;
4786							sourceE += pitch;
4787							sourceF += pitch;
4788						}
4789					}
4790					else ASSERT(false);
4791				}
4792				else
4793			#endif
4794			{
4795				#define AVERAGE(x, y) (((x) & (y)) + ((((x) ^ (y)) >> 1) & 0x7FFF7FFF) + (((x) ^ (y)) & 0x00010001))  // Per-lane rounded-up average of two 16-bit values packed in one 32-bit word: (x & y) + ((x ^ y) >> 1) is the truncated average, the 0x7FFF7FFF mask keeps the upper lane's low bit from shifting into the lower lane, and adding (x ^ y) & 0x00010001 rounds each lane up, i.e. (a + b + 1) >> 1 as _mm_avg_epu16 does. Each argument is expanded several times, so pass only side-effect-free expressions.
4796
4797				if(internal.samples == 2)
4798				{
4799					for(int y = 0; y < height; y++)
4800					{
4801						for(int x = 0; x < 2 * width; x++)
4802						{
4803							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
4804							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
4805
4806							c0 = AVERAGE(c0, c1);
4807
4808							*(unsigned int*)(source0 + 4 * x) = c0;
4809						}
4810
4811						source0 += pitch;
4812						source1 += pitch;
4813					}
4814				}
4815				else if(internal.samples == 4)
4816				{
4817					for(int y = 0; y < height; y++)
4818					{
4819						for(int x = 0; x < 2 * width; x++)
4820						{
4821							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
4822							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
4823							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
4824							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
4825
4826							c0 = AVERAGE(c0, c1);
4827							c2 = AVERAGE(c2, c3);
4828							c0 = AVERAGE(c0, c2);
4829
4830							*(unsigned int*)(source0 + 4 * x) = c0;
4831						}
4832
4833						source0 += pitch;
4834						source1 += pitch;
4835						source2 += pitch;
4836						source3 += pitch;
4837					}
4838				}
4839				else if(internal.samples == 8)
4840				{
4841					for(int y = 0; y < height; y++)
4842					{
4843						for(int x = 0; x < 2 * width; x++)
4844						{
4845							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
4846							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
4847							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
4848							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
4849							unsigned int c4 = *(unsigned int*)(source4 + 4 * x);
4850							unsigned int c5 = *(unsigned int*)(source5 + 4 * x);
4851							unsigned int c6 = *(unsigned int*)(source6 + 4 * x);
4852							unsigned int c7 = *(unsigned int*)(source7 + 4 * x);
4853
4854							c0 = AVERAGE(c0, c1);
4855							c2 = AVERAGE(c2, c3);
4856							c4 = AVERAGE(c4, c5);
4857							c6 = AVERAGE(c6, c7);
4858							c0 = AVERAGE(c0, c2);
4859							c4 = AVERAGE(c4, c6);
4860							c0 = AVERAGE(c0, c4);
4861
4862							*(unsigned int*)(source0 + 4 * x) = c0;
4863						}
4864
4865						source0 += pitch;
4866						source1 += pitch;
4867						source2 += pitch;
4868						source3 += pitch;
4869						source4 += pitch;
4870						source5 += pitch;
4871						source6 += pitch;
4872						source7 += pitch;
4873					}
4874				}
4875				else if(internal.samples == 16)
4876				{
4877					for(int y = 0; y < height; y++)
4878					{
4879						for(int x = 0; x < 2 * width; x++)
4880						{
4881							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
4882							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
4883							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
4884							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
4885							unsigned int c4 = *(unsigned int*)(source4 + 4 * x);
4886							unsigned int c5 = *(unsigned int*)(source5 + 4 * x);
4887							unsigned int c6 = *(unsigned int*)(source6 + 4 * x);
4888							unsigned int c7 = *(unsigned int*)(source7 + 4 * x);
4889							unsigned int c8 = *(unsigned int*)(source8 + 4 * x);
4890							unsigned int c9 = *(unsigned int*)(source9 + 4 * x);
4891							unsigned int cA = *(unsigned int*)(sourceA + 4 * x);
4892							unsigned int cB = *(unsigned int*)(sourceB + 4 * x);
4893							unsigned int cC = *(unsigned int*)(sourceC + 4 * x);
4894							unsigned int cD = *(unsigned int*)(sourceD + 4 * x);
4895							unsigned int cE = *(unsigned int*)(sourceE + 4 * x);
4896							unsigned int cF = *(unsigned int*)(sourceF + 4 * x);
4897
4898							c0 = AVERAGE(c0, c1);
4899							c2 = AVERAGE(c2, c3);
4900							c4 = AVERAGE(c4, c5);
4901							c6 = AVERAGE(c6, c7);
4902							c8 = AVERAGE(c8, c9);
4903							cA = AVERAGE(cA, cB);
4904							cC = AVERAGE(cC, cD);
4905							cE = AVERAGE(cE, cF);
4906							c0 = AVERAGE(c0, c2);
4907							c4 = AVERAGE(c4, c6);
4908							c8 = AVERAGE(c8, cA);
4909							cC = AVERAGE(cC, cE);
4910							c0 = AVERAGE(c0, c4);
4911							c8 = AVERAGE(c8, cC);
4912							c0 = AVERAGE(c0, c8);
4913
4914							*(unsigned int*)(source0 + 4 * x) = c0;
4915						}
4916
4917						source0 += pitch;
4918						source1 += pitch;
4919						source2 += pitch;
4920						source3 += pitch;
4921						source4 += pitch;
4922						source5 += pitch;
4923						source6 += pitch;
4924						source7 += pitch;
4925						source8 += pitch;
4926						source9 += pitch;
4927						sourceA += pitch;
4928						sourceB += pitch;
4929						sourceC += pitch;
4930						sourceD += pitch;
4931						sourceE += pitch;
4932						sourceF += pitch;
4933					}
4934				}
4935				else ASSERT(false);
4936
4937				#undef AVERAGE
4938			}
4939		}
4940		else if(internal.format == FORMAT_R32F)
4941		{
4942			#if defined(__i386__) || defined(__x86_64__)
4943				if(CPUID::supportsSSE() && (width % 4) == 0)
4944				{
4945					if(internal.samples == 2)
4946					{
4947						for(int y = 0; y < height; y++)
4948						{
4949							for(int x = 0; x < width; x += 4)
4950							{
4951								__m128 c0 = _mm_load_ps((float*)(source0 + 4 * x));
4952								__m128 c1 = _mm_load_ps((float*)(source1 + 4 * x));
4953
4954								c0 = _mm_add_ps(c0, c1);
4955								c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 2.0f));
4956
4957								_mm_store_ps((float*)(source0 + 4 * x), c0);
4958							}
4959
4960							source0 += pitch;
4961							source1 += pitch;
4962						}
4963					}
4964					else if(internal.samples == 4)
4965					{
4966						for(int y = 0; y < height; y++)
4967						{
4968							for(int x = 0; x < width; x += 4)
4969							{
4970								__m128 c0 = _mm_load_ps((float*)(source0 + 4 * x));
4971								__m128 c1 = _mm_load_ps((float*)(source1 + 4 * x));
4972								__m128 c2 = _mm_load_ps((float*)(source2 + 4 * x));
4973								__m128 c3 = _mm_load_ps((float*)(source3 + 4 * x));
4974
4975								c0 = _mm_add_ps(c0, c1);
4976								c2 = _mm_add_ps(c2, c3);
4977								c0 = _mm_add_ps(c0, c2);
4978								c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 4.0f));
4979
4980								_mm_store_ps((float*)(source0 + 4 * x), c0);
4981							}
4982
4983							source0 += pitch;
4984							source1 += pitch;
4985							source2 += pitch;
4986							source3 += pitch;
4987						}
4988					}
4989					else if(internal.samples == 8)
4990					{
4991						for(int y = 0; y < height; y++)
4992						{
4993							for(int x = 0; x < width; x += 4)
4994							{
4995								__m128 c0 = _mm_load_ps((float*)(source0 + 4 * x));
4996								__m128 c1 = _mm_load_ps((float*)(source1 + 4 * x));
4997								__m128 c2 = _mm_load_ps((float*)(source2 + 4 * x));
4998								__m128 c3 = _mm_load_ps((float*)(source3 + 4 * x));
4999								__m128 c4 = _mm_load_ps((float*)(source4 + 4 * x));
5000								__m128 c5 = _mm_load_ps((float*)(source5 + 4 * x));
5001								__m128 c6 = _mm_load_ps((float*)(source6 + 4 * x));
5002								__m128 c7 = _mm_load_ps((float*)(source7 + 4 * x));
5003
5004								c0 = _mm_add_ps(c0, c1);
5005								c2 = _mm_add_ps(c2, c3);
5006								c4 = _mm_add_ps(c4, c5);
5007								c6 = _mm_add_ps(c6, c7);
5008								c0 = _mm_add_ps(c0, c2);
5009								c4 = _mm_add_ps(c4, c6);
5010								c0 = _mm_add_ps(c0, c4);
5011								c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 8.0f));
5012
5013								_mm_store_ps((float*)(source0 + 4 * x), c0);
5014							}
5015
5016							source0 += pitch;
5017							source1 += pitch;
5018							source2 += pitch;
5019							source3 += pitch;
5020							source4 += pitch;
5021							source5 += pitch;
5022							source6 += pitch;
5023							source7 += pitch;
5024						}
5025					}
5026					else if(internal.samples == 16)
5027					{
5028						for(int y = 0; y < height; y++)
5029						{
5030							for(int x = 0; x < width; x += 4)
5031							{
5032								__m128 c0 = _mm_load_ps((float*)(source0 + 4 * x));
5033								__m128 c1 = _mm_load_ps((float*)(source1 + 4 * x));
5034								__m128 c2 = _mm_load_ps((float*)(source2 + 4 * x));
5035								__m128 c3 = _mm_load_ps((float*)(source3 + 4 * x));
5036								__m128 c4 = _mm_load_ps((float*)(source4 + 4 * x));
5037								__m128 c5 = _mm_load_ps((float*)(source5 + 4 * x));
5038								__m128 c6 = _mm_load_ps((float*)(source6 + 4 * x));
5039								__m128 c7 = _mm_load_ps((float*)(source7 + 4 * x));
5040								__m128 c8 = _mm_load_ps((float*)(source8 + 4 * x));
5041								__m128 c9 = _mm_load_ps((float*)(source9 + 4 * x));
5042								__m128 cA = _mm_load_ps((float*)(sourceA + 4 * x));
5043								__m128 cB = _mm_load_ps((float*)(sourceB + 4 * x));
5044								__m128 cC = _mm_load_ps((float*)(sourceC + 4 * x));
5045								__m128 cD = _mm_load_ps((float*)(sourceD + 4 * x));
5046								__m128 cE = _mm_load_ps((float*)(sourceE + 4 * x));
5047								__m128 cF = _mm_load_ps((float*)(sourceF + 4 * x));
5048
5049								c0 = _mm_add_ps(c0, c1);
5050								c2 = _mm_add_ps(c2, c3);
5051								c4 = _mm_add_ps(c4, c5);
5052								c6 = _mm_add_ps(c6, c7);
5053								c8 = _mm_add_ps(c8, c9);
5054								cA = _mm_add_ps(cA, cB);
5055								cC = _mm_add_ps(cC, cD);
5056								cE = _mm_add_ps(cE, cF);
5057								c0 = _mm_add_ps(c0, c2);
5058								c4 = _mm_add_ps(c4, c6);
5059								c8 = _mm_add_ps(c8, cA);
5060								cC = _mm_add_ps(cC, cE);
5061								c0 = _mm_add_ps(c0, c4);
5062								c8 = _mm_add_ps(c8, cC);
5063								c0 = _mm_add_ps(c0, c8);
5064								c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 16.0f));
5065
5066								_mm_store_ps((float*)(source0 + 4 * x), c0);
5067							}
5068
5069							source0 += pitch;
5070							source1 += pitch;
5071							source2 += pitch;
5072							source3 += pitch;
5073							source4 += pitch;
5074							source5 += pitch;
5075							source6 += pitch;
5076							source7 += pitch;
5077							source8 += pitch;
5078							source9 += pitch;
5079							sourceA += pitch;
5080							sourceB += pitch;
5081							sourceC += pitch;
5082							sourceD += pitch;
5083							sourceE += pitch;
5084							sourceF += pitch;
5085						}
5086					}
5087					else ASSERT(false);
5088				}
5089				else
5090			#endif
5091			{
5092				if(internal.samples == 2)
5093				{
5094					for(int y = 0; y < height; y++)
5095					{
5096						for(int x = 0; x < width; x++)
5097						{
5098							float c0 = *(float*)(source0 + 4 * x);
5099							float c1 = *(float*)(source1 + 4 * x);
5100
5101							c0 = c0 + c1;
5102							c0 *= 1.0f / 2.0f;
5103
5104							*(float*)(source0 + 4 * x) = c0;
5105						}
5106
5107						source0 += pitch;
5108						source1 += pitch;
5109					}
5110				}
5111				else if(internal.samples == 4)
5112				{
5113					for(int y = 0; y < height; y++)
5114					{
5115						for(int x = 0; x < width; x++)
5116						{
5117							float c0 = *(float*)(source0 + 4 * x);
5118							float c1 = *(float*)(source1 + 4 * x);
5119							float c2 = *(float*)(source2 + 4 * x);
5120							float c3 = *(float*)(source3 + 4 * x);
5121
5122							c0 = c0 + c1;
5123							c2 = c2 + c3;
5124							c0 = c0 + c2;
5125							c0 *= 1.0f / 4.0f;
5126
5127							*(float*)(source0 + 4 * x) = c0;
5128						}
5129
5130						source0 += pitch;
5131						source1 += pitch;
5132						source2 += pitch;
5133						source3 += pitch;
5134					}
5135				}
5136				else if(internal.samples == 8)
5137				{
5138					for(int y = 0; y < height; y++)
5139					{
5140						for(int x = 0; x < width; x++)
5141						{
5142							float c0 = *(float*)(source0 + 4 * x);
5143							float c1 = *(float*)(source1 + 4 * x);
5144							float c2 = *(float*)(source2 + 4 * x);
5145							float c3 = *(float*)(source3 + 4 * x);
5146							float c4 = *(float*)(source4 + 4 * x);
5147							float c5 = *(float*)(source5 + 4 * x);
5148							float c6 = *(float*)(source6 + 4 * x);
5149							float c7 = *(float*)(source7 + 4 * x);
5150
5151							c0 = c0 + c1;
5152							c2 = c2 + c3;
5153							c4 = c4 + c5;
5154							c6 = c6 + c7;
5155							c0 = c0 + c2;
5156							c4 = c4 + c6;
5157							c0 = c0 + c4;
5158							c0 *= 1.0f / 8.0f;
5159
5160							*(float*)(source0 + 4 * x) = c0;
5161						}
5162
5163						source0 += pitch;
5164						source1 += pitch;
5165						source2 += pitch;
5166						source3 += pitch;
5167						source4 += pitch;
5168						source5 += pitch;
5169						source6 += pitch;
5170						source7 += pitch;
5171					}
5172				}
5173				else if(internal.samples == 16)
5174				{
5175					for(int y = 0; y < height; y++)
5176					{
5177						for(int x = 0; x < width; x++)
5178						{
5179							float c0 = *(float*)(source0 + 4 * x);
5180							float c1 = *(float*)(source1 + 4 * x);
5181							float c2 = *(float*)(source2 + 4 * x);
5182							float c3 = *(float*)(source3 + 4 * x);
5183							float c4 = *(float*)(source4 + 4 * x);
5184							float c5 = *(float*)(source5 + 4 * x);
5185							float c6 = *(float*)(source6 + 4 * x);
5186							float c7 = *(float*)(source7 + 4 * x);
5187							float c8 = *(float*)(source8 + 4 * x);
5188							float c9 = *(float*)(source9 + 4 * x);
5189							float cA = *(float*)(sourceA + 4 * x);
5190							float cB = *(float*)(sourceB + 4 * x);
5191							float cC = *(float*)(sourceC + 4 * x);
5192							float cD = *(float*)(sourceD + 4 * x);
5193							float cE = *(float*)(sourceE + 4 * x);
5194							float cF = *(float*)(sourceF + 4 * x);
5195
5196							c0 = c0 + c1;
5197							c2 = c2 + c3;
5198							c4 = c4 + c5;
5199							c6 = c6 + c7;
5200							c8 = c8 + c9;
5201							cA = cA + cB;
5202							cC = cC + cD;
5203							cE = cE + cF;
5204							c0 = c0 + c2;
5205							c4 = c4 + c6;
5206							c8 = c8 + cA;
5207							cC = cC + cE;
5208							c0 = c0 + c4;
5209							c8 = c8 + cC;
5210							c0 = c0 + c8;
5211							c0 *= 1.0f / 16.0f;
5212
5213							*(float*)(source0 + 4 * x) = c0;
5214						}
5215
5216						source0 += pitch;
5217						source1 += pitch;
5218						source2 += pitch;
5219						source3 += pitch;
5220						source4 += pitch;
5221						source5 += pitch;
5222						source6 += pitch;
5223						source7 += pitch;
5224						source8 += pitch;
5225						source9 += pitch;
5226						sourceA += pitch;
5227						sourceB += pitch;
5228						sourceC += pitch;
5229						sourceD += pitch;
5230						sourceE += pitch;
5231						sourceF += pitch;
5232					}
5233				}
5234				else ASSERT(false);
5235			}
5236		}
5237		else if(internal.format == FORMAT_G32R32F)
5238		{
5239			#if defined(__i386__) || defined(__x86_64__)
5240				if(CPUID::supportsSSE() && (width % 2) == 0)
5241				{
5242					if(internal.samples == 2)
5243					{
5244						for(int y = 0; y < height; y++)
5245						{
5246							for(int x = 0; x < width; x += 2)
5247							{
5248								__m128 c0 = _mm_load_ps((float*)(source0 + 8 * x));
5249								__m128 c1 = _mm_load_ps((float*)(source1 + 8 * x));
5250
5251								c0 = _mm_add_ps(c0, c1);
5252								c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 2.0f));
5253
5254								_mm_store_ps((float*)(source0 + 8 * x), c0);
5255							}
5256
5257							source0 += pitch;
5258							source1 += pitch;
5259						}
5260					}
5261					else if(internal.samples == 4)
5262					{
5263						for(int y = 0; y < height; y++)
5264						{
5265							for(int x = 0; x < width; x += 2)
5266							{
5267								__m128 c0 = _mm_load_ps((float*)(source0 + 8 * x));
5268								__m128 c1 = _mm_load_ps((float*)(source1 + 8 * x));
5269								__m128 c2 = _mm_load_ps((float*)(source2 + 8 * x));
5270								__m128 c3 = _mm_load_ps((float*)(source3 + 8 * x));
5271
5272								c0 = _mm_add_ps(c0, c1);
5273								c2 = _mm_add_ps(c2, c3);
5274								c0 = _mm_add_ps(c0, c2);
5275								c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 4.0f));
5276
5277								_mm_store_ps((float*)(source0 + 8 * x), c0);
5278							}
5279
5280							source0 += pitch;
5281							source1 += pitch;
5282							source2 += pitch;
5283							source3 += pitch;
5284						}
5285					}
5286					else if(internal.samples == 8)
5287					{
5288						for(int y = 0; y < height; y++)
5289						{
5290							for(int x = 0; x < width; x += 2)
5291							{
5292								__m128 c0 = _mm_load_ps((float*)(source0 + 8 * x));
5293								__m128 c1 = _mm_load_ps((float*)(source1 + 8 * x));
5294								__m128 c2 = _mm_load_ps((float*)(source2 + 8 * x));
5295								__m128 c3 = _mm_load_ps((float*)(source3 + 8 * x));
5296								__m128 c4 = _mm_load_ps((float*)(source4 + 8 * x));
5297								__m128 c5 = _mm_load_ps((float*)(source5 + 8 * x));
5298								__m128 c6 = _mm_load_ps((float*)(source6 + 8 * x));
5299								__m128 c7 = _mm_load_ps((float*)(source7 + 8 * x));
5300
5301								c0 = _mm_add_ps(c0, c1);
5302								c2 = _mm_add_ps(c2, c3);
5303								c4 = _mm_add_ps(c4, c5);
5304								c6 = _mm_add_ps(c6, c7);
5305								c0 = _mm_add_ps(c0, c2);
5306								c4 = _mm_add_ps(c4, c6);
5307								c0 = _mm_add_ps(c0, c4);
5308								c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 8.0f));
5309
5310								_mm_store_ps((float*)(source0 + 8 * x), c0);
5311							}
5312
5313							source0 += pitch;
5314							source1 += pitch;
5315							source2 += pitch;
5316							source3 += pitch;
5317							source4 += pitch;
5318							source5 += pitch;
5319							source6 += pitch;
5320							source7 += pitch;
5321						}
5322					}
5323					else if(internal.samples == 16)
5324					{
5325						for(int y = 0; y < height; y++)
5326						{
5327							for(int x = 0; x < width; x += 2)
5328							{
5329								__m128 c0 = _mm_load_ps((float*)(source0 + 8 * x));
5330								__m128 c1 = _mm_load_ps((float*)(source1 + 8 * x));
5331								__m128 c2 = _mm_load_ps((float*)(source2 + 8 * x));
5332								__m128 c3 = _mm_load_ps((float*)(source3 + 8 * x));
5333								__m128 c4 = _mm_load_ps((float*)(source4 + 8 * x));
5334								__m128 c5 = _mm_load_ps((float*)(source5 + 8 * x));
5335								__m128 c6 = _mm_load_ps((float*)(source6 + 8 * x));
5336								__m128 c7 = _mm_load_ps((float*)(source7 + 8 * x));
5337								__m128 c8 = _mm_load_ps((float*)(source8 + 8 * x));
5338								__m128 c9 = _mm_load_ps((float*)(source9 + 8 * x));
5339								__m128 cA = _mm_load_ps((float*)(sourceA + 8 * x));
5340								__m128 cB = _mm_load_ps((float*)(sourceB + 8 * x));
5341								__m128 cC = _mm_load_ps((float*)(sourceC + 8 * x));
5342								__m128 cD = _mm_load_ps((float*)(sourceD + 8 * x));
5343								__m128 cE = _mm_load_ps((float*)(sourceE + 8 * x));
5344								__m128 cF = _mm_load_ps((float*)(sourceF + 8 * x));
5345
5346								c0 = _mm_add_ps(c0, c1);
5347								c2 = _mm_add_ps(c2, c3);
5348								c4 = _mm_add_ps(c4, c5);
5349								c6 = _mm_add_ps(c6, c7);
5350								c8 = _mm_add_ps(c8, c9);
5351								cA = _mm_add_ps(cA, cB);
5352								cC = _mm_add_ps(cC, cD);
5353								cE = _mm_add_ps(cE, cF);
5354								c0 = _mm_add_ps(c0, c2);
5355								c4 = _mm_add_ps(c4, c6);
5356								c8 = _mm_add_ps(c8, cA);
5357								cC = _mm_add_ps(cC, cE);
5358								c0 = _mm_add_ps(c0, c4);
5359								c8 = _mm_add_ps(c8, cC);
5360								c0 = _mm_add_ps(c0, c8);
5361								c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 16.0f));
5362
5363								_mm_store_ps((float*)(source0 + 8 * x), c0);
5364							}
5365
5366							source0 += pitch;
5367							source1 += pitch;
5368							source2 += pitch;
5369							source3 += pitch;
5370							source4 += pitch;
5371							source5 += pitch;
5372							source6 += pitch;
5373							source7 += pitch;
5374							source8 += pitch;
5375							source9 += pitch;
5376							sourceA += pitch;
5377							sourceB += pitch;
5378							sourceC += pitch;
5379							sourceD += pitch;
5380							sourceE += pitch;
5381							sourceF += pitch;
5382						}
5383					}
5384					else ASSERT(false);
5385				}
5386				else
5387			#endif
5388			{
5389				if(internal.samples == 2)
5390				{
5391					for(int y = 0; y < height; y++)
5392					{
5393						for(int x = 0; x < 2 * width; x++)
5394						{
5395							float c0 = *(float*)(source0 + 4 * x);
5396							float c1 = *(float*)(source1 + 4 * x);
5397
5398							c0 = c0 + c1;
5399							c0 *= 1.0f / 2.0f;
5400
5401							*(float*)(source0 + 4 * x) = c0;
5402						}
5403
5404						source0 += pitch;
5405						source1 += pitch;
5406					}
5407				}
5408				else if(internal.samples == 4)
5409				{
5410					for(int y = 0; y < height; y++)
5411					{
5412						for(int x = 0; x < 2 * width; x++)
5413						{
5414							float c0 = *(float*)(source0 + 4 * x);
5415							float c1 = *(float*)(source1 + 4 * x);
5416							float c2 = *(float*)(source2 + 4 * x);
5417							float c3 = *(float*)(source3 + 4 * x);
5418
5419							c0 = c0 + c1;
5420							c2 = c2 + c3;
5421							c0 = c0 + c2;
5422							c0 *= 1.0f / 4.0f;
5423
5424							*(float*)(source0 + 4 * x) = c0;
5425						}
5426
5427						source0 += pitch;
5428						source1 += pitch;
5429						source2 += pitch;
5430						source3 += pitch;
5431					}
5432				}
5433				else if(internal.samples == 8)
5434				{
5435					for(int y = 0; y < height; y++)
5436					{
5437						for(int x = 0; x < 2 * width; x++)
5438						{
5439							float c0 = *(float*)(source0 + 4 * x);
5440							float c1 = *(float*)(source1 + 4 * x);
5441							float c2 = *(float*)(source2 + 4 * x);
5442							float c3 = *(float*)(source3 + 4 * x);
5443							float c4 = *(float*)(source4 + 4 * x);
5444							float c5 = *(float*)(source5 + 4 * x);
5445							float c6 = *(float*)(source6 + 4 * x);
5446							float c7 = *(float*)(source7 + 4 * x);
5447
5448							c0 = c0 + c1;
5449							c2 = c2 + c3;
5450							c4 = c4 + c5;
5451							c6 = c6 + c7;
5452							c0 = c0 + c2;
5453							c4 = c4 + c6;
5454							c0 = c0 + c4;
5455							c0 *= 1.0f / 8.0f;
5456
5457							*(float*)(source0 + 4 * x) = c0;
5458						}
5459
5460						source0 += pitch;
5461						source1 += pitch;
5462						source2 += pitch;
5463						source3 += pitch;
5464						source4 += pitch;
5465						source5 += pitch;
5466						source6 += pitch;
5467						source7 += pitch;
5468					}
5469				}
5470				else if(internal.samples == 16)
5471				{
5472					for(int y = 0; y < height; y++)
5473					{
5474						for(int x = 0; x < 2 * width; x++)
5475						{
5476							float c0 = *(float*)(source0 + 4 * x);
5477							float c1 = *(float*)(source1 + 4 * x);
5478							float c2 = *(float*)(source2 + 4 * x);
5479							float c3 = *(float*)(source3 + 4 * x);
5480							float c4 = *(float*)(source4 + 4 * x);
5481							float c5 = *(float*)(source5 + 4 * x);
5482							float c6 = *(float*)(source6 + 4 * x);
5483							float c7 = *(float*)(source7 + 4 * x);
5484							float c8 = *(float*)(source8 + 4 * x);
5485							float c9 = *(float*)(source9 + 4 * x);
5486							float cA = *(float*)(sourceA + 4 * x);
5487							float cB = *(float*)(sourceB + 4 * x);
5488							float cC = *(float*)(sourceC + 4 * x);
5489							float cD = *(float*)(sourceD + 4 * x);
5490							float cE = *(float*)(sourceE + 4 * x);
5491							float cF = *(float*)(sourceF + 4 * x);
5492
5493							c0 = c0 + c1;
5494							c2 = c2 + c3;
5495							c4 = c4 + c5;
5496							c6 = c6 + c7;
5497							c8 = c8 + c9;
5498							cA = cA + cB;
5499							cC = cC + cD;
5500							cE = cE + cF;
5501							c0 = c0 + c2;
5502							c4 = c4 + c6;
5503							c8 = c8 + cA;
5504							cC = cC + cE;
5505							c0 = c0 + c4;
5506							c8 = c8 + cC;
5507							c0 = c0 + c8;
5508							c0 *= 1.0f / 16.0f;
5509
5510							*(float*)(source0 + 4 * x) = c0;
5511						}
5512
5513						source0 += pitch;
5514						source1 += pitch;
5515						source2 += pitch;
5516						source3 += pitch;
5517						source4 += pitch;
5518						source5 += pitch;
5519						source6 += pitch;
5520						source7 += pitch;
5521						source8 += pitch;
5522						source9 += pitch;
5523						sourceA += pitch;
5524						sourceB += pitch;
5525						sourceC += pitch;
5526						sourceD += pitch;
5527						sourceE += pitch;
5528						sourceF += pitch;
5529					}
5530				}
5531				else ASSERT(false);
5532			}
5533		}
5534		else if(internal.format == FORMAT_A32B32G32R32F ||
5535		        internal.format == FORMAT_X32B32G32R32F ||
5536		        internal.format == FORMAT_X32B32G32R32F_UNSIGNED)
5537		{
5538			#if defined(__i386__) || defined(__x86_64__)
5539				if(CPUID::supportsSSE())
5540				{
5541					if(internal.samples == 2)
5542					{
5543						for(int y = 0; y < height; y++)
5544						{
5545							for(int x = 0; x < width; x++)
5546							{
5547								__m128 c0 = _mm_load_ps((float*)(source0 + 16 * x));
5548								__m128 c1 = _mm_load_ps((float*)(source1 + 16 * x));
5549
5550								c0 = _mm_add_ps(c0, c1);
5551								c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 2.0f));
5552
5553								_mm_store_ps((float*)(source0 + 16 * x), c0);
5554							}
5555
5556							source0 += pitch;
5557							source1 += pitch;
5558						}
5559					}
5560					else if(internal.samples == 4)
5561					{
5562						for(int y = 0; y < height; y++)
5563						{
5564							for(int x = 0; x < width; x++)
5565							{
5566								__m128 c0 = _mm_load_ps((float*)(source0 + 16 * x));
5567								__m128 c1 = _mm_load_ps((float*)(source1 + 16 * x));
5568								__m128 c2 = _mm_load_ps((float*)(source2 + 16 * x));
5569								__m128 c3 = _mm_load_ps((float*)(source3 + 16 * x));
5570
5571								c0 = _mm_add_ps(c0, c1);
5572								c2 = _mm_add_ps(c2, c3);
5573								c0 = _mm_add_ps(c0, c2);
5574								c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 4.0f));
5575
5576								_mm_store_ps((float*)(source0 + 16 * x), c0);
5577							}
5578
5579							source0 += pitch;
5580							source1 += pitch;
5581							source2 += pitch;
5582							source3 += pitch;
5583						}
5584					}
5585					else if(internal.samples == 8)
5586					{
5587						for(int y = 0; y < height; y++)
5588						{
5589							for(int x = 0; x < width; x++)
5590							{
5591								__m128 c0 = _mm_load_ps((float*)(source0 + 16 * x));
5592								__m128 c1 = _mm_load_ps((float*)(source1 + 16 * x));
5593								__m128 c2 = _mm_load_ps((float*)(source2 + 16 * x));
5594								__m128 c3 = _mm_load_ps((float*)(source3 + 16 * x));
5595								__m128 c4 = _mm_load_ps((float*)(source4 + 16 * x));
5596								__m128 c5 = _mm_load_ps((float*)(source5 + 16 * x));
5597								__m128 c6 = _mm_load_ps((float*)(source6 + 16 * x));
5598								__m128 c7 = _mm_load_ps((float*)(source7 + 16 * x));
5599
5600								c0 = _mm_add_ps(c0, c1);
5601								c2 = _mm_add_ps(c2, c3);
5602								c4 = _mm_add_ps(c4, c5);
5603								c6 = _mm_add_ps(c6, c7);
5604								c0 = _mm_add_ps(c0, c2);
5605								c4 = _mm_add_ps(c4, c6);
5606								c0 = _mm_add_ps(c0, c4);
5607								c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 8.0f));
5608
5609								_mm_store_ps((float*)(source0 + 16 * x), c0);
5610							}
5611
5612							source0 += pitch;
5613							source1 += pitch;
5614							source2 += pitch;
5615							source3 += pitch;
5616							source4 += pitch;
5617							source5 += pitch;
5618							source6 += pitch;
5619							source7 += pitch;
5620						}
5621					}
5622					else if(internal.samples == 16)
5623					{
5624						for(int y = 0; y < height; y++)
5625						{
5626							for(int x = 0; x < width; x++)
5627							{
5628								__m128 c0 = _mm_load_ps((float*)(source0 + 16 * x));
5629								__m128 c1 = _mm_load_ps((float*)(source1 + 16 * x));
5630								__m128 c2 = _mm_load_ps((float*)(source2 + 16 * x));
5631								__m128 c3 = _mm_load_ps((float*)(source3 + 16 * x));
5632								__m128 c4 = _mm_load_ps((float*)(source4 + 16 * x));
5633								__m128 c5 = _mm_load_ps((float*)(source5 + 16 * x));
5634								__m128 c6 = _mm_load_ps((float*)(source6 + 16 * x));
5635								__m128 c7 = _mm_load_ps((float*)(source7 + 16 * x));
5636								__m128 c8 = _mm_load_ps((float*)(source8 + 16 * x));
5637								__m128 c9 = _mm_load_ps((float*)(source9 + 16 * x));
5638								__m128 cA = _mm_load_ps((float*)(sourceA + 16 * x));
5639								__m128 cB = _mm_load_ps((float*)(sourceB + 16 * x));
5640								__m128 cC = _mm_load_ps((float*)(sourceC + 16 * x));
5641								__m128 cD = _mm_load_ps((float*)(sourceD + 16 * x));
5642								__m128 cE = _mm_load_ps((float*)(sourceE + 16 * x));
5643								__m128 cF = _mm_load_ps((float*)(sourceF + 16 * x));
5644
5645								c0 = _mm_add_ps(c0, c1);
5646								c2 = _mm_add_ps(c2, c3);
5647								c4 = _mm_add_ps(c4, c5);
5648								c6 = _mm_add_ps(c6, c7);
5649								c8 = _mm_add_ps(c8, c9);
5650								cA = _mm_add_ps(cA, cB);
5651								cC = _mm_add_ps(cC, cD);
5652								cE = _mm_add_ps(cE, cF);
5653								c0 = _mm_add_ps(c0, c2);
5654								c4 = _mm_add_ps(c4, c6);
5655								c8 = _mm_add_ps(c8, cA);
5656								cC = _mm_add_ps(cC, cE);
5657								c0 = _mm_add_ps(c0, c4);
5658								c8 = _mm_add_ps(c8, cC);
5659								c0 = _mm_add_ps(c0, c8);
5660								c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 16.0f));
5661
5662								_mm_store_ps((float*)(source0 + 16 * x), c0);
5663							}
5664
5665							source0 += pitch;
5666							source1 += pitch;
5667							source2 += pitch;
5668							source3 += pitch;
5669							source4 += pitch;
5670							source5 += pitch;
5671							source6 += pitch;
5672							source7 += pitch;
5673							source8 += pitch;
5674							source9 += pitch;
5675							sourceA += pitch;
5676							sourceB += pitch;
5677							sourceC += pitch;
5678							sourceD += pitch;
5679							sourceE += pitch;
5680							sourceF += pitch;
5681						}
5682					}
5683					else ASSERT(false);
5684				}
5685				else
5686			#endif
5687			{
5688				if(internal.samples == 2)
5689				{
5690					for(int y = 0; y < height; y++)
5691					{
5692						for(int x = 0; x < 4 * width; x++)
5693						{
5694							float c0 = *(float*)(source0 + 4 * x);
5695							float c1 = *(float*)(source1 + 4 * x);
5696
5697							c0 = c0 + c1;
5698							c0 *= 1.0f / 2.0f;
5699
5700							*(float*)(source0 + 4 * x) = c0;
5701						}
5702
5703						source0 += pitch;
5704						source1 += pitch;
5705					}
5706				}
5707				else if(internal.samples == 4)
5708				{
5709					for(int y = 0; y < height; y++)
5710					{
5711						for(int x = 0; x < 4 * width; x++)
5712						{
5713							float c0 = *(float*)(source0 + 4 * x);
5714							float c1 = *(float*)(source1 + 4 * x);
5715							float c2 = *(float*)(source2 + 4 * x);
5716							float c3 = *(float*)(source3 + 4 * x);
5717
5718							c0 = c0 + c1;
5719							c2 = c2 + c3;
5720							c0 = c0 + c2;
5721							c0 *= 1.0f / 4.0f;
5722
5723							*(float*)(source0 + 4 * x) = c0;
5724						}
5725
5726						source0 += pitch;
5727						source1 += pitch;
5728						source2 += pitch;
5729						source3 += pitch;
5730					}
5731				}
5732				else if(internal.samples == 8)
5733				{
5734					for(int y = 0; y < height; y++)
5735					{
5736						for(int x = 0; x < 4 * width; x++)
5737						{
5738							float c0 = *(float*)(source0 + 4 * x);
5739							float c1 = *(float*)(source1 + 4 * x);
5740							float c2 = *(float*)(source2 + 4 * x);
5741							float c3 = *(float*)(source3 + 4 * x);
5742							float c4 = *(float*)(source4 + 4 * x);
5743							float c5 = *(float*)(source5 + 4 * x);
5744							float c6 = *(float*)(source6 + 4 * x);
5745							float c7 = *(float*)(source7 + 4 * x);
5746
5747							c0 = c0 + c1;
5748							c2 = c2 + c3;
5749							c4 = c4 + c5;
5750							c6 = c6 + c7;
5751							c0 = c0 + c2;
5752							c4 = c4 + c6;
5753							c0 = c0 + c4;
5754							c0 *= 1.0f / 8.0f;
5755
5756							*(float*)(source0 + 4 * x) = c0;
5757						}
5758
5759						source0 += pitch;
5760						source1 += pitch;
5761						source2 += pitch;
5762						source3 += pitch;
5763						source4 += pitch;
5764						source5 += pitch;
5765						source6 += pitch;
5766						source7 += pitch;
5767					}
5768				}
5769				else if(internal.samples == 16)
5770				{
5771					for(int y = 0; y < height; y++)
5772					{
5773						for(int x = 0; x < 4 * width; x++)
5774						{
5775							float c0 = *(float*)(source0 + 4 * x);
5776							float c1 = *(float*)(source1 + 4 * x);
5777							float c2 = *(float*)(source2 + 4 * x);
5778							float c3 = *(float*)(source3 + 4 * x);
5779							float c4 = *(float*)(source4 + 4 * x);
5780							float c5 = *(float*)(source5 + 4 * x);
5781							float c6 = *(float*)(source6 + 4 * x);
5782							float c7 = *(float*)(source7 + 4 * x);
5783							float c8 = *(float*)(source8 + 4 * x);
5784							float c9 = *(float*)(source9 + 4 * x);
5785							float cA = *(float*)(sourceA + 4 * x);
5786							float cB = *(float*)(sourceB + 4 * x);
5787							float cC = *(float*)(sourceC + 4 * x);
5788							float cD = *(float*)(sourceD + 4 * x);
5789							float cE = *(float*)(sourceE + 4 * x);
5790							float cF = *(float*)(sourceF + 4 * x);
5791
5792							c0 = c0 + c1;
5793							c2 = c2 + c3;
5794							c4 = c4 + c5;
5795							c6 = c6 + c7;
5796							c8 = c8 + c9;
5797							cA = cA + cB;
5798							cC = cC + cD;
5799							cE = cE + cF;
5800							c0 = c0 + c2;
5801							c4 = c4 + c6;
5802							c8 = c8 + cA;
5803							cC = cC + cE;
5804							c0 = c0 + c4;
5805							c8 = c8 + cC;
5806							c0 = c0 + c8;
5807							c0 *= 1.0f / 16.0f;
5808
5809							*(float*)(source0 + 4 * x) = c0;
5810						}
5811
5812						source0 += pitch;
5813						source1 += pitch;
5814						source2 += pitch;
5815						source3 += pitch;
5816						source4 += pitch;
5817						source5 += pitch;
5818						source6 += pitch;
5819						source7 += pitch;
5820						source8 += pitch;
5821						source9 += pitch;
5822						sourceA += pitch;
5823						sourceB += pitch;
5824						sourceC += pitch;
5825						sourceD += pitch;
5826						sourceE += pitch;
5827						sourceF += pitch;
5828					}
5829				}
5830				else ASSERT(false);
5831			}
5832		}
5833		else if(internal.format == FORMAT_R5G6B5)
5834		{
5835			#if defined(__i386__) || defined(__x86_64__)
5836				if(CPUID::supportsSSE2() && (width % 8) == 0)
5837				{
5838					if(internal.samples == 2)
5839					{
5840						for(int y = 0; y < height; y++)
5841						{
5842							for(int x = 0; x < width; x += 8)
5843							{
5844								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 2 * x));
5845								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 2 * x));
5846
5847								static const ushort8 r_b = {0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F};
5848								static const ushort8 _g_ = {0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0};
5849								__m128i c0_r_b = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
5850								__m128i c0__g_ = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(_g_));
5851								__m128i c1_r_b = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(r_b));
5852								__m128i c1__g_ = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
5853
5854								c0 = _mm_avg_epu8(c0_r_b, c1_r_b);
5855								c0 = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
5856								c1 = _mm_avg_epu16(c0__g_, c1__g_);
5857								c1 = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
5858								c0 = _mm_or_si128(c0, c1);
5859
5860								_mm_store_si128((__m128i*)(source0 + 2 * x), c0);
5861							}
5862
5863							source0 += pitch;
5864							source1 += pitch;
5865						}
5866					}
5867					else if(internal.samples == 4)
5868					{
5869						for(int y = 0; y < height; y++)
5870						{
5871							for(int x = 0; x < width; x += 8)
5872							{
5873								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 2 * x));
5874								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 2 * x));
5875								__m128i c2 = _mm_load_si128((__m128i*)(source2 + 2 * x));
5876								__m128i c3 = _mm_load_si128((__m128i*)(source3 + 2 * x));
5877
5878								static const ushort8 r_b = {0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F};
5879								static const ushort8 _g_ = {0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0};
5880								__m128i c0_r_b = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
5881								__m128i c0__g_ = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(_g_));
5882								__m128i c1_r_b = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(r_b));
5883								__m128i c1__g_ = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
5884								__m128i c2_r_b = _mm_and_si128(c2, reinterpret_cast<const __m128i&>(r_b));
5885								__m128i c2__g_ = _mm_and_si128(c2, reinterpret_cast<const __m128i&>(_g_));
5886								__m128i c3_r_b = _mm_and_si128(c3, reinterpret_cast<const __m128i&>(r_b));
5887								__m128i c3__g_ = _mm_and_si128(c3, reinterpret_cast<const __m128i&>(_g_));
5888
5889								c0 = _mm_avg_epu8(c0_r_b, c1_r_b);
5890								c2 = _mm_avg_epu8(c2_r_b, c3_r_b);
5891								c0 = _mm_avg_epu8(c0, c2);
5892								c0 = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
5893								c1 = _mm_avg_epu16(c0__g_, c1__g_);
5894								c3 = _mm_avg_epu16(c2__g_, c3__g_);
5895								c1 = _mm_avg_epu16(c1, c3);
5896								c1 = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
5897								c0 = _mm_or_si128(c0, c1);
5898
5899								_mm_store_si128((__m128i*)(source0 + 2 * x), c0);
5900							}
5901
5902							source0 += pitch;
5903							source1 += pitch;
5904							source2 += pitch;
5905							source3 += pitch;
5906						}
5907					}
5908					else if(internal.samples == 8)
5909					{
5910						for(int y = 0; y < height; y++)
5911						{
5912							for(int x = 0; x < width; x += 8)
5913							{
5914								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 2 * x));
5915								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 2 * x));
5916								__m128i c2 = _mm_load_si128((__m128i*)(source2 + 2 * x));
5917								__m128i c3 = _mm_load_si128((__m128i*)(source3 + 2 * x));
5918								__m128i c4 = _mm_load_si128((__m128i*)(source4 + 2 * x));
5919								__m128i c5 = _mm_load_si128((__m128i*)(source5 + 2 * x));
5920								__m128i c6 = _mm_load_si128((__m128i*)(source6 + 2 * x));
5921								__m128i c7 = _mm_load_si128((__m128i*)(source7 + 2 * x));
5922
5923								static const ushort8 r_b = {0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F};
5924								static const ushort8 _g_ = {0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0};
5925								__m128i c0_r_b = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
5926								__m128i c0__g_ = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(_g_));
5927								__m128i c1_r_b = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(r_b));
5928								__m128i c1__g_ = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
5929								__m128i c2_r_b = _mm_and_si128(c2, reinterpret_cast<const __m128i&>(r_b));
5930								__m128i c2__g_ = _mm_and_si128(c2, reinterpret_cast<const __m128i&>(_g_));
5931								__m128i c3_r_b = _mm_and_si128(c3, reinterpret_cast<const __m128i&>(r_b));
5932								__m128i c3__g_ = _mm_and_si128(c3, reinterpret_cast<const __m128i&>(_g_));
5933								__m128i c4_r_b = _mm_and_si128(c4, reinterpret_cast<const __m128i&>(r_b));
5934								__m128i c4__g_ = _mm_and_si128(c4, reinterpret_cast<const __m128i&>(_g_));
5935								__m128i c5_r_b = _mm_and_si128(c5, reinterpret_cast<const __m128i&>(r_b));
5936								__m128i c5__g_ = _mm_and_si128(c5, reinterpret_cast<const __m128i&>(_g_));
5937								__m128i c6_r_b = _mm_and_si128(c6, reinterpret_cast<const __m128i&>(r_b));
5938								__m128i c6__g_ = _mm_and_si128(c6, reinterpret_cast<const __m128i&>(_g_));
5939								__m128i c7_r_b = _mm_and_si128(c7, reinterpret_cast<const __m128i&>(r_b));
5940								__m128i c7__g_ = _mm_and_si128(c7, reinterpret_cast<const __m128i&>(_g_));
5941
5942								c0 = _mm_avg_epu8(c0_r_b, c1_r_b);
5943								c2 = _mm_avg_epu8(c2_r_b, c3_r_b);
5944								c4 = _mm_avg_epu8(c4_r_b, c5_r_b);
5945								c6 = _mm_avg_epu8(c6_r_b, c7_r_b);
5946								c0 = _mm_avg_epu8(c0, c2);
5947								c4 = _mm_avg_epu8(c4, c6);
5948								c0 = _mm_avg_epu8(c0, c4);
5949								c0 = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
5950								c1 = _mm_avg_epu16(c0__g_, c1__g_);
5951								c3 = _mm_avg_epu16(c2__g_, c3__g_);
5952								c5 = _mm_avg_epu16(c4__g_, c5__g_);
5953								c7 = _mm_avg_epu16(c6__g_, c7__g_);
5954								c1 = _mm_avg_epu16(c1, c3);
5955								c5 = _mm_avg_epu16(c5, c7);
5956								c1 = _mm_avg_epu16(c1, c5);
5957								c1 = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
5958								c0 = _mm_or_si128(c0, c1);
5959
5960								_mm_store_si128((__m128i*)(source0 + 2 * x), c0);
5961							}
5962
5963							source0 += pitch;
5964							source1 += pitch;
5965							source2 += pitch;
5966							source3 += pitch;
5967							source4 += pitch;
5968							source5 += pitch;
5969							source6 += pitch;
5970							source7 += pitch;
5971						}
5972					}
5973					else if(internal.samples == 16)
5974					{
5975						for(int y = 0; y < height; y++)
5976						{
5977							for(int x = 0; x < width; x += 8)
5978							{
5979								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 2 * x));
5980								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 2 * x));
5981								__m128i c2 = _mm_load_si128((__m128i*)(source2 + 2 * x));
5982								__m128i c3 = _mm_load_si128((__m128i*)(source3 + 2 * x));
5983								__m128i c4 = _mm_load_si128((__m128i*)(source4 + 2 * x));
5984								__m128i c5 = _mm_load_si128((__m128i*)(source5 + 2 * x));
5985								__m128i c6 = _mm_load_si128((__m128i*)(source6 + 2 * x));
5986								__m128i c7 = _mm_load_si128((__m128i*)(source7 + 2 * x));
5987								__m128i c8 = _mm_load_si128((__m128i*)(source8 + 2 * x));
5988								__m128i c9 = _mm_load_si128((__m128i*)(source9 + 2 * x));
5989								__m128i cA = _mm_load_si128((__m128i*)(sourceA + 2 * x));
5990								__m128i cB = _mm_load_si128((__m128i*)(sourceB + 2 * x));
5991								__m128i cC = _mm_load_si128((__m128i*)(sourceC + 2 * x));
5992								__m128i cD = _mm_load_si128((__m128i*)(sourceD + 2 * x));
5993								__m128i cE = _mm_load_si128((__m128i*)(sourceE + 2 * x));
5994								__m128i cF = _mm_load_si128((__m128i*)(sourceF + 2 * x));
5995
5996								static const ushort8 r_b = {0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F};
5997								static const ushort8 _g_ = {0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0};
5998								__m128i c0_r_b = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
5999								__m128i c0__g_ = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(_g_));
6000								__m128i c1_r_b = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(r_b));
6001								__m128i c1__g_ = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
6002								__m128i c2_r_b = _mm_and_si128(c2, reinterpret_cast<const __m128i&>(r_b));
6003								__m128i c2__g_ = _mm_and_si128(c2, reinterpret_cast<const __m128i&>(_g_));
6004								__m128i c3_r_b = _mm_and_si128(c3, reinterpret_cast<const __m128i&>(r_b));
6005								__m128i c3__g_ = _mm_and_si128(c3, reinterpret_cast<const __m128i&>(_g_));
6006								__m128i c4_r_b = _mm_and_si128(c4, reinterpret_cast<const __m128i&>(r_b));
6007								__m128i c4__g_ = _mm_and_si128(c4, reinterpret_cast<const __m128i&>(_g_));
6008								__m128i c5_r_b = _mm_and_si128(c5, reinterpret_cast<const __m128i&>(r_b));
6009								__m128i c5__g_ = _mm_and_si128(c5, reinterpret_cast<const __m128i&>(_g_));
6010								__m128i c6_r_b = _mm_and_si128(c6, reinterpret_cast<const __m128i&>(r_b));
6011								__m128i c6__g_ = _mm_and_si128(c6, reinterpret_cast<const __m128i&>(_g_));
6012								__m128i c7_r_b = _mm_and_si128(c7, reinterpret_cast<const __m128i&>(r_b));
6013								__m128i c7__g_ = _mm_and_si128(c7, reinterpret_cast<const __m128i&>(_g_));
6014								__m128i c8_r_b = _mm_and_si128(c8, reinterpret_cast<const __m128i&>(r_b));
6015								__m128i c8__g_ = _mm_and_si128(c8, reinterpret_cast<const __m128i&>(_g_));
6016								__m128i c9_r_b = _mm_and_si128(c9, reinterpret_cast<const __m128i&>(r_b));
6017								__m128i c9__g_ = _mm_and_si128(c9, reinterpret_cast<const __m128i&>(_g_));
6018								__m128i cA_r_b = _mm_and_si128(cA, reinterpret_cast<const __m128i&>(r_b));
6019								__m128i cA__g_ = _mm_and_si128(cA, reinterpret_cast<const __m128i&>(_g_));
6020								__m128i cB_r_b = _mm_and_si128(cB, reinterpret_cast<const __m128i&>(r_b));
6021								__m128i cB__g_ = _mm_and_si128(cB, reinterpret_cast<const __m128i&>(_g_));
6022								__m128i cC_r_b = _mm_and_si128(cC, reinterpret_cast<const __m128i&>(r_b));
6023								__m128i cC__g_ = _mm_and_si128(cC, reinterpret_cast<const __m128i&>(_g_));
6024								__m128i cD_r_b = _mm_and_si128(cD, reinterpret_cast<const __m128i&>(r_b));
6025								__m128i cD__g_ = _mm_and_si128(cD, reinterpret_cast<const __m128i&>(_g_));
6026								__m128i cE_r_b = _mm_and_si128(cE, reinterpret_cast<const __m128i&>(r_b));
6027								__m128i cE__g_ = _mm_and_si128(cE, reinterpret_cast<const __m128i&>(_g_));
6028								__m128i cF_r_b = _mm_and_si128(cF, reinterpret_cast<const __m128i&>(r_b));
6029								__m128i cF__g_ = _mm_and_si128(cF, reinterpret_cast<const __m128i&>(_g_));
6030
6031								c0 = _mm_avg_epu8(c0_r_b, c1_r_b);
6032								c2 = _mm_avg_epu8(c2_r_b, c3_r_b);
6033								c4 = _mm_avg_epu8(c4_r_b, c5_r_b);
6034								c6 = _mm_avg_epu8(c6_r_b, c7_r_b);
6035								c8 = _mm_avg_epu8(c8_r_b, c9_r_b);
6036								cA = _mm_avg_epu8(cA_r_b, cB_r_b);
6037								cC = _mm_avg_epu8(cC_r_b, cD_r_b);
6038								cE = _mm_avg_epu8(cE_r_b, cF_r_b);
6039								c0 = _mm_avg_epu8(c0, c2);
6040								c4 = _mm_avg_epu8(c4, c6);
6041								c8 = _mm_avg_epu8(c8, cA);
6042								cC = _mm_avg_epu8(cC, cE);
6043								c0 = _mm_avg_epu8(c0, c4);
6044								c8 = _mm_avg_epu8(c8, cC);
6045								c0 = _mm_avg_epu8(c0, c8);
6046								c0 = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
6047								c1 = _mm_avg_epu16(c0__g_, c1__g_);
6048								c3 = _mm_avg_epu16(c2__g_, c3__g_);
6049								c5 = _mm_avg_epu16(c4__g_, c5__g_);
6050								c7 = _mm_avg_epu16(c6__g_, c7__g_);
6051								c9 = _mm_avg_epu16(c8__g_, c9__g_);
6052								cB = _mm_avg_epu16(cA__g_, cB__g_);
6053								cD = _mm_avg_epu16(cC__g_, cD__g_);
6054								cF = _mm_avg_epu16(cE__g_, cF__g_);
6055								c1 = _mm_avg_epu8(c1, c3);
6056								c5 = _mm_avg_epu8(c5, c7);
6057								c9 = _mm_avg_epu8(c9, cB);
6058								cD = _mm_avg_epu8(cD, cF);
6059								c1 = _mm_avg_epu8(c1, c5);
6060								c9 = _mm_avg_epu8(c9, cD);
6061								c1 = _mm_avg_epu8(c1, c9);
6062								c1 = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
6063								c0 = _mm_or_si128(c0, c1);
6064
6065								_mm_store_si128((__m128i*)(source0 + 2 * x), c0);
6066							}
6067
6068							source0 += pitch;
6069							source1 += pitch;
6070							source2 += pitch;
6071							source3 += pitch;
6072							source4 += pitch;
6073							source5 += pitch;
6074							source6 += pitch;
6075							source7 += pitch;
6076							source8 += pitch;
6077							source9 += pitch;
6078							sourceA += pitch;
6079							sourceB += pitch;
6080							sourceC += pitch;
6081							sourceD += pitch;
6082							sourceE += pitch;
6083							sourceF += pitch;
6084						}
6085					}
6086					else ASSERT(false);
6087				}
6088				else
6089			#endif
6090			{
6091				#define AVERAGE(x, y) (((x) & (y)) + ((((x) ^ (y)) >> 1) & 0x7BEF) + (((x) ^ (y)) & 0x0821))
6092
6093				if(internal.samples == 2)
6094				{
6095					for(int y = 0; y < height; y++)
6096					{
6097						for(int x = 0; x < width; x++)
6098						{
6099							unsigned short c0 = *(unsigned short*)(source0 + 2 * x);
6100							unsigned short c1 = *(unsigned short*)(source1 + 2 * x);
6101
6102							c0 = AVERAGE(c0, c1);
6103
6104							*(unsigned short*)(source0 + 2 * x) = c0;
6105						}
6106
6107						source0 += pitch;
6108						source1 += pitch;
6109					}
6110				}
6111				else if(internal.samples == 4)
6112				{
6113					for(int y = 0; y < height; y++)
6114					{
6115						for(int x = 0; x < width; x++)
6116						{
6117							unsigned short c0 = *(unsigned short*)(source0 + 2 * x);
6118							unsigned short c1 = *(unsigned short*)(source1 + 2 * x);
6119							unsigned short c2 = *(unsigned short*)(source2 + 2 * x);
6120							unsigned short c3 = *(unsigned short*)(source3 + 2 * x);
6121
6122							c0 = AVERAGE(c0, c1);
6123							c2 = AVERAGE(c2, c3);
6124							c0 = AVERAGE(c0, c2);
6125
6126							*(unsigned short*)(source0 + 2 * x) = c0;
6127						}
6128
6129						source0 += pitch;
6130						source1 += pitch;
6131						source2 += pitch;
6132						source3 += pitch;
6133					}
6134				}
6135				else if(internal.samples == 8)
6136				{
6137					for(int y = 0; y < height; y++)
6138					{
6139						for(int x = 0; x < width; x++)
6140						{
6141							unsigned short c0 = *(unsigned short*)(source0 + 2 * x);
6142							unsigned short c1 = *(unsigned short*)(source1 + 2 * x);
6143							unsigned short c2 = *(unsigned short*)(source2 + 2 * x);
6144							unsigned short c3 = *(unsigned short*)(source3 + 2 * x);
6145							unsigned short c4 = *(unsigned short*)(source4 + 2 * x);
6146							unsigned short c5 = *(unsigned short*)(source5 + 2 * x);
6147							unsigned short c6 = *(unsigned short*)(source6 + 2 * x);
6148							unsigned short c7 = *(unsigned short*)(source7 + 2 * x);
6149
6150							c0 = AVERAGE(c0, c1);
6151							c2 = AVERAGE(c2, c3);
6152							c4 = AVERAGE(c4, c5);
6153							c6 = AVERAGE(c6, c7);
6154							c0 = AVERAGE(c0, c2);
6155							c4 = AVERAGE(c4, c6);
6156							c0 = AVERAGE(c0, c4);
6157
6158							*(unsigned short*)(source0 + 2 * x) = c0;
6159						}
6160
6161						source0 += pitch;
6162						source1 += pitch;
6163						source2 += pitch;
6164						source3 += pitch;
6165						source4 += pitch;
6166						source5 += pitch;
6167						source6 += pitch;
6168						source7 += pitch;
6169					}
6170				}
6171				else if(internal.samples == 16)
6172				{
6173					for(int y = 0; y < height; y++)
6174					{
6175						for(int x = 0; x < width; x++)
6176						{
6177							unsigned short c0 = *(unsigned short*)(source0 + 2 * x);
6178							unsigned short c1 = *(unsigned short*)(source1 + 2 * x);
6179							unsigned short c2 = *(unsigned short*)(source2 + 2 * x);
6180							unsigned short c3 = *(unsigned short*)(source3 + 2 * x);
6181							unsigned short c4 = *(unsigned short*)(source4 + 2 * x);
6182							unsigned short c5 = *(unsigned short*)(source5 + 2 * x);
6183							unsigned short c6 = *(unsigned short*)(source6 + 2 * x);
6184							unsigned short c7 = *(unsigned short*)(source7 + 2 * x);
6185							unsigned short c8 = *(unsigned short*)(source8 + 2 * x);
6186							unsigned short c9 = *(unsigned short*)(source9 + 2 * x);
6187							unsigned short cA = *(unsigned short*)(sourceA + 2 * x);
6188							unsigned short cB = *(unsigned short*)(sourceB + 2 * x);
6189							unsigned short cC = *(unsigned short*)(sourceC + 2 * x);
6190							unsigned short cD = *(unsigned short*)(sourceD + 2 * x);
6191							unsigned short cE = *(unsigned short*)(sourceE + 2 * x);
6192							unsigned short cF = *(unsigned short*)(sourceF + 2 * x);
6193
6194							c0 = AVERAGE(c0, c1);
6195							c2 = AVERAGE(c2, c3);
6196							c4 = AVERAGE(c4, c5);
6197							c6 = AVERAGE(c6, c7);
6198							c8 = AVERAGE(c8, c9);
6199							cA = AVERAGE(cA, cB);
6200							cC = AVERAGE(cC, cD);
6201							cE = AVERAGE(cE, cF);
6202							c0 = AVERAGE(c0, c2);
6203							c4 = AVERAGE(c4, c6);
6204							c8 = AVERAGE(c8, cA);
6205							cC = AVERAGE(cC, cE);
6206							c0 = AVERAGE(c0, c4);
6207							c8 = AVERAGE(c8, cC);
6208							c0 = AVERAGE(c0, c8);
6209
6210							*(unsigned short*)(source0 + 2 * x) = c0;
6211						}
6212
6213						source0 += pitch;
6214						source1 += pitch;
6215						source2 += pitch;
6216						source3 += pitch;
6217						source4 += pitch;
6218						source5 += pitch;
6219						source6 += pitch;
6220						source7 += pitch;
6221						source8 += pitch;
6222						source9 += pitch;
6223						sourceA += pitch;
6224						sourceB += pitch;
6225						sourceC += pitch;
6226						sourceD += pitch;
6227						sourceE += pitch;
6228						sourceF += pitch;
6229					}
6230				}
6231				else ASSERT(false);
6232
6233				#undef AVERAGE
6234			}
6235		}
6236		else
6237		{
6238		//	UNIMPLEMENTED();
6239		}
6240	}
6241}
6242