Surface.cpp revision 0bac285a78df6a6d7a6b68784748b92805420ffb
1// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//    http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15#include "Surface.hpp"
16
17#include "Color.hpp"
18#include "Context.hpp"
19#include "ETC_Decoder.hpp"
20#include "Renderer.hpp"
21#include "Common/Half.hpp"
22#include "Common/Memory.hpp"
23#include "Common/CPUID.hpp"
24#include "Common/Resource.hpp"
25#include "Common/Debug.hpp"
26#include "Reactor/Reactor.hpp"
27
28#include <xmmintrin.h>
29#include <emmintrin.h>
30
31#undef min
32#undef max
33
34namespace sw
35{
36	extern bool quadLayoutEnabled;
37	extern bool complementaryDepthBuffer;
38	extern TranscendentalPrecision logPrecision;
39
40	unsigned int *Surface::palette = 0;
41	unsigned int Surface::paletteID = 0;
42
43	void Rect::clip(int minX, int minY, int maxX, int maxY)
44	{
45		x0 = clamp(x0, minX, maxX);
46		y0 = clamp(y0, minY, maxY);
47		x1 = clamp(x1, minX, maxX);
48		y1 = clamp(y1, minY, maxY);
49	}
50
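	// Write a single texel. The coordinate overloads compute the element address from the per-pixel byte size and the row/slice pitches, then defer to the format-specific encoder below.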
51	void Surface::Buffer::write(int x, int y, int z, const Color<float> &color)
52	{
53		void *element = (unsigned char*)buffer + x * bytes + y * pitchB + z * sliceB;
54
55		write(element, color);
56	}
57
58	void Surface::Buffer::write(int x, int y, const Color<float> &color)
59	{
60		void *element = (unsigned char*)buffer + x * bytes + y * pitchB;
61
62		write(element, color);
63	}
64
65	inline void Surface::Buffer::write(void *element, const Color<float> &color)
66	{
67		switch(format)
68		{
69		case FORMAT_A8:
70			*(unsigned char*)element = unorm<8>(color.a);
71			break;
72		case FORMAT_R8I_SNORM:
73			*(char*)element = snorm<8>(color.r);
74			break;
75		case FORMAT_R8:
76			*(unsigned char*)element = unorm<8>(color.r);
77			break;
78		case FORMAT_R8I:
79			*(char*)element = scast<8>(color.r);
80			break;
81		case FORMAT_R8UI:
82			*(unsigned char*)element = ucast<8>(color.r);
83			break;
84		case FORMAT_R16I:
85			*(short*)element = scast<16>(color.r);
86			break;
87		case FORMAT_R16UI:
88			*(unsigned short*)element = ucast<16>(color.r);
89			break;
90		case FORMAT_R32I:
91			*(int*)element = static_cast<int>(color.r);
92			break;
93		case FORMAT_R32UI:
94			*(unsigned int*)element = static_cast<unsigned int>(color.r);
95			break;
96		case FORMAT_R3G3B2:
97			*(unsigned char*)element = (unorm<3>(color.r) << 5) | (unorm<3>(color.g) << 2) | (unorm<2>(color.b) << 0);
98			break;
99		case FORMAT_A8R3G3B2:
100			*(unsigned short*)element = (unorm<8>(color.a) << 8) | (unorm<3>(color.r) << 5) | (unorm<3>(color.g) << 2) | (unorm<2>(color.b) << 0);
101			break;
102		case FORMAT_X4R4G4B4:
103			*(unsigned short*)element = 0xF000 | (unorm<4>(color.r) << 8) | (unorm<4>(color.g) << 4) | (unorm<4>(color.b) << 0);
104			break;
105		case FORMAT_A4R4G4B4:
106			*(unsigned short*)element = (unorm<4>(color.a) << 12) | (unorm<4>(color.r) << 8) | (unorm<4>(color.g) << 4) | (unorm<4>(color.b) << 0);
107			break;
108		case FORMAT_R4G4B4A4:
109			*(unsigned short*)element = (unorm<4>(color.r) << 12) | (unorm<4>(color.g) << 8) | (unorm<4>(color.b) << 4) | (unorm<4>(color.a) << 0);
110			break;
111		case FORMAT_R5G6B5:
112			*(unsigned short*)element = (unorm<5>(color.r) << 11) | (unorm<6>(color.g) << 5) | (unorm<5>(color.b) << 0);
113			break;
114		case FORMAT_A1R5G5B5:
115			*(unsigned short*)element = (unorm<1>(color.a) << 15) | (unorm<5>(color.r) << 10) | (unorm<5>(color.g) << 5) | (unorm<5>(color.b) << 0);
116			break;
117		case FORMAT_R5G5B5A1:
118			*(unsigned short*)element = (unorm<5>(color.r) << 11) | (unorm<5>(color.g) << 6) | (unorm<5>(color.b) << 1) | (unorm<1>(color.a) << 0);
119			break;
120		case FORMAT_X1R5G5B5:
121			*(unsigned short*)element = 0x8000 | (unorm<5>(color.r) << 10) | (unorm<5>(color.g) << 5) | (unorm<5>(color.b) << 0);
122			break;
123		case FORMAT_A8R8G8B8:
124			*(unsigned int*)element = (unorm<8>(color.a) << 24) | (unorm<8>(color.r) << 16) | (unorm<8>(color.g) << 8) | (unorm<8>(color.b) << 0);
125			break;
126		case FORMAT_X8R8G8B8:
127			*(unsigned int*)element = 0xFF000000 | (unorm<8>(color.r) << 16) | (unorm<8>(color.g) << 8) | (unorm<8>(color.b) << 0);
128			break;
129		case FORMAT_A8B8G8R8I_SNORM:
130			*(unsigned int*)element = (static_cast<unsigned int>(snorm<8>(color.a)) << 24) |
131			                          (static_cast<unsigned int>(snorm<8>(color.b)) << 16) |
132			                          (static_cast<unsigned int>(snorm<8>(color.g)) << 8) |
133			                          (static_cast<unsigned int>(snorm<8>(color.r)) << 0);
134			break;
135		case FORMAT_A8B8G8R8:
136		case FORMAT_SRGB8_A8:
137			*(unsigned int*)element = (unorm<8>(color.a) << 24) | (unorm<8>(color.b) << 16) | (unorm<8>(color.g) << 8) | (unorm<8>(color.r) << 0);
138			break;
139		case FORMAT_A8B8G8R8I:
140			*(unsigned int*)element = (static_cast<unsigned int>(scast<8>(color.a)) << 24) |
141			                          (static_cast<unsigned int>(scast<8>(color.b)) << 16) |
142			                          (static_cast<unsigned int>(scast<8>(color.g)) << 8) |
143			                          (static_cast<unsigned int>(scast<8>(color.r)) << 0);
144			break;
145		case FORMAT_A8B8G8R8UI:
146			*(unsigned int*)element = (ucast<8>(color.a) << 24) | (ucast<8>(color.b) << 16) | (ucast<8>(color.g) << 8) | (ucast<8>(color.r) << 0);
147			break;
148		case FORMAT_X8B8G8R8I_SNORM:
149			*(unsigned int*)element = 0x7F000000 |
150			                          (static_cast<unsigned int>(snorm<8>(color.b)) << 16) |
151			                          (static_cast<unsigned int>(snorm<8>(color.g)) << 8) |
152			                          (static_cast<unsigned int>(snorm<8>(color.r)) << 0);
153			break;
154		case FORMAT_X8B8G8R8:
155		case FORMAT_SRGB8_X8:
156			*(unsigned int*)element = 0xFF000000 | (unorm<8>(color.b) << 16) | (unorm<8>(color.g) << 8) | (unorm<8>(color.r) << 0);
157			break;
158		case FORMAT_X8B8G8R8I:
159			*(unsigned int*)element = 0x7F000000 |
160			                          (static_cast<unsigned int>(scast<8>(color.b)) << 16) |
161			                          (static_cast<unsigned int>(scast<8>(color.g)) << 8) |
162			                          (static_cast<unsigned int>(scast<8>(color.r)) << 0);
			break;
163		case FORMAT_X8B8G8R8UI:
164			*(unsigned int*)element = 0xFF000000 | (ucast<8>(color.b) << 16) | (ucast<8>(color.g) << 8) | (ucast<8>(color.r) << 0);
165			break;
166		case FORMAT_A2R10G10B10:
167			*(unsigned int*)element = (unorm<2>(color.a) << 30) | (unorm<10>(color.r) << 20) | (unorm<10>(color.g) << 10) | (unorm<10>(color.b) << 0);
168			break;
169		case FORMAT_A2B10G10R10:
170			*(unsigned int*)element = (unorm<2>(color.a) << 30) | (unorm<10>(color.b) << 20) | (unorm<10>(color.g) << 10) | (unorm<10>(color.r) << 0);
171			break;
172		case FORMAT_G8R8I_SNORM:
173			*(unsigned short*)element = (static_cast<unsigned short>(snorm<8>(color.g)) << 8) |
174			                            (static_cast<unsigned short>(snorm<8>(color.r)) << 0);
175			break;
176		case FORMAT_G8R8:
177			*(unsigned short*)element = (unorm<8>(color.g) << 8) | (unorm<8>(color.r) << 0);
178			break;
179		case FORMAT_G8R8I:
180			*(unsigned short*)element = (static_cast<unsigned short>(scast<8>(color.g)) << 8) |
181			                            (static_cast<unsigned short>(scast<8>(color.r)) << 0);
182			break;
183		case FORMAT_G8R8UI:
184			*(unsigned short*)element = (ucast<8>(color.g) << 8) | (ucast<8>(color.r) << 0);
185			break;
186		case FORMAT_G16R16:
187			*(unsigned int*)element = (unorm<16>(color.g) << 16) | (unorm<16>(color.r) << 0);
188			break;
189		case FORMAT_G16R16I:
190			*(unsigned int*)element = (static_cast<unsigned int>(scast<16>(color.g)) << 16) |
191			                          (static_cast<unsigned int>(scast<16>(color.r)) << 0);
192			break;
193		case FORMAT_G16R16UI:
194			*(unsigned int*)element = (ucast<16>(color.g) << 16) | (ucast<16>(color.r) << 0);
195			break;
196		case FORMAT_G32R32I:
197		case FORMAT_G32R32UI:
198			((unsigned int*)element)[0] = static_cast<unsigned int>(color.r);
199			((unsigned int*)element)[1] = static_cast<unsigned int>(color.g);
200			break;
201		case FORMAT_A16B16G16R16:
202			((unsigned short*)element)[0] = unorm<16>(color.r);
203			((unsigned short*)element)[1] = unorm<16>(color.g);
204			((unsigned short*)element)[2] = unorm<16>(color.b);
205			((unsigned short*)element)[3] = unorm<16>(color.a);
206			break;
207		case FORMAT_A16B16G16R16I:
208			((unsigned short*)element)[0] = static_cast<unsigned short>(scast<16>(color.r));
209			((unsigned short*)element)[1] = static_cast<unsigned short>(scast<16>(color.g));
210			((unsigned short*)element)[2] = static_cast<unsigned short>(scast<16>(color.b));
211			((unsigned short*)element)[3] = static_cast<unsigned short>(scast<16>(color.a));
212			break;
213		case FORMAT_A16B16G16R16UI:
214			((unsigned short*)element)[0] = static_cast<unsigned short>(ucast<16>(color.r));
215			((unsigned short*)element)[1] = static_cast<unsigned short>(ucast<16>(color.g));
216			((unsigned short*)element)[2] = static_cast<unsigned short>(ucast<16>(color.b));
217			((unsigned short*)element)[3] = static_cast<unsigned short>(ucast<16>(color.a));
218			break;
219		case FORMAT_X16B16G16R16I:
220			((unsigned short*)element)[0] = static_cast<unsigned short>(scast<16>(color.r));
221			((unsigned short*)element)[1] = static_cast<unsigned short>(scast<16>(color.g));
222			((unsigned short*)element)[2] = static_cast<unsigned short>(scast<16>(color.b));
223			break;
224		case FORMAT_X16B16G16R16UI:
225			((unsigned short*)element)[0] = static_cast<unsigned short>(ucast<16>(color.r));
226			((unsigned short*)element)[1] = static_cast<unsigned short>(ucast<16>(color.g));
227			((unsigned short*)element)[2] = static_cast<unsigned short>(ucast<16>(color.b));
228			break;
229		case FORMAT_A32B32G32R32I:
230		case FORMAT_A32B32G32R32UI:
231			((unsigned int*)element)[0] = static_cast<unsigned int>(color.r);
232			((unsigned int*)element)[1] = static_cast<unsigned int>(color.g);
233			((unsigned int*)element)[2] = static_cast<unsigned int>(color.b);
234			((unsigned int*)element)[3] = static_cast<unsigned int>(color.a);
235			break;
236		case FORMAT_X32B32G32R32I:
237		case FORMAT_X32B32G32R32UI:
238			((unsigned int*)element)[0] = static_cast<unsigned int>(color.r);
239			((unsigned int*)element)[1] = static_cast<unsigned int>(color.g);
240			((unsigned int*)element)[2] = static_cast<unsigned int>(color.b);
241			break;
242		case FORMAT_V8U8:
243			*(unsigned short*)element = (snorm<8>(color.g) << 8) | (snorm<8>(color.r) << 0);
244			break;
245		case FORMAT_L6V5U5:
246			*(unsigned short*)element = (unorm<6>(color.b) << 10) | (snorm<5>(color.g) << 5) | (snorm<5>(color.r) << 0);
247			break;
248		case FORMAT_Q8W8V8U8:
249			*(unsigned int*)element = (snorm<8>(color.a) << 24) | (snorm<8>(color.b) << 16) | (snorm<8>(color.g) << 8) | (snorm<8>(color.r) << 0);
250			break;
251		case FORMAT_X8L8V8U8:
252			*(unsigned int*)element = 0xFF000000 | (unorm<8>(color.b) << 16) | (snorm<8>(color.g) << 8) | (snorm<8>(color.r) << 0);
253			break;
254		case FORMAT_V16U16:
255			*(unsigned int*)element = (snorm<16>(color.g) << 16) | (snorm<16>(color.r) << 0);
256			break;
257		case FORMAT_A2W10V10U10:
258			*(unsigned int*)element = (unorm<2>(color.a) << 30) | (snorm<10>(color.b) << 20) | (snorm<10>(color.g) << 10) | (snorm<10>(color.r) << 0);
259			break;
260		case FORMAT_A16W16V16U16:
261			((unsigned short*)element)[0] = snorm<16>(color.r);
262			((unsigned short*)element)[1] = snorm<16>(color.g);
263			((unsigned short*)element)[2] = snorm<16>(color.b);
264			((unsigned short*)element)[3] = unorm<16>(color.a);
265			break;
266		case FORMAT_Q16W16V16U16:
267			((unsigned short*)element)[0] = snorm<16>(color.r);
268			((unsigned short*)element)[1] = snorm<16>(color.g);
269			((unsigned short*)element)[2] = snorm<16>(color.b);
270			((unsigned short*)element)[3] = snorm<16>(color.a);
271			break;
272		case FORMAT_R8G8B8:
273			((unsigned char*)element)[0] = unorm<8>(color.b);
274			((unsigned char*)element)[1] = unorm<8>(color.g);
275			((unsigned char*)element)[2] = unorm<8>(color.r);
276			break;
277		case FORMAT_B8G8R8:
278			((unsigned char*)element)[0] = unorm<8>(color.r);
279			((unsigned char*)element)[1] = unorm<8>(color.g);
280			((unsigned char*)element)[2] = unorm<8>(color.b);
281			break;
282		case FORMAT_R16F:
283			*(half*)element = (half)color.r;
284			break;
285		case FORMAT_A16F:
286			*(half*)element = (half)color.a;
287			break;
288		case FORMAT_G16R16F:
289			((half*)element)[0] = (half)color.r;
290			((half*)element)[1] = (half)color.g;
291			break;
292		case FORMAT_B16G16R16F:
293			((half*)element)[0] = (half)color.r;
294			((half*)element)[1] = (half)color.g;
295			((half*)element)[2] = (half)color.b;
296			break;
297		case FORMAT_A16B16G16R16F:
298			((half*)element)[0] = (half)color.r;
299			((half*)element)[1] = (half)color.g;
300			((half*)element)[2] = (half)color.b;
301			((half*)element)[3] = (half)color.a;
302			break;
303		case FORMAT_A32F:
304			*(float*)element = color.a;
305			break;
306		case FORMAT_R32F:
307			*(float*)element = color.r;
308			break;
309		case FORMAT_G32R32F:
310			((float*)element)[0] = color.r;
311			((float*)element)[1] = color.g;
312			break;
313		case FORMAT_X32B32G32R32F:
314			((float*)element)[3] = 1.0f;   // Intentional fall-through: the case below writes the RGB components.
315		case FORMAT_B32G32R32F:
316			((float*)element)[0] = color.r;
317			((float*)element)[1] = color.g;
318			((float*)element)[2] = color.b;
319			break;
320		case FORMAT_A32B32G32R32F:
321			((float*)element)[0] = color.r;
322			((float*)element)[1] = color.g;
323			((float*)element)[2] = color.b;
324			((float*)element)[3] = color.a;
325			break;
326		case FORMAT_D32F:
327		case FORMAT_D32F_LOCKABLE:
328		case FORMAT_D32FS8_TEXTURE:
329		case FORMAT_D32FS8_SHADOW:
330			*((float*)element) = color.r;
331			break;
332		case FORMAT_D32F_COMPLEMENTARY:
333			*((float*)element) = 1 - color.r;
334			break;
335		case FORMAT_S8:
336			*((unsigned char*)element) = unorm<8>(color.r);
337			break;
338		case FORMAT_L8:
339			*(unsigned char*)element = unorm<8>(color.r);
340			break;
341		case FORMAT_A4L4:
342			*(unsigned char*)element = (unorm<4>(color.a) << 4) | (unorm<4>(color.r) << 0);
343			break;
344		case FORMAT_L16:
345			*(unsigned short*)element = unorm<16>(color.r);
346			break;
347		case FORMAT_A8L8:
348			*(unsigned short*)element = (unorm<8>(color.a) << 8) | (unorm<8>(color.r) << 0);
349			break;
350		case FORMAT_L16F:
351			*(half*)element = (half)color.r;
352			break;
353		case FORMAT_A16L16F:
354			((half*)element)[0] = (half)color.r;
355			((half*)element)[1] = (half)color.a;
356			break;
357		case FORMAT_L32F:
358			*(float*)element = color.r;
359			break;
360		case FORMAT_A32L32F:
361			((float*)element)[0] = color.r;
362			((float*)element)[1] = color.a;
363			break;
364		default:
365			ASSERT(false);
366		}
367	}
368
369	Color<float> Surface::Buffer::read(int x, int y, int z) const
370	{
371		void *element = (unsigned char*)buffer + x * bytes + y * pitchB + z * sliceB;
372
373		return read(element);
374	}
375
376	Color<float> Surface::Buffer::read(int x, int y) const
377	{
378		void *element = (unsigned char*)buffer + x * bytes + y * pitchB;
379
380		return read(element);
381	}
382
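	// Decode one element of 'format' into a Color<float>. Missing channels default to 0 (1 for alpha), and pure integer formats are returned unconverted rather than normalized.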
383	inline Color<float> Surface::Buffer::read(void *element) const
384	{
385		float r = 0.0f;
386		float g = 0.0f;
387		float b = 0.0f;
388		float a = 1.0f;
389
390		switch(format)
391		{
392		case FORMAT_P8:
393			{
394				ASSERT(palette);
395
396				unsigned int abgr = palette[*(unsigned char*)element];
397
398				r = (abgr & 0x000000FF) * (1.0f / 0x000000FF);
399				g = (abgr & 0x0000FF00) * (1.0f / 0x0000FF00);
400				b = (abgr & 0x00FF0000) * (1.0f / 0x00FF0000);
401				a = (abgr & 0xFF000000) * (1.0f / 0xFF000000);
402			}
403			break;
404		case FORMAT_A8P8:
405			{
406				ASSERT(palette);
407
408				unsigned int bgr = palette[((unsigned char*)element)[0]];
409
410				r = (bgr & 0x000000FF) * (1.0f / 0x000000FF);
411				g = (bgr & 0x0000FF00) * (1.0f / 0x0000FF00);
412				b = (bgr & 0x00FF0000) * (1.0f / 0x00FF0000);
413				a = ((unsigned char*)element)[1] * (1.0f / 0xFF);
414			}
415			break;
416		case FORMAT_A8:
417			r = 0;
418			g = 0;
419			b = 0;
420			a = *(unsigned char*)element * (1.0f / 0xFF);
421			break;
422		case FORMAT_R8I_SNORM:
423			r = max((*(signed char*)element) * (1.0f / 0x7F), -1.0f);
424			break;
425		case FORMAT_R8:
426			r = *(unsigned char*)element * (1.0f / 0xFF);
427			break;
428		case FORMAT_R8I:
429			r = *(signed char*)element;
430			break;
431		case FORMAT_R8UI:
432			r = *(unsigned char*)element;
433			break;
434		case FORMAT_R3G3B2:
435			{
436				unsigned char rgb = *(unsigned char*)element;
437
438				r = (rgb & 0xE0) * (1.0f / 0xE0);
439				g = (rgb & 0x1C) * (1.0f / 0x1C);
440				b = (rgb & 0x03) * (1.0f / 0x03);
441			}
442			break;
443		case FORMAT_A8R3G3B2:
444			{
445				unsigned short argb = *(unsigned short*)element;
446
447				a = (argb & 0xFF00) * (1.0f / 0xFF00);
448				r = (argb & 0x00E0) * (1.0f / 0x00E0);
449				g = (argb & 0x001C) * (1.0f / 0x001C);
450				b = (argb & 0x0003) * (1.0f / 0x0003);
451			}
452			break;
453		case FORMAT_X4R4G4B4:
454			{
455				unsigned short rgb = *(unsigned short*)element;
456
457				r = (rgb & 0x0F00) * (1.0f / 0x0F00);
458				g = (rgb & 0x00F0) * (1.0f / 0x00F0);
459				b = (rgb & 0x000F) * (1.0f / 0x000F);
460			}
461			break;
462		case FORMAT_A4R4G4B4:
463			{
464				unsigned short argb = *(unsigned short*)element;
465
466				a = (argb & 0xF000) * (1.0f / 0xF000);
467				r = (argb & 0x0F00) * (1.0f / 0x0F00);
468				g = (argb & 0x00F0) * (1.0f / 0x00F0);
469				b = (argb & 0x000F) * (1.0f / 0x000F);
470			}
471			break;
472		case FORMAT_R4G4B4A4:
473			{
474				unsigned short rgba = *(unsigned short*)element;
475
476				r = (rgba & 0xF000) * (1.0f / 0xF000);
477				g = (rgba & 0x0F00) * (1.0f / 0x0F00);
478				b = (rgba & 0x00F0) * (1.0f / 0x00F0);
479				a = (rgba & 0x000F) * (1.0f / 0x000F);
480			}
481			break;
482		case FORMAT_R5G6B5:
483			{
484				unsigned short rgb = *(unsigned short*)element;
485
486				r = (rgb & 0xF800) * (1.0f / 0xF800);
487				g = (rgb & 0x07E0) * (1.0f / 0x07E0);
488				b = (rgb & 0x001F) * (1.0f / 0x001F);
489			}
490			break;
491		case FORMAT_A1R5G5B5:
492			{
493				unsigned short argb = *(unsigned short*)element;
494
495				a = (argb & 0x8000) * (1.0f / 0x8000);
496				r = (argb & 0x7C00) * (1.0f / 0x7C00);
497				g = (argb & 0x03E0) * (1.0f / 0x03E0);
498				b = (argb & 0x001F) * (1.0f / 0x001F);
499			}
500			break;
501		case FORMAT_R5G5B5A1:
502			{
503				unsigned short rgba = *(unsigned short*)element;
504
505				r = (rgba & 0xF800) * (1.0f / 0xF800);
506				g = (rgba & 0x07C0) * (1.0f / 0x07C0);
507				b = (rgba & 0x003E) * (1.0f / 0x003E);
508				a = (rgba & 0x0001) * (1.0f / 0x0001);
509			}
510			break;
511		case FORMAT_X1R5G5B5:
512			{
513				unsigned short xrgb = *(unsigned short*)element;
514
515				r = (xrgb & 0x7C00) * (1.0f / 0x7C00);
516				g = (xrgb & 0x03E0) * (1.0f / 0x03E0);
517				b = (xrgb & 0x001F) * (1.0f / 0x001F);
518			}
519			break;
520		case FORMAT_A8R8G8B8:
521			{
522				unsigned int argb = *(unsigned int*)element;
523
524				a = (argb & 0xFF000000) * (1.0f / 0xFF000000);
525				r = (argb & 0x00FF0000) * (1.0f / 0x00FF0000);
526				g = (argb & 0x0000FF00) * (1.0f / 0x0000FF00);
527				b = (argb & 0x000000FF) * (1.0f / 0x000000FF);
528			}
529			break;
530		case FORMAT_X8R8G8B8:
531			{
532				unsigned int xrgb = *(unsigned int*)element;
533
534				r = (xrgb & 0x00FF0000) * (1.0f / 0x00FF0000);
535				g = (xrgb & 0x0000FF00) * (1.0f / 0x0000FF00);
536				b = (xrgb & 0x000000FF) * (1.0f / 0x000000FF);
537			}
538			break;
539		case FORMAT_A8B8G8R8I_SNORM:
540			{
541				signed char* abgr = (signed char*)element;
542
543				r = max(abgr[0] * (1.0f / 0x7F), -1.0f);
544				g = max(abgr[1] * (1.0f / 0x7F), -1.0f);
545				b = max(abgr[2] * (1.0f / 0x7F), -1.0f);
546				a = max(abgr[3] * (1.0f / 0x7F), -1.0f);
547			}
548			break;
549		case FORMAT_A8B8G8R8:
550		case FORMAT_SRGB8_A8:
551			{
552				unsigned int abgr = *(unsigned int*)element;
553
554				a = (abgr & 0xFF000000) * (1.0f / 0xFF000000);
555				b = (abgr & 0x00FF0000) * (1.0f / 0x00FF0000);
556				g = (abgr & 0x0000FF00) * (1.0f / 0x0000FF00);
557				r = (abgr & 0x000000FF) * (1.0f / 0x000000FF);
558			}
559			break;
560		case FORMAT_A8B8G8R8I:
561			{
562				signed char* abgr = (signed char*)element;
563
564				r = abgr[0];
565				g = abgr[1];
566				b = abgr[2];
567				a = abgr[3];
568			}
569			break;
570		case FORMAT_A8B8G8R8UI:
571			{
572				unsigned char* abgr = (unsigned char*)element;
573
574				r = abgr[0];
575				g = abgr[1];
576				b = abgr[2];
577				a = abgr[3];
578			}
579			break;
580		case FORMAT_X8B8G8R8I_SNORM:
581			{
582				signed char* bgr = (signed char*)element;
583
584				r = max(bgr[0] * (1.0f / 0x7F), -1.0f);
585				g = max(bgr[1] * (1.0f / 0x7F), -1.0f);
586				b = max(bgr[2] * (1.0f / 0x7F), -1.0f);
587			}
588			break;
589		case FORMAT_X8B8G8R8:
590		case FORMAT_SRGB8_X8:
591			{
592				unsigned int xbgr = *(unsigned int*)element;
593
594				b = (xbgr & 0x00FF0000) * (1.0f / 0x00FF0000);
595				g = (xbgr & 0x0000FF00) * (1.0f / 0x0000FF00);
596				r = (xbgr & 0x000000FF) * (1.0f / 0x000000FF);
597			}
598			break;
599		case FORMAT_X8B8G8R8I:
600			{
601				signed char* bgr = (signed char*)element;
602
603				r = bgr[0];
604				g = bgr[1];
605				b = bgr[2];
606			}
607			break;
608		case FORMAT_X8B8G8R8UI:
609			{
610				unsigned char* bgr = (unsigned char*)element;
611
612				r = bgr[0];
613				g = bgr[1];
614				b = bgr[2];
615			}
616			break;
617		case FORMAT_G8R8I_SNORM:
618			{
619				signed char* gr = (signed char*)element;
620
621				r = max(gr[0] * (1.0f / 0x7F), -1.0f);
622				g = max(gr[1] * (1.0f / 0x7F), -1.0f);
623			}
624			break;
625		case FORMAT_G8R8:
626			{
627				unsigned short gr = *(unsigned short*)element;
628
629				g = (gr & 0xFF00) * (1.0f / 0xFF00);
630				r = (gr & 0x00FF) * (1.0f / 0x00FF);
631			}
632			break;
633		case FORMAT_G8R8I:
634			{
635				signed char* gr = (signed char*)element;
636
637				r = gr[0];
638				g = gr[1];
639			}
640			break;
641		case FORMAT_G8R8UI:
642			{
643				unsigned char* gr = (unsigned char*)element;
644
645				r = gr[0];
646				g = gr[1];
647			}
648			break;
649		case FORMAT_R16I:
650			r = *((short*)element);
651			break;
652		case FORMAT_R16UI:
653			r = *((unsigned short*)element);
654			break;
655		case FORMAT_G16R16I:
656			{
657				short* gr = (short*)element;
658
659				r = gr[0];
660				g = gr[1];
661			}
662			break;
663		case FORMAT_G16R16:
664			{
665				unsigned int gr = *(unsigned int*)element;
666
667				g = (gr & 0xFFFF0000) * (1.0f / 0xFFFF0000);
668				r = (gr & 0x0000FFFF) * (1.0f / 0x0000FFFF);
669			}
670			break;
671		case FORMAT_G16R16UI:
672			{
673				unsigned short* gr = (unsigned short*)element;
674
675				r = gr[0];
676				g = gr[1];
677			}
678			break;
679		case FORMAT_A2R10G10B10:
680			{
681				unsigned int argb = *(unsigned int*)element;
682
683				a = (argb & 0xC0000000) * (1.0f / 0xC0000000);
684				r = (argb & 0x3FF00000) * (1.0f / 0x3FF00000);
685				g = (argb & 0x000FFC00) * (1.0f / 0x000FFC00);
686				b = (argb & 0x000003FF) * (1.0f / 0x000003FF);
687			}
688			break;
689		case FORMAT_A2B10G10R10:
690			{
691				unsigned int abgr = *(unsigned int*)element;
692
693				a = (abgr & 0xC0000000) * (1.0f / 0xC0000000);
694				b = (abgr & 0x3FF00000) * (1.0f / 0x3FF00000);
695				g = (abgr & 0x000FFC00) * (1.0f / 0x000FFC00);
696				r = (abgr & 0x000003FF) * (1.0f / 0x000003FF);
697			}
698			break;
699		case FORMAT_A16B16G16R16I:
700			{
701				short* abgr = (short*)element;
702
703				r = abgr[0];
704				g = abgr[1];
705				b = abgr[2];
706				a = abgr[3];
707			}
708			break;
709		case FORMAT_A16B16G16R16:
710			r = ((unsigned short*)element)[0] * (1.0f / 0xFFFF);
711			g = ((unsigned short*)element)[1] * (1.0f / 0xFFFF);
712			b = ((unsigned short*)element)[2] * (1.0f / 0xFFFF);
713			a = ((unsigned short*)element)[3] * (1.0f / 0xFFFF);
714			break;
715		case FORMAT_A16B16G16R16UI:
716			{
717				unsigned short* abgr = (unsigned short*)element;
718
719				r = abgr[0];
720				g = abgr[1];
721				b = abgr[2];
722				a = abgr[3];
723			}
724			break;
725		case FORMAT_X16B16G16R16I:
726			{
727				short* bgr = (short*)element;
728
729				r = bgr[0];
730				g = bgr[1];
731				b = bgr[2];
732			}
733			break;
734		case FORMAT_X16B16G16R16UI:
735			{
736				unsigned short* bgr = (unsigned short*)element;
737
738				r = bgr[0];
739				g = bgr[1];
740				b = bgr[2];
741			}
742			break;
743		case FORMAT_A32B32G32R32I:
744			{
745				int* abgr = (int*)element;
746
747				r = static_cast<float>(abgr[0]);
748				g = static_cast<float>(abgr[1]);
749				b = static_cast<float>(abgr[2]);
750				a = static_cast<float>(abgr[3]);
751			}
752			break;
753		case FORMAT_A32B32G32R32UI:
754			{
755				unsigned int* abgr = (unsigned int*)element;
756
757				r = static_cast<float>(abgr[0]);
758				g = static_cast<float>(abgr[1]);
759				b = static_cast<float>(abgr[2]);
760				a = static_cast<float>(abgr[3]);
761			}
762			break;
763		case FORMAT_X32B32G32R32I:
764			{
765				int* bgr = (int*)element;
766
767				r = static_cast<float>(bgr[0]);
768				g = static_cast<float>(bgr[1]);
769				b = static_cast<float>(bgr[2]);
770			}
771			break;
772		case FORMAT_X32B32G32R32UI:
773			{
774				unsigned int* bgr = (unsigned int*)element;
775
776				r = static_cast<float>(bgr[0]);
777				g = static_cast<float>(bgr[1]);
778				b = static_cast<float>(bgr[2]);
779			}
780			break;
781		case FORMAT_G32R32I:
782			{
783				int* gr = (int*)element;
784
785				r = static_cast<float>(gr[0]);
786				g = static_cast<float>(gr[1]);
787			}
788			break;
789		case FORMAT_G32R32UI:
790			{
791				unsigned int* gr = (unsigned int*)element;
792
793				r = static_cast<float>(gr[0]);
794				g = static_cast<float>(gr[1]);
795			}
796			break;
797		case FORMAT_R32I:
798			r = static_cast<float>(*((int*)element));
799			break;
800		case FORMAT_R32UI:
801			r = static_cast<float>(*((unsigned int*)element));
802			break;
803		case FORMAT_V8U8:
804			{
805				unsigned short vu = *(unsigned short*)element;
806
807				r = ((int)(vu & 0x00FF) << 24) * (1.0f / 0x7F000000);
808				g = ((int)(vu & 0xFF00) << 16) * (1.0f / 0x7F000000);
809			}
810			break;
811		case FORMAT_L6V5U5:
812			{
813				unsigned short lvu = *(unsigned short*)element;
814
815				r = ((int)(lvu & 0x001F) << 27) * (1.0f / 0x78000000);
816				g = ((int)(lvu & 0x03E0) << 22) * (1.0f / 0x78000000);
817				b = (lvu & 0xFC00) * (1.0f / 0xFC00);
818			}
819			break;
820		case FORMAT_Q8W8V8U8:
821			{
822				unsigned int qwvu = *(unsigned int*)element;
823
824				r = ((int)(qwvu & 0x000000FF) << 24) * (1.0f / 0x7F000000);
825				g = ((int)(qwvu & 0x0000FF00) << 16) * (1.0f / 0x7F000000);
826				b = ((int)(qwvu & 0x00FF0000) << 8)  * (1.0f / 0x7F000000);
827				a = ((int)(qwvu & 0xFF000000) << 0)  * (1.0f / 0x7F000000);
828			}
829			break;
830		case FORMAT_X8L8V8U8:
831			{
832				unsigned int xlvu = *(unsigned int*)element;
833
834				r = ((int)(xlvu & 0x000000FF) << 24) * (1.0f / 0x7F000000);
835				g = ((int)(xlvu & 0x0000FF00) << 16) * (1.0f / 0x7F000000);
836				b = (xlvu & 0x00FF0000) * (1.0f / 0x00FF0000);
837			}
838			break;
839		case FORMAT_R8G8B8:
840			r = ((unsigned char*)element)[2] * (1.0f / 0xFF);
841			g = ((unsigned char*)element)[1] * (1.0f / 0xFF);
842			b = ((unsigned char*)element)[0] * (1.0f / 0xFF);
843			break;
844		case FORMAT_B8G8R8:
845			r = ((unsigned char*)element)[0] * (1.0f / 0xFF);
846			g = ((unsigned char*)element)[1] * (1.0f / 0xFF);
847			b = ((unsigned char*)element)[2] * (1.0f / 0xFF);
848			break;
849		case FORMAT_V16U16:
850			{
851				unsigned int vu = *(unsigned int*)element;
852
853				r = ((int)(vu & 0x0000FFFF) << 16) * (1.0f / 0x7FFF0000);
854				g = ((int)(vu & 0xFFFF0000) << 0)  * (1.0f / 0x7FFF0000);
855			}
856			break;
857		case FORMAT_A2W10V10U10:
858			{
859				unsigned int awvu = *(unsigned int*)element;
860
861				r = ((int)(awvu & 0x000003FF) << 22) * (1.0f / 0x7FC00000);
862				g = ((int)(awvu & 0x000FFC00) << 12) * (1.0f / 0x7FC00000);
863				b = ((int)(awvu & 0x3FF00000) << 2)  * (1.0f / 0x7FC00000);
864				a = (awvu & 0xC0000000) * (1.0f / 0xC0000000);
865			}
866			break;
867		case FORMAT_A16W16V16U16:
868			r = ((signed short*)element)[0] * (1.0f / 0x7FFF);
869			g = ((signed short*)element)[1] * (1.0f / 0x7FFF);
870			b = ((signed short*)element)[2] * (1.0f / 0x7FFF);
871			a = ((unsigned short*)element)[3] * (1.0f / 0xFFFF);
872			break;
873		case FORMAT_Q16W16V16U16:
874			r = ((signed short*)element)[0] * (1.0f / 0x7FFF);
875			g = ((signed short*)element)[1] * (1.0f / 0x7FFF);
876			b = ((signed short*)element)[2] * (1.0f / 0x7FFF);
877			a = ((signed short*)element)[3] * (1.0f / 0x7FFF);
878			break;
879		case FORMAT_L8:
880			r =
881			g =
882			b = *(unsigned char*)element * (1.0f / 0xFF);
883			break;
884		case FORMAT_A4L4:
885			{
886				unsigned char al = *(unsigned char*)element;
887
888				r =
889				g =
890				b = (al & 0x0F) * (1.0f / 0x0F);
891				a = (al & 0xF0) * (1.0f / 0xF0);
892			}
893			break;
894		case FORMAT_L16:
895			r =
896			g =
897			b = *(unsigned short*)element * (1.0f / 0xFFFF);
898			break;
899		case FORMAT_A8L8:
900			r =
901			g =
902			b = ((unsigned char*)element)[0] * (1.0f / 0xFF);
903			a = ((unsigned char*)element)[1] * (1.0f / 0xFF);
904			break;
905		case FORMAT_L16F:
906			r =
907			g =
908			b = *(half*)element;
909			break;
910		case FORMAT_A16L16F:
911			r =
912			g =
913			b = ((half*)element)[0];
914			a = ((half*)element)[1];
915			break;
916		case FORMAT_L32F:
917			r =
918			g =
919			b = *(float*)element;
920			break;
921		case FORMAT_A32L32F:
922			r =
923			g =
924			b = ((float*)element)[0];
925			a = ((float*)element)[1];
926			break;
927		case FORMAT_A16F:
928			a = *(half*)element;
929			break;
930		case FORMAT_R16F:
931			r = *(half*)element;
932			break;
933		case FORMAT_G16R16F:
934			r = ((half*)element)[0];
935			g = ((half*)element)[1];
936			break;
937		case FORMAT_B16G16R16F:
938			r = ((half*)element)[0];
939			g = ((half*)element)[1];
940			b = ((half*)element)[2];
941			break;
942		case FORMAT_A16B16G16R16F:
943			r = ((half*)element)[0];
944			g = ((half*)element)[1];
945			b = ((half*)element)[2];
946			a = ((half*)element)[3];
947			break;
948		case FORMAT_A32F:
949			a = *(float*)element;
950			break;
951		case FORMAT_R32F:
952			r = *(float*)element;
953			break;
954		case FORMAT_G32R32F:
955			r = ((float*)element)[0];
956			g = ((float*)element)[1];
957			break;
958		case FORMAT_X32B32G32R32F:
959		case FORMAT_B32G32R32F:
960			r = ((float*)element)[0];
961			g = ((float*)element)[1];
962			b = ((float*)element)[2];
963			break;
964		case FORMAT_A32B32G32R32F:
965			r = ((float*)element)[0];
966			g = ((float*)element)[1];
967			b = ((float*)element)[2];
968			a = ((float*)element)[3];
969			break;
970		case FORMAT_D32F:
971		case FORMAT_D32F_LOCKABLE:
972		case FORMAT_D32FS8_TEXTURE:
973		case FORMAT_D32FS8_SHADOW:
974			r = *(float*)element;
975			g = r;
976			b = r;
977			a = r;
978			break;
979		case FORMAT_D32F_COMPLEMENTARY:
980			r = 1.0f - *(float*)element;
981			g = r;
982			b = r;
983			a = r;
984			break;
985		case FORMAT_S8:
986			r = *(unsigned char*)element * (1.0f / 0xFF);
987			break;
988		default:
989			ASSERT(false);
990		}
991
992	//	if(sRGB)
993	//	{
994	//		r = sRGBtoLinear(r);
995	//		g = sRGBtoLinear(g);
996	//		b = sRGBtoLinear(b);
997	//	}
998
999		return Color<float>(r, g, b, a);
1000	}
1001
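	// Trilinear sample at unnormalized texel coordinates. The half-texel offset shifts from texel centers to the integer lattice, neighbor indices are clamped to the surface edges, and the eight surrounding texels are blended by their partial volumes.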
1002	Color<float> Surface::Buffer::sample(float x, float y, float z) const
1003	{
1004		x -= 0.5f;
1005		y -= 0.5f;
1006		z -= 0.5f;
1007
1008		int x0 = clamp((int)x, 0, width - 1);
1009		int x1 = (x0 + 1 >= width) ? x0 : x0 + 1;
1010
1011		int y0 = clamp((int)y, 0, height - 1);
1012		int y1 = (y0 + 1 >= height) ? y0 : y0 + 1;
1013
1014		int z0 = clamp((int)z, 0, depth - 1);
1015		int z1 = (z0 + 1 >= depth) ? z0 : z0 + 1;
1016
1017		Color<float> c000 = read(x0, y0, z0);
1018		Color<float> c100 = read(x1, y0, z0);
1019		Color<float> c010 = read(x0, y1, z0);
1020		Color<float> c110 = read(x1, y1, z0);
1021		Color<float> c001 = read(x0, y0, z1);
1022		Color<float> c101 = read(x1, y0, z1);
1023		Color<float> c011 = read(x0, y1, z1);
1024		Color<float> c111 = read(x1, y1, z1);
1025
1026		float fx = x - x0;
1027		float fy = y - y0;
1028		float fz = z - z0;
1029
1030		c000 *= (1 - fx) * (1 - fy) * (1 - fz);
1031		c100 *= fx * (1 - fy) * (1 - fz);
1032		c010 *= (1 - fx) * fy * (1 - fz);
1033		c110 *= fx * fy * (1 - fz);
1034		c001 *= (1 - fx) * (1 - fy) * fz;
1035		c101 *= fx * (1 - fy) * fz;
1036		c011 *= (1 - fx) * fy * fz;
1037		c111 *= fx * fy * fz;
1038
1039		return c000 + c100 + c010 + c110 + c001 + c101 + c011 + c111;
1040	}
1041
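	// Bilinear counterpart of the 3D sample above.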
1042	Color<float> Surface::Buffer::sample(float x, float y) const
1043	{
1044		x -= 0.5f;
1045		y -= 0.5f;
1046
1047		int x0 = clamp((int)x, 0, width - 1);
1048		int x1 = (x0 + 1 >= width) ? x0 : x0 + 1;
1049
1050		int y0 = clamp((int)y, 0, height - 1);
1051		int y1 = (y0 + 1 >= height) ? y0 : y0 + 1;
1052
1053		Color<float> c00 = read(x0, y0);
1054		Color<float> c10 = read(x1, y0);
1055		Color<float> c01 = read(x0, y1);
1056		Color<float> c11 = read(x1, y1);
1057
1058		float fx = x - x0;
1059		float fy = y - y0;
1060
1061		c00 *= (1 - fx) * (1 - fy);
1062		c10 *= fx * (1 - fy);
1063		c01 *= (1 - fx) * fy;
1064		c11 *= fx * fy;
1065
1066		return c00 + c10 + c01 + c11;
1067	}
1068
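	// Mark the buffer dirty for write locks and return the address of the requested element. Block-compressed formats address whole blocks (8 or 16 bytes each), so x and y are divided by the block dimensions.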
1069	void *Surface::Buffer::lockRect(int x, int y, int z, Lock lock)
1070	{
1071		this->lock = lock;
1072
1073		switch(lock)
1074		{
1075		case LOCK_UNLOCKED:
1076		case LOCK_READONLY:
1077			break;
1078		case LOCK_WRITEONLY:
1079		case LOCK_READWRITE:
1080		case LOCK_DISCARD:
1081			dirty = true;
1082			break;
1083		default:
1084			ASSERT(false);
1085		}
1086
1087		if(buffer)
1088		{
1089			switch(format)
1090			{
1091			#if S3TC_SUPPORT
1092			case FORMAT_DXT1:
1093			#endif
1094			case FORMAT_ATI1:
1095			case FORMAT_ETC1:
1096			case FORMAT_R11_EAC:
1097			case FORMAT_SIGNED_R11_EAC:
1098			case FORMAT_RGB8_ETC2:
1099			case FORMAT_SRGB8_ETC2:
1100			case FORMAT_RGB8_PUNCHTHROUGH_ALPHA1_ETC2:
1101			case FORMAT_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2:
1102				return (unsigned char*)buffer + 8 * (x / 4) + (y / 4) * pitchB + z * sliceB;
1103			case FORMAT_RG11_EAC:
1104			case FORMAT_SIGNED_RG11_EAC:
1105			case FORMAT_RGBA8_ETC2_EAC:
1106			case FORMAT_SRGB8_ALPHA8_ETC2_EAC:
1107			case FORMAT_RGBA_ASTC_4x4_KHR:
1108			case FORMAT_SRGB8_ALPHA8_ASTC_4x4_KHR:
1109				return (unsigned char*)buffer + 16 * (x / 4) + (y / 4) * pitchB + z * sliceB;
1110			case FORMAT_RGBA_ASTC_5x4_KHR:
1111			case FORMAT_SRGB8_ALPHA8_ASTC_5x4_KHR:
1112				return (unsigned char*)buffer + 16 * (x / 5) + (y / 4) * pitchB + z * sliceB;
1113			case FORMAT_RGBA_ASTC_5x5_KHR:
1114			case FORMAT_SRGB8_ALPHA8_ASTC_5x5_KHR:
1115				return (unsigned char*)buffer + 16 * (x / 5) + (y / 5) * pitchB + z * sliceB;
1116			case FORMAT_RGBA_ASTC_6x5_KHR:
1117			case FORMAT_SRGB8_ALPHA8_ASTC_6x5_KHR:
1118				return (unsigned char*)buffer + 16 * (x / 6) + (y / 5) * pitchB + z * sliceB;
1119			case FORMAT_RGBA_ASTC_6x6_KHR:
1120			case FORMAT_SRGB8_ALPHA8_ASTC_6x6_KHR:
1121				return (unsigned char*)buffer + 16 * (x / 6) + (y / 6) * pitchB + z * sliceB;
1122			case FORMAT_RGBA_ASTC_8x5_KHR:
1123			case FORMAT_SRGB8_ALPHA8_ASTC_8x5_KHR:
1124				return (unsigned char*)buffer + 16 * (x / 8) + (y / 5) * pitchB + z * sliceB;
1125			case FORMAT_RGBA_ASTC_8x6_KHR:
1126			case FORMAT_SRGB8_ALPHA8_ASTC_8x6_KHR:
1127				return (unsigned char*)buffer + 16 * (x / 8) + (y / 6) * pitchB + z * sliceB;
1128			case FORMAT_RGBA_ASTC_8x8_KHR:
1129			case FORMAT_SRGB8_ALPHA8_ASTC_8x8_KHR:
1130				return (unsigned char*)buffer + 16 * (x / 8) + (y / 8) * pitchB + z * sliceB;
1131			case FORMAT_RGBA_ASTC_10x5_KHR:
1132			case FORMAT_SRGB8_ALPHA8_ASTC_10x5_KHR:
1133				return (unsigned char*)buffer + 16 * (x / 10) + (y / 5) * pitchB + z * sliceB;
1134			case FORMAT_RGBA_ASTC_10x6_KHR:
1135			case FORMAT_SRGB8_ALPHA8_ASTC_10x6_KHR:
1136				return (unsigned char*)buffer + 16 * (x / 10) + (y / 6) * pitchB + z * sliceB;
1137			case FORMAT_RGBA_ASTC_10x8_KHR:
1138			case FORMAT_SRGB8_ALPHA8_ASTC_10x8_KHR:
1139				return (unsigned char*)buffer + 16 * (x / 10) + (y / 8) * pitchB + z * sliceB;
1140			case FORMAT_RGBA_ASTC_10x10_KHR:
1141			case FORMAT_SRGB8_ALPHA8_ASTC_10x10_KHR:
1142				return (unsigned char*)buffer + 16 * (x / 10) + (y / 10) * pitchB + z * sliceB;
1143			case FORMAT_RGBA_ASTC_12x10_KHR:
1144			case FORMAT_SRGB8_ALPHA8_ASTC_12x10_KHR:
1145				return (unsigned char*)buffer + 16 * (x / 12) + (y / 10) * pitchB + z * sliceB;
1146			case FORMAT_RGBA_ASTC_12x12_KHR:
1147			case FORMAT_SRGB8_ALPHA8_ASTC_12x12_KHR:
1148				return (unsigned char*)buffer + 16 * (x / 12) + (y / 12) * pitchB + z * sliceB;
1149			#if S3TC_SUPPORT
1150			case FORMAT_DXT3:
1151			case FORMAT_DXT5:
1152			#endif
1153			case FORMAT_ATI2:
1154				return (unsigned char*)buffer + 16 * (x / 4) + (y / 4) * pitchB + z * sliceB;
1155			default:
1156				return (unsigned char*)buffer + x * bytes + y * pitchB + z * sliceB;
1157			}
1158		}
1159
1160		return 0;
1161	}
1162
1163	void Surface::Buffer::unlockRect()
1164	{
1165		lock = LOCK_UNLOCKED;
1166	}
1167
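	// Wrap caller-provided pixels: the external buffer aliases 'pixels' (it is not freed on destruction, since ownExternal is false) and starts out dirty, so the first internal lock converts it into the selected internal format.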
1168	Surface::Surface(int width, int height, int depth, Format format, void *pixels, int pitch, int slice) : lockable(true), renderTarget(false)
1169	{
1170		resource = new Resource(0);
1171		hasParent = false;
1172		ownExternal = false;
1173		depth = max(1, depth);
1174
1175		external.buffer = pixels;
1176		external.width = width;
1177		external.height = height;
1178		external.depth = depth;
1179		external.format = format;
1180		external.bytes = bytes(external.format);
1181		external.pitchB = pitch;
1182		external.pitchP = external.bytes ? pitch / external.bytes : 0;
1183		external.sliceB = slice;
1184		external.sliceP = external.bytes ? slice / external.bytes : 0;
1185		external.lock = LOCK_UNLOCKED;
1186		external.dirty = true;
1187
1188		internal.buffer = 0;
1189		internal.width = width;
1190		internal.height = height;
1191		internal.depth = depth;
1192		internal.format = selectInternalFormat(format);
1193		internal.bytes = bytes(internal.format);
1194		internal.pitchB = pitchB(internal.width, internal.format, false);
1195		internal.pitchP = pitchP(internal.width, internal.format, false);
1196		internal.sliceB = sliceB(internal.width, internal.height, internal.format, false);
1197		internal.sliceP = sliceP(internal.width, internal.height, internal.format, false);
1198		internal.lock = LOCK_UNLOCKED;
1199		internal.dirty = false;
1200
1201		stencil.buffer = 0;
1202		stencil.width = width;
1203		stencil.height = height;
1204		stencil.depth = depth;
1205		stencil.format = FORMAT_S8;
1206		stencil.bytes = bytes(stencil.format);
1207		stencil.pitchB = pitchB(stencil.width, stencil.format, false);
1208		stencil.pitchP = pitchP(stencil.width, stencil.format, false);
1209		stencil.sliceB = sliceB(stencil.width, stencil.height, stencil.format, false);
1210		stencil.sliceP = sliceP(stencil.width, stencil.height, stencil.format, false);
1211		stencil.lock = LOCK_UNLOCKED;
1212		stencil.dirty = false;
1213
1214		dirtyMipmaps = true;
1215		paletteUsed = 0;
1216	}
1217
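	// Create a surface whose storage is allocated lazily on first lock. When attached to a texture resource the surface shares that resource's synchronization, and a non-zero pitchPprovided overrides the computed internal pitch.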
1218	Surface::Surface(Resource *texture, int width, int height, int depth, Format format, bool lockable, bool renderTarget, int pitchPprovided) : lockable(lockable), renderTarget(renderTarget)
1219	{
1220		resource = texture ? texture : new Resource(0);
1221		hasParent = texture != 0;
1222		ownExternal = true;
1223		depth = max(1, depth);
1224
1225		external.buffer = 0;
1226		external.width = width;
1227		external.height = height;
1228		external.depth = depth;
1229		external.format = format;
1230		external.bytes = bytes(external.format);
1231		external.pitchB = pitchB(external.width, external.format, renderTarget && !texture);
1232		external.pitchP = pitchP(external.width, external.format, renderTarget && !texture);
1233		external.sliceB = sliceB(external.width, external.height, external.format, renderTarget && !texture);
1234		external.sliceP = sliceP(external.width, external.height, external.format, renderTarget && !texture);
1235		external.lock = LOCK_UNLOCKED;
1236		external.dirty = false;
1237
1238		internal.buffer = 0;
1239		internal.width = width;
1240		internal.height = height;
1241		internal.depth = depth;
1242		internal.format = selectInternalFormat(format);
1243		internal.bytes = bytes(internal.format);
1244		internal.pitchB = !pitchPprovided ? pitchB(internal.width, internal.format, renderTarget) : pitchPprovided * internal.bytes;
1245		internal.pitchP = !pitchPprovided ? pitchP(internal.width, internal.format, renderTarget) : pitchPprovided;
1246		internal.sliceB = sliceB(internal.width, internal.height, internal.format, renderTarget);
1247		internal.sliceP = sliceP(internal.width, internal.height, internal.format, renderTarget);
1248		internal.lock = LOCK_UNLOCKED;
1249		internal.dirty = false;
1250
1251		stencil.buffer = 0;
1252		stencil.width = width;
1253		stencil.height = height;
1254		stencil.depth = depth;
1255		stencil.format = FORMAT_S8;
1256		stencil.bytes = bytes(stencil.format);
1257		stencil.pitchB = pitchB(stencil.width, stencil.format, renderTarget);
1258		stencil.pitchP = pitchP(stencil.width, stencil.format, renderTarget);
1259		stencil.sliceB = sliceB(stencil.width, stencil.height, stencil.format, renderTarget);
1260		stencil.sliceP = sliceP(stencil.width, stencil.height, stencil.format, renderTarget);
1261		stencil.lock = LOCK_UNLOCKED;
1262		stencil.dirty = false;
1263
1264		dirtyMipmaps = true;
1265		paletteUsed = 0;
1266	}
1267
1268	Surface::~Surface()
1269	{
1270		// Synchronize so we can deallocate the buffers below
1271		resource->lock(DESTRUCT);
1272		resource->unlock();
1273
1274		if(!hasParent)
1275		{
1276			resource->destruct();
1277		}
1278
1279		if(ownExternal)
1280		{
1281			deallocate(external.buffer);
1282		}
1283
1284		if(internal.buffer != external.buffer)
1285		{
1286			deallocate(internal.buffer);
1287		}
1288
1289		deallocate(stencil.buffer);
1290
1291		external.buffer = 0;
1292		internal.buffer = 0;
1293		stencil.buffer = 0;
1294	}
1295
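	// Lock the external (client-format) buffer: allocate it on demand, share storage with the internal buffer when the formats are identical, and copy back pending internal changes unless the lock discards them.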
1296	void *Surface::lockExternal(int x, int y, int z, Lock lock, Accessor client)
1297	{
1298		resource->lock(client);
1299
1300		if(!external.buffer)
1301		{
1302			if(internal.buffer && identicalFormats())
1303			{
1304				external.buffer = internal.buffer;
1305			}
1306			else
1307			{
1308				external.buffer = allocateBuffer(external.width, external.height, external.depth, external.format);
1309			}
1310		}
1311
1312		if(internal.dirty)
1313		{
1314			if(lock != LOCK_DISCARD)
1315			{
1316				update(external, internal);
1317			}
1318
1319			internal.dirty = false;
1320		}
1321
1322		switch(lock)
1323		{
1324		case LOCK_READONLY:
1325			break;
1326		case LOCK_WRITEONLY:
1327		case LOCK_READWRITE:
1328		case LOCK_DISCARD:
1329			dirtyMipmaps = true;
1330			break;
1331		default:
1332			ASSERT(false);
1333		}
1334
1335		return external.lockRect(x, y, z, lock);
1336	}
1337
1338	void Surface::unlockExternal()
1339	{
1340		resource->unlock();
1341
1342		external.unlockRect();
1343	}
1344
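	// Lock the internal (device-format) buffer, converting from the external buffer whenever it is dirty or the palette has changed since the last conversion.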
1345	void *Surface::lockInternal(int x, int y, int z, Lock lock, Accessor client)
1346	{
1347		if(lock != LOCK_UNLOCKED)
1348		{
1349			resource->lock(client);
1350		}
1351
1352		if(!internal.buffer)
1353		{
1354			if(external.buffer && identicalFormats())
1355			{
1356				internal.buffer = external.buffer;
1357			}
1358			else
1359			{
1360				internal.buffer = allocateBuffer(internal.width, internal.height, internal.depth, internal.format);
1361			}
1362		}
1363
1364		// FIXME: WHQL requires conversion to lower external precision and back
1365		if(logPrecision >= WHQL)
1366		{
1367			if(internal.dirty && renderTarget && internal.format != external.format)
1368			{
1369				if(lock != LOCK_DISCARD)
1370				{
1371					switch(external.format)
1372					{
1373					case FORMAT_R3G3B2:
1374					case FORMAT_A8R3G3B2:
1375					case FORMAT_A1R5G5B5:
1376					case FORMAT_A2R10G10B10:
1377					case FORMAT_A2B10G10R10:
1378						lockExternal(0, 0, 0, LOCK_READWRITE, client);
1379						unlockExternal();
1380						break;
1381					default:
1382						// Difference passes WHQL
1383						break;
1384					}
1385				}
1386			}
1387		}
1388
1389		if(external.dirty || (isPalette(external.format) && paletteUsed != Surface::paletteID))
1390		{
1391			if(lock != LOCK_DISCARD)
1392			{
1393				update(internal, external);
1394			}
1395
1396			external.dirty = false;
1397			paletteUsed = Surface::paletteID;
1398		}
1399
1400		switch(lock)
1401		{
1402		case LOCK_UNLOCKED:
1403		case LOCK_READONLY:
1404			break;
1405		case LOCK_WRITEONLY:
1406		case LOCK_READWRITE:
1407		case LOCK_DISCARD:
1408			dirtyMipmaps = true;
1409			break;
1410		default:
1411			ASSERT(false);
1412		}
1413
1414		if(lock == LOCK_READONLY && client == PUBLIC)
1415		{
1416			resolve();
1417		}
1418
1419		return internal.lockRect(x, y, z, lock);
1420	}
1421
1422	void Surface::unlockInternal()
1423	{
1424		resource->unlock();
1425
1426		internal.unlockRect();
1427	}
1428
1429	void *Surface::lockStencil(int front, Accessor client)
1430	{
1431		resource->lock(client);
1432
1433		if(!stencil.buffer)
1434		{
1435			stencil.buffer = allocateBuffer(stencil.width, stencil.height, stencil.depth, stencil.format);
1436		}
1437
1438		return stencil.lockRect(0, 0, front, LOCK_READWRITE);   // FIXME
1439	}
1440
1441	void Surface::unlockStencil()
1442	{
1443		resource->unlock();
1444
1445		stencil.unlockRect();
1446	}
1447
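	// Size of one element in bytes. For the S3TC/ATI/ETC/EAC block formats this is the size of a single block column (block bytes divided by 4); ASTC is not handled yet and returns 0.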
1448	int Surface::bytes(Format format)
1449	{
1450		switch(format)
1451		{
1452		case FORMAT_NULL:				return 0;
1453		case FORMAT_P8:					return 1;
1454		case FORMAT_A8P8:				return 2;
1455		case FORMAT_A8:					return 1;
1456		case FORMAT_R8I:				return 1;
1457		case FORMAT_R8:					return 1;
1458		case FORMAT_R3G3B2:				return 1;
1459		case FORMAT_R16I:				return 2;
1460		case FORMAT_R16UI:				return 2;
1461		case FORMAT_A8R3G3B2:			return 2;
1462		case FORMAT_R5G6B5:				return 2;
1463		case FORMAT_A1R5G5B5:			return 2;
1464		case FORMAT_X1R5G5B5:			return 2;
1465		case FORMAT_R5G5B5A1:           return 2;
1466		case FORMAT_X4R4G4B4:			return 2;
1467		case FORMAT_A4R4G4B4:			return 2;
1468		case FORMAT_R4G4B4A4:           return 2;
1469		case FORMAT_R8G8B8:				return 3;
1470		case FORMAT_B8G8R8:             return 3;
1471		case FORMAT_R32I:				return 4;
1472		case FORMAT_R32UI:				return 4;
1473		case FORMAT_X8R8G8B8:			return 4;
1474	//	case FORMAT_X8G8R8B8Q:			return 4;
1475		case FORMAT_A8R8G8B8:			return 4;
1476	//	case FORMAT_A8G8R8B8Q:			return 4;
1477		case FORMAT_X8B8G8R8I:			return 4;
1478		case FORMAT_X8B8G8R8:			return 4;
1479		case FORMAT_SRGB8_X8:			return 4;
1480		case FORMAT_SRGB8_A8:			return 4;
1481		case FORMAT_A8B8G8R8I:			return 4;
1482		case FORMAT_R8UI:				return 1;
1483		case FORMAT_G8R8UI:				return 2;
1484		case FORMAT_X8B8G8R8UI:			return 4;
1485		case FORMAT_A8B8G8R8UI:			return 4;
1486		case FORMAT_A8B8G8R8:			return 4;
1487		case FORMAT_R8I_SNORM:			return 1;
1488		case FORMAT_G8R8I_SNORM:		return 2;
1489		case FORMAT_X8B8G8R8I_SNORM:	return 4;
1490		case FORMAT_A8B8G8R8I_SNORM:	return 4;
1491		case FORMAT_A2R10G10B10:		return 4;
1492		case FORMAT_A2B10G10R10:		return 4;
1493		case FORMAT_G8R8I:				return 2;
1494		case FORMAT_G8R8:				return 2;
1495		case FORMAT_G16R16I:			return 4;
1496		case FORMAT_G16R16UI:			return 4;
1497		case FORMAT_G16R16:				return 4;
1498		case FORMAT_G32R32I:			return 8;
1499		case FORMAT_G32R32UI:			return 8;
1500		case FORMAT_X16B16G16R16I:		return 8;
1501		case FORMAT_X16B16G16R16UI:		return 8;
1502		case FORMAT_A16B16G16R16I:		return 8;
1503		case FORMAT_A16B16G16R16UI:		return 8;
1504		case FORMAT_A16B16G16R16:		return 8;
1505		case FORMAT_X32B32G32R32I:		return 16;
1506		case FORMAT_X32B32G32R32UI:		return 16;
1507		case FORMAT_A32B32G32R32I:		return 16;
1508		case FORMAT_A32B32G32R32UI:		return 16;
1509		// Compressed formats
1510		#if S3TC_SUPPORT
1511		case FORMAT_DXT1:				return 2;   // Column of four pixels
1512		case FORMAT_DXT3:				return 4;   // Column of four pixels
1513		case FORMAT_DXT5:				return 4;   // Column of four pixels
1514		#endif
1515		case FORMAT_ATI1:				return 2;   // Column of four pixels
1516		case FORMAT_ATI2:				return 4;   // Column of four pixels
1517		case FORMAT_ETC1:				return 2;   // Column of four pixels
1518		case FORMAT_R11_EAC:			return 2;
1519		case FORMAT_SIGNED_R11_EAC:		return 2;
1520		case FORMAT_RG11_EAC:			return 4;
1521		case FORMAT_SIGNED_RG11_EAC:	return 4;
1522		case FORMAT_RGB8_ETC2:			return 2;
1523		case FORMAT_SRGB8_ETC2:			return 2;
1524		case FORMAT_RGB8_PUNCHTHROUGH_ALPHA1_ETC2:	return 2;
1525		case FORMAT_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2:	return 2;
1526		case FORMAT_RGBA8_ETC2_EAC:			return 4;
1527		case FORMAT_SRGB8_ALPHA8_ETC2_EAC:	return 4;
1528		case FORMAT_RGBA_ASTC_4x4_KHR:
1529		case FORMAT_RGBA_ASTC_5x4_KHR:
1530		case FORMAT_RGBA_ASTC_5x5_KHR:
1531		case FORMAT_RGBA_ASTC_6x5_KHR:
1532		case FORMAT_RGBA_ASTC_6x6_KHR:
1533		case FORMAT_RGBA_ASTC_8x5_KHR:
1534		case FORMAT_RGBA_ASTC_8x6_KHR:
1535		case FORMAT_RGBA_ASTC_8x8_KHR:
1536		case FORMAT_RGBA_ASTC_10x5_KHR:
1537		case FORMAT_RGBA_ASTC_10x6_KHR:
1538		case FORMAT_RGBA_ASTC_10x8_KHR:
1539		case FORMAT_RGBA_ASTC_10x10_KHR:
1540		case FORMAT_RGBA_ASTC_12x10_KHR:
1541		case FORMAT_RGBA_ASTC_12x12_KHR:
1542		case FORMAT_SRGB8_ALPHA8_ASTC_4x4_KHR:
1543		case FORMAT_SRGB8_ALPHA8_ASTC_5x4_KHR:
1544		case FORMAT_SRGB8_ALPHA8_ASTC_5x5_KHR:
1545		case FORMAT_SRGB8_ALPHA8_ASTC_6x5_KHR:
1546		case FORMAT_SRGB8_ALPHA8_ASTC_6x6_KHR:
1547		case FORMAT_SRGB8_ALPHA8_ASTC_8x5_KHR:
1548		case FORMAT_SRGB8_ALPHA8_ASTC_8x6_KHR:
1549		case FORMAT_SRGB8_ALPHA8_ASTC_8x8_KHR:
1550		case FORMAT_SRGB8_ALPHA8_ASTC_10x5_KHR:
1551		case FORMAT_SRGB8_ALPHA8_ASTC_10x6_KHR:
1552		case FORMAT_SRGB8_ALPHA8_ASTC_10x8_KHR:
1553		case FORMAT_SRGB8_ALPHA8_ASTC_10x10_KHR:
1554		case FORMAT_SRGB8_ALPHA8_ASTC_12x10_KHR:
1555		case FORMAT_SRGB8_ALPHA8_ASTC_12x12_KHR: return 0; // FIXME
1556		// Bumpmap formats
1557		case FORMAT_V8U8:				return 2;
1558		case FORMAT_L6V5U5:				return 2;
1559		case FORMAT_Q8W8V8U8:			return 4;
1560		case FORMAT_X8L8V8U8:			return 4;
1561		case FORMAT_A2W10V10U10:		return 4;
1562		case FORMAT_V16U16:				return 4;
1563		case FORMAT_A16W16V16U16:		return 8;
1564		case FORMAT_Q16W16V16U16:		return 8;
1565		// Luminance formats
1566		case FORMAT_L8:					return 1;
1567		case FORMAT_A4L4:				return 1;
1568		case FORMAT_L16:				return 2;
1569		case FORMAT_A8L8:				return 2;
1570		case FORMAT_L16F:               return 2;
1571		case FORMAT_A16L16F:            return 4;
1572		case FORMAT_L32F:               return 4;
1573		case FORMAT_A32L32F:            return 8;
1574		// Floating-point formats
1575		case FORMAT_A16F:				return 2;
1576		case FORMAT_R16F:				return 2;
1577		case FORMAT_G16R16F:			return 4;
1578		case FORMAT_B16G16R16F:			return 6;
1579		case FORMAT_A16B16G16R16F:		return 8;
1580		case FORMAT_A32F:				return 4;
1581		case FORMAT_R32F:				return 4;
1582		case FORMAT_G32R32F:			return 8;
1583		case FORMAT_B32G32R32F:			return 12;
1584		case FORMAT_X32B32G32R32F:		return 16;
1585		case FORMAT_A32B32G32R32F:		return 16;
1586		// Depth/stencil formats
1587		case FORMAT_D16:				return 2;
1588		case FORMAT_D32:				return 4;
1589		case FORMAT_D24X8:				return 4;
1590		case FORMAT_D24S8:				return 4;
1591		case FORMAT_D24FS8:				return 4;
1592		case FORMAT_D32F:				return 4;
1593		case FORMAT_D32F_COMPLEMENTARY:	return 4;
1594		case FORMAT_D32F_LOCKABLE:		return 4;
1595		case FORMAT_D32FS8_TEXTURE:		return 4;
1596		case FORMAT_D32FS8_SHADOW:		return 4;
1597		case FORMAT_DF24S8:				return 4;
1598		case FORMAT_DF16S8:				return 2;
1599		case FORMAT_INTZ:				return 4;
1600		case FORMAT_S8:					return 1;
1601		case FORMAT_YV12_BT601:         return 1;   // Y plane only
1602		case FORMAT_YV12_BT709:         return 1;   // Y plane only
1603		case FORMAT_YV12_JFIF:          return 1;   // Y plane only
1604		default:
1605			ASSERT(false);
1606		}
1607
1608		return 0;
1609	}
1610
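	// Row pitch in bytes. Render targets and depth/stencil buffers round the width up to an even number of pixels; most block-compressed formats return the pitch per row of blocks (ATI1/ATI2 per pixel row), and YV12 aligns the luma row to 16 bytes.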
1611	int Surface::pitchB(int width, Format format, bool target)
1612	{
1613		if(target || isDepth(format) || isStencil(format))
1614		{
1615			width = align(width, 2);
1616		}
1617
1618		switch(format)
1619		{
1620		#if S3TC_SUPPORT
1621		case FORMAT_DXT1:
1622		#endif
1623		case FORMAT_ETC1:
1624		case FORMAT_R11_EAC:
1625		case FORMAT_SIGNED_R11_EAC:
1626		case FORMAT_RGB8_ETC2:
1627		case FORMAT_SRGB8_ETC2:
1628		case FORMAT_RGB8_PUNCHTHROUGH_ALPHA1_ETC2:
1629		case FORMAT_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2:
1630			return 8 * ((width + 3) / 4);    // 64 bit per 4x4 block, computed per 4 rows
1631		case FORMAT_RG11_EAC:
1632		case FORMAT_SIGNED_RG11_EAC:
1633		case FORMAT_RGBA8_ETC2_EAC:
1634		case FORMAT_SRGB8_ALPHA8_ETC2_EAC:
1635		case FORMAT_RGBA_ASTC_4x4_KHR:
1636		case FORMAT_SRGB8_ALPHA8_ASTC_4x4_KHR:
1637			return 16 * ((width + 3) / 4);    // 128 bit per 4x4 block, computed per 4 rows
1638		case FORMAT_RGBA_ASTC_5x4_KHR:
1639		case FORMAT_SRGB8_ALPHA8_ASTC_5x4_KHR:
1640		case FORMAT_RGBA_ASTC_5x5_KHR:
1641		case FORMAT_SRGB8_ALPHA8_ASTC_5x5_KHR:
1642			return 16 * ((width + 4) / 5);
1643		case FORMAT_RGBA_ASTC_6x5_KHR:
1644		case FORMAT_SRGB8_ALPHA8_ASTC_6x5_KHR:
1645		case FORMAT_RGBA_ASTC_6x6_KHR:
1646		case FORMAT_SRGB8_ALPHA8_ASTC_6x6_KHR:
1647			return 16 * ((width + 5) / 6);
1648		case FORMAT_RGBA_ASTC_8x5_KHR:
1649		case FORMAT_SRGB8_ALPHA8_ASTC_8x5_KHR:
1650		case FORMAT_RGBA_ASTC_8x6_KHR:
1651		case FORMAT_SRGB8_ALPHA8_ASTC_8x6_KHR:
1652		case FORMAT_RGBA_ASTC_8x8_KHR:
1653		case FORMAT_SRGB8_ALPHA8_ASTC_8x8_KHR:
1654			return 16 * ((width + 7) / 8);
1655		case FORMAT_RGBA_ASTC_10x5_KHR:
1656		case FORMAT_SRGB8_ALPHA8_ASTC_10x5_KHR:
1657		case FORMAT_RGBA_ASTC_10x6_KHR:
1658		case FORMAT_SRGB8_ALPHA8_ASTC_10x6_KHR:
1659		case FORMAT_RGBA_ASTC_10x8_KHR:
1660		case FORMAT_SRGB8_ALPHA8_ASTC_10x8_KHR:
1661		case FORMAT_RGBA_ASTC_10x10_KHR:
1662		case FORMAT_SRGB8_ALPHA8_ASTC_10x10_KHR:
1663			return 16 * ((width + 9) / 10);
1664		case FORMAT_RGBA_ASTC_12x10_KHR:
1665		case FORMAT_SRGB8_ALPHA8_ASTC_12x10_KHR:
1666		case FORMAT_RGBA_ASTC_12x12_KHR:
1667		case FORMAT_SRGB8_ALPHA8_ASTC_12x12_KHR:
1668			return 16 * ((width + 11) / 12);
1669		#if S3TC_SUPPORT
1670		case FORMAT_DXT3:
1671		case FORMAT_DXT5:
1672			return 16 * ((width + 3) / 4);   // 128 bit per 4x4 block, computed per 4 rows
1673		#endif
1674		case FORMAT_ATI1:
1675			return 2 * ((width + 3) / 4);    // 64 bit per 4x4 block, computed per row
1676		case FORMAT_ATI2:
1677			return 4 * ((width + 3) / 4);    // 128 bit per 4x4 block, computed per row
1678		case FORMAT_YV12_BT601:
1679		case FORMAT_YV12_BT709:
1680		case FORMAT_YV12_JFIF:
1681			return align(width, 16);
1682		default:
1683			return bytes(format) * width;
1684		}
1685	}
1686
1687	int Surface::pitchP(int width, Format format, bool target)
1688	{
1689		int B = bytes(format);
1690
1691		return B > 0 ? pitchB(width, format, target) / B : 0;
1692	}
1693
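	// Slice size in bytes: the row pitch multiplied by the number of block rows, or by the pixel height for formats whose pitch is per pixel row.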
1694	int Surface::sliceB(int width, int height, Format format, bool target)
1695	{
1696		if(target || isDepth(format) || isStencil(format))
1697		{
1698			height = ((height + 1) & ~1);
1699		}
1700
1701		switch(format)
1702		{
1703		#if S3TC_SUPPORT
1704		case FORMAT_DXT1:
1705		case FORMAT_DXT3:
1706		case FORMAT_DXT5:
1707		#endif
1708		case FORMAT_ETC1:
1709		case FORMAT_R11_EAC:
1710		case FORMAT_SIGNED_R11_EAC:
1711		case FORMAT_RG11_EAC:
1712		case FORMAT_SIGNED_RG11_EAC:
1713		case FORMAT_RGB8_ETC2:
1714		case FORMAT_SRGB8_ETC2:
1715		case FORMAT_RGB8_PUNCHTHROUGH_ALPHA1_ETC2:
1716		case FORMAT_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2:
1717		case FORMAT_RGBA8_ETC2_EAC:
1718		case FORMAT_SRGB8_ALPHA8_ETC2_EAC:
1719		case FORMAT_RGBA_ASTC_4x4_KHR:
1720		case FORMAT_SRGB8_ALPHA8_ASTC_4x4_KHR:
1721		case FORMAT_RGBA_ASTC_5x4_KHR:
1722		case FORMAT_SRGB8_ALPHA8_ASTC_5x4_KHR:
1723			return pitchB(width, format, target) * ((height + 3) / 4);   // Pitch computed per 4 rows
1724		case FORMAT_RGBA_ASTC_5x5_KHR:
1725		case FORMAT_SRGB8_ALPHA8_ASTC_5x5_KHR:
1726		case FORMAT_RGBA_ASTC_6x5_KHR:
1727		case FORMAT_SRGB8_ALPHA8_ASTC_6x5_KHR:
1728		case FORMAT_RGBA_ASTC_8x5_KHR:
1729		case FORMAT_SRGB8_ALPHA8_ASTC_8x5_KHR:
1730		case FORMAT_RGBA_ASTC_10x5_KHR:
1731		case FORMAT_SRGB8_ALPHA8_ASTC_10x5_KHR:
1732			return pitchB(width, format, target) * ((height + 4) / 5);   // Pitch computed per 5 rows
1733		case FORMAT_RGBA_ASTC_6x6_KHR:
1734		case FORMAT_SRGB8_ALPHA8_ASTC_6x6_KHR:
1735		case FORMAT_RGBA_ASTC_8x6_KHR:
1736		case FORMAT_SRGB8_ALPHA8_ASTC_8x6_KHR:
1737		case FORMAT_RGBA_ASTC_10x6_KHR:
1738		case FORMAT_SRGB8_ALPHA8_ASTC_10x6_KHR:
1739			return pitchB(width, format, target) * ((height + 5) / 6);   // Pitch computed per 6 rows
1740		case FORMAT_RGBA_ASTC_8x8_KHR:
1741		case FORMAT_SRGB8_ALPHA8_ASTC_8x8_KHR:
1742		case FORMAT_RGBA_ASTC_10x8_KHR:
1743		case FORMAT_SRGB8_ALPHA8_ASTC_10x8_KHR:
1744			return pitchB(width, format, target) * ((height + 7) / 8);   // Pitch computed per 8 rows
1745		case FORMAT_RGBA_ASTC_10x10_KHR:
1746		case FORMAT_SRGB8_ALPHA8_ASTC_10x10_KHR:
1747		case FORMAT_RGBA_ASTC_12x10_KHR:
1748		case FORMAT_SRGB8_ALPHA8_ASTC_12x10_KHR:
1749			return pitchB(width, format, target) * ((height + 9) / 10);   // Pitch computed per 10 rows
1750		case FORMAT_RGBA_ASTC_12x12_KHR:
1751		case FORMAT_SRGB8_ALPHA8_ASTC_12x12_KHR:
1752			return pitchB(width, format, target) * ((height + 11) / 12);   // Pitch computed per 12 rows
1753		case FORMAT_ATI1:
1754		case FORMAT_ATI2:
1755		default:
1756			return pitchB(width, format, target) * height;   // Pitch computed per row
1757		}
1758	}
1759
1760	int Surface::sliceP(int width, int height, Format format, bool target)
1761	{
1762		int B = bytes(format);
1763
1764		return B > 0 ? sliceB(width, height, format, target) / B : 0;
1765	}
1766
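	// Convert the source buffer into the destination buffer's format. Compressed and legacy
	// source formats are expanded by a dedicated decoder; everything else goes through genericUpdate().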
1767	void Surface::update(Buffer &destination, Buffer &source)
1768	{
1769	//	ASSERT(source.lock != LOCK_UNLOCKED);
1770	//	ASSERT(destination.lock != LOCK_UNLOCKED);
1771
1772		if(destination.buffer != source.buffer)
1773		{
1774			ASSERT(source.dirty && !destination.dirty);
1775
1776			switch(source.format)
1777			{
1778			case FORMAT_R8G8B8:		decodeR8G8B8(destination, source);		break;   // FIXME: Check destination format
1779			case FORMAT_X1R5G5B5:	decodeX1R5G5B5(destination, source);	break;   // FIXME: Check destination format
1780			case FORMAT_A1R5G5B5:	decodeA1R5G5B5(destination, source);	break;   // FIXME: Check destination format
1781			case FORMAT_X4R4G4B4:	decodeX4R4G4B4(destination, source);	break;   // FIXME: Check destination format
1782			case FORMAT_A4R4G4B4:	decodeA4R4G4B4(destination, source);	break;   // FIXME: Check destination format
1783			case FORMAT_P8:			decodeP8(destination, source);			break;   // FIXME: Check destination format
1784			#if S3TC_SUPPORT
1785			case FORMAT_DXT1:		decodeDXT1(destination, source);		break;   // FIXME: Check destination format
1786			case FORMAT_DXT3:		decodeDXT3(destination, source);		break;   // FIXME: Check destination format
1787			case FORMAT_DXT5:		decodeDXT5(destination, source);		break;   // FIXME: Check destination format
1788			#endif
1789			case FORMAT_ATI1:		decodeATI1(destination, source);		break;   // FIXME: Check destination format
1790			case FORMAT_ATI2:		decodeATI2(destination, source);		break;   // FIXME: Check destination format
1791			case FORMAT_R11_EAC:         decodeEAC(destination, source, 1, false); break; // FIXME: Check destination format
1792			case FORMAT_SIGNED_R11_EAC:  decodeEAC(destination, source, 1, true);  break; // FIXME: Check destination format
1793			case FORMAT_RG11_EAC:        decodeEAC(destination, source, 2, false); break; // FIXME: Check destination format
1794			case FORMAT_SIGNED_RG11_EAC: decodeEAC(destination, source, 2, true);  break; // FIXME: Check destination format
1795			case FORMAT_ETC1:
1796			case FORMAT_RGB8_ETC2:                      decodeETC2(destination, source, 0, false); break; // FIXME: Check destination format
1797			case FORMAT_SRGB8_ETC2:                     decodeETC2(destination, source, 0, true);  break; // FIXME: Check destination format
1798			case FORMAT_RGB8_PUNCHTHROUGH_ALPHA1_ETC2:  decodeETC2(destination, source, 1, false); break; // FIXME: Check destination format
1799			case FORMAT_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2: decodeETC2(destination, source, 1, true);  break; // FIXME: Check destination format
1800			case FORMAT_RGBA8_ETC2_EAC:                 decodeETC2(destination, source, 8, false); break; // FIXME: Check destination format
1801			case FORMAT_SRGB8_ALPHA8_ETC2_EAC:          decodeETC2(destination, source, 8, true);  break; // FIXME: Check destination format
1802			case FORMAT_RGBA_ASTC_4x4_KHR:           decodeASTC(destination, source, 4,  4,  1, false); break; // FIXME: Check destination format
1803			case FORMAT_RGBA_ASTC_5x4_KHR:           decodeASTC(destination, source, 5,  4,  1, false); break; // FIXME: Check destination format
1804			case FORMAT_RGBA_ASTC_5x5_KHR:           decodeASTC(destination, source, 5,  5,  1, false); break; // FIXME: Check destination format
1805			case FORMAT_RGBA_ASTC_6x5_KHR:           decodeASTC(destination, source, 6,  5,  1, false); break; // FIXME: Check destination format
1806			case FORMAT_RGBA_ASTC_6x6_KHR:           decodeASTC(destination, source, 6,  6,  1, false); break; // FIXME: Check destination format
1807			case FORMAT_RGBA_ASTC_8x5_KHR:           decodeASTC(destination, source, 8,  5,  1, false); break; // FIXME: Check destination format
1808			case FORMAT_RGBA_ASTC_8x6_KHR:           decodeASTC(destination, source, 8,  6,  1, false); break; // FIXME: Check destination format
1809			case FORMAT_RGBA_ASTC_8x8_KHR:           decodeASTC(destination, source, 8,  8,  1, false); break; // FIXME: Check destination format
1810			case FORMAT_RGBA_ASTC_10x5_KHR:          decodeASTC(destination, source, 10, 5,  1, false); break; // FIXME: Check destination format
1811			case FORMAT_RGBA_ASTC_10x6_KHR:          decodeASTC(destination, source, 10, 6,  1, false); break; // FIXME: Check destination format
1812			case FORMAT_RGBA_ASTC_10x8_KHR:          decodeASTC(destination, source, 10, 8,  1, false); break; // FIXME: Check destination format
1813			case FORMAT_RGBA_ASTC_10x10_KHR:         decodeASTC(destination, source, 10, 10, 1, false); break; // FIXME: Check destination format
1814			case FORMAT_RGBA_ASTC_12x10_KHR:         decodeASTC(destination, source, 12, 10, 1, false); break; // FIXME: Check destination format
1815			case FORMAT_RGBA_ASTC_12x12_KHR:         decodeASTC(destination, source, 12, 12, 1, false); break; // FIXME: Check destination format
1816			case FORMAT_SRGB8_ALPHA8_ASTC_4x4_KHR:   decodeASTC(destination, source, 4,  4,  1, true);  break; // FIXME: Check destination format
1817			case FORMAT_SRGB8_ALPHA8_ASTC_5x4_KHR:   decodeASTC(destination, source, 5,  4,  1, true);  break; // FIXME: Check destination format
1818			case FORMAT_SRGB8_ALPHA8_ASTC_5x5_KHR:   decodeASTC(destination, source, 5,  5,  1, true);  break; // FIXME: Check destination format
1819			case FORMAT_SRGB8_ALPHA8_ASTC_6x5_KHR:   decodeASTC(destination, source, 6,  5,  1, true);  break; // FIXME: Check destination format
1820			case FORMAT_SRGB8_ALPHA8_ASTC_6x6_KHR:   decodeASTC(destination, source, 6,  6,  1, true);  break; // FIXME: Check destination format
1821			case FORMAT_SRGB8_ALPHA8_ASTC_8x5_KHR:   decodeASTC(destination, source, 8,  5,  1, true);  break; // FIXME: Check destination format
1822			case FORMAT_SRGB8_ALPHA8_ASTC_8x6_KHR:   decodeASTC(destination, source, 8,  6,  1, true);  break; // FIXME: Check destination format
1823			case FORMAT_SRGB8_ALPHA8_ASTC_8x8_KHR:   decodeASTC(destination, source, 8,  8,  1, true);  break; // FIXME: Check destination format
1824			case FORMAT_SRGB8_ALPHA8_ASTC_10x5_KHR:  decodeASTC(destination, source, 10, 5,  1, true);  break; // FIXME: Check destination format
1825			case FORMAT_SRGB8_ALPHA8_ASTC_10x6_KHR:  decodeASTC(destination, source, 10, 6,  1, true);  break; // FIXME: Check destination format
1826			case FORMAT_SRGB8_ALPHA8_ASTC_10x8_KHR:  decodeASTC(destination, source, 10, 8,  1, true);  break; // FIXME: Check destination format
1827			case FORMAT_SRGB8_ALPHA8_ASTC_10x10_KHR: decodeASTC(destination, source, 10, 10, 1, true);  break; // FIXME: Check destination format
1828			case FORMAT_SRGB8_ALPHA8_ASTC_12x10_KHR: decodeASTC(destination, source, 12, 10, 1, true);  break; // FIXME: Check destination format
1829			case FORMAT_SRGB8_ALPHA8_ASTC_12x12_KHR: decodeASTC(destination, source, 12, 12, 1, true);  break; // FIXME: Check destination format
1830			default:				genericUpdate(destination, source);		break;
1831			}
1832		}
1833	}
1834
1835	void Surface::genericUpdate(Buffer &destination, Buffer &source)
1836	{
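		// Copy the overlapping region slice by slice. When the formats match, rows are copied
		// with memcpy; otherwise each pixel is converted through Color<float> via read()/write().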
1837		unsigned char *sourceSlice = (unsigned char*)source.buffer;
1838		unsigned char *destinationSlice = (unsigned char*)destination.buffer;
1839
1840		int depth = min(destination.depth, source.depth);
1841		int height = min(destination.height, source.height);
1842		int width = min(destination.width, source.width);
1843		int rowBytes = width * source.bytes;
1844
1845		for(int z = 0; z < depth; z++)
1846		{
1847			unsigned char *sourceRow = sourceSlice;
1848			unsigned char *destinationRow = destinationSlice;
1849
1850			for(int y = 0; y < height; y++)
1851			{
1852				if(source.format == destination.format)
1853				{
1854					memcpy(destinationRow, sourceRow, rowBytes);
1855				}
1856				else
1857				{
1858					unsigned char *sourceElement = sourceRow;
1859					unsigned char *destinationElement = destinationRow;
1860
1861					for(int x = 0; x < width; x++)
1862					{
1863						Color<float> color = source.read(sourceElement);
1864						destination.write(destinationElement, color);
1865
1866						sourceElement += source.bytes;
1867						destinationElement += destination.bytes;
1868					}
1869				}
1870
1871				sourceRow += source.pitchB;
1872				destinationRow += destination.pitchB;
1873			}
1874
1875			sourceSlice += source.sliceB;
1876			destinationSlice += destination.sliceB;
1877		}
1878	}
1879
1880	void Surface::decodeR8G8B8(Buffer &destination, const Buffer &source)
1881	{
1882		unsigned char *sourceSlice = (unsigned char*)source.buffer;
1883		unsigned char *destinationSlice = (unsigned char*)destination.buffer;
1884
1885		for(int z = 0; z < destination.depth && z < source.depth; z++)
1886		{
1887			unsigned char *sourceRow = sourceSlice;
1888			unsigned char *destinationRow = destinationSlice;
1889
1890			for(int y = 0; y < destination.height && y < source.height; y++)
1891			{
1892				unsigned char *sourceElement = sourceRow;
1893				unsigned char *destinationElement = destinationRow;
1894
1895				for(int x = 0; x < destination.width && x < source.width; x++)
1896				{
1897					unsigned int b = sourceElement[0];
1898					unsigned int g = sourceElement[1];
1899					unsigned int r = sourceElement[2];
1900
1901					*(unsigned int*)destinationElement = 0xFF000000 | (r << 16) | (g << 8) | (b << 0);
1902
1903					sourceElement += source.bytes;
1904					destinationElement += destination.bytes;
1905				}
1906
1907				sourceRow += source.pitchB;
1908				destinationRow += destination.pitchB;
1909			}
1910
1911			sourceSlice += source.sliceB;
1912			destinationSlice += destination.sliceB;
1913		}
1914	}
1915
1916	void Surface::decodeX1R5G5B5(Buffer &destination, const Buffer &source)
1917	{
1918		unsigned char *sourceSlice = (unsigned char*)source.buffer;
1919		unsigned char *destinationSlice = (unsigned char*)destination.buffer;
1920
1921		for(int z = 0; z < destination.depth && z < source.depth; z++)
1922		{
1923			unsigned char *sourceRow = sourceSlice;
1924			unsigned char *destinationRow = destinationSlice;
1925
1926			for(int y = 0; y < destination.height && y < source.height; y++)
1927			{
1928				unsigned char *sourceElement = sourceRow;
1929				unsigned char *destinationElement = destinationRow;
1930
1931				for(int x = 0; x < destination.width && x < source.width; x++)
1932				{
1933					unsigned int xrgb = *(unsigned short*)sourceElement;
1934
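					// The fixed-point multiplies expand the 5-bit channels to 8 bits with rounding
					// (equivalent to v * 255 / 31).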
1935					unsigned int r = (((xrgb & 0x7C00) * 134771 + 0x800000) >> 8) & 0x00FF0000;
1936					unsigned int g = (((xrgb & 0x03E0) * 16846 + 0x8000) >> 8) & 0x0000FF00;
1937					unsigned int b = (((xrgb & 0x001F) * 2106  + 0x80) >> 8);
1938
1939					*(unsigned int*)destinationElement = 0xFF000000 | r | g | b;
1940
1941					sourceElement += source.bytes;
1942					destinationElement += destination.bytes;
1943				}
1944
1945				sourceRow += source.pitchB;
1946				destinationRow += destination.pitchB;
1947			}
1948
1949			sourceSlice += source.sliceB;
1950			destinationSlice += destination.sliceB;
1951		}
1952	}
1953
1954	void Surface::decodeA1R5G5B5(Buffer &destination, const Buffer &source)
1955	{
1956		unsigned char *sourceSlice = (unsigned char*)source.buffer;
1957		unsigned char *destinationSlice = (unsigned char*)destination.buffer;
1958
1959		for(int z = 0; z < destination.depth && z < source.depth; z++)
1960		{
1961			unsigned char *sourceRow = sourceSlice;
1962			unsigned char *destinationRow = destinationSlice;
1963
1964			for(int y = 0; y < destination.height && y < source.height; y++)
1965			{
1966				unsigned char *sourceElement = sourceRow;
1967				unsigned char *destinationElement = destinationRow;
1968
1969				for(int x = 0; x < destination.width && x < source.width; x++)
1970				{
1971					unsigned int argb = *(unsigned short*)sourceElement;
1972
1973					unsigned int a =   (argb & 0x8000) * 130560;
1974					unsigned int r = (((argb & 0x7C00) * 134771 + 0x800000) >> 8) & 0x00FF0000;
1975					unsigned int g = (((argb & 0x03E0) * 16846  + 0x8000) >> 8) & 0x0000FF00;
1976					unsigned int b = (((argb & 0x001F) * 2106   + 0x80) >> 8);
1977
1978					*(unsigned int*)destinationElement = a | r | g | b;
1979
1980					sourceElement += source.bytes;
1981					destinationElement += destination.bytes;
1982				}
1983
1984				sourceRow += source.pitchB;
1985				destinationRow += destination.pitchB;
1986			}
1987
1988			sourceSlice += source.sliceB;
1989			destinationSlice += destination.sliceB;
1990		}
1991	}
1992
1993	void Surface::decodeX4R4G4B4(Buffer &destination, const Buffer &source)
1994	{
1995		unsigned char *sourceSlice = (unsigned char*)source.buffer;
1996		unsigned char *destinationSlice = (unsigned char*)destination.buffer;
1997
1998		for(int z = 0; z < destination.depth && z < source.depth; z++)
1999		{
2000			unsigned char *sourceRow = sourceSlice;
2001			unsigned char *destinationRow = destinationSlice;
2002
2003			for(int y = 0; y < destination.height && y < source.height; y++)
2004			{
2005				unsigned char *sourceElement = sourceRow;
2006				unsigned char *destinationElement = destinationRow;
2007
2008				for(int x = 0; x < destination.width && x < source.width; x++)
2009				{
2010					unsigned int xrgb = *(unsigned short*)sourceElement;
2011
2012					unsigned int r = ((xrgb & 0x0F00) * 0x00001100) & 0x00FF0000;
2013					unsigned int g = ((xrgb & 0x00F0) * 0x00000110) & 0x0000FF00;
2014					unsigned int b =  (xrgb & 0x000F) * 0x00000011;
2015
2016					*(unsigned int*)destinationElement = 0xFF000000 | r | g | b;
2017
2018					sourceElement += source.bytes;
2019					destinationElement += destination.bytes;
2020				}
2021
2022				sourceRow += source.pitchB;
2023				destinationRow += destination.pitchB;
2024			}
2025
2026			sourceSlice += source.sliceB;
2027			destinationSlice += destination.sliceB;
2028		}
2029	}
2030
2031	void Surface::decodeA4R4G4B4(Buffer &destination, const Buffer &source)
2032	{
2033		unsigned char *sourceSlice = (unsigned char*)source.buffer;
2034		unsigned char *destinationSlice = (unsigned char*)destination.buffer;
2035
2036		for(int z = 0; z < destination.depth && z < source.depth; z++)
2037		{
2038			unsigned char *sourceRow = sourceSlice;
2039			unsigned char *destinationRow = destinationSlice;
2040
2041			for(int y = 0; y < destination.height && y < source.height; y++)
2042			{
2043				unsigned char *sourceElement = sourceRow;
2044				unsigned char *destinationElement = destinationRow;
2045
2046				for(int x = 0; x < destination.width && x < source.width; x++)
2047				{
2048					unsigned int argb = *(unsigned short*)sourceElement;
2049
2050					unsigned int a = ((argb & 0xF000) * 0x00011000) & 0xFF000000;
2051					unsigned int r = ((argb & 0x0F00) * 0x00001100) & 0x00FF0000;
2052					unsigned int g = ((argb & 0x00F0) * 0x00000110) & 0x0000FF00;
2053					unsigned int b =  (argb & 0x000F) * 0x00000011;
2054
2055					*(unsigned int*)destinationElement = a | r | g | b;
2056
2057					sourceElement += source.bytes;
2058					destinationElement += destination.bytes;
2059				}
2060
2061				sourceRow += source.pitchB;
2062				destinationRow += destination.pitchB;
2063			}
2064
2065			sourceSlice += source.sliceB;
2066			destinationSlice += destination.sliceB;
2067		}
2068	}
2069
2070	void Surface::decodeP8(Buffer &destination, const Buffer &source)
2071	{
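		// Look up each 8-bit index in the current palette (ABGR order) and swizzle the entry
		// into the ARGB layout of the destination.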
2072		unsigned char *sourceSlice = (unsigned char*)source.buffer;
2073		unsigned char *destinationSlice = (unsigned char*)destination.buffer;
2074
2075		for(int z = 0; z < destination.depth && z < source.depth; z++)
2076		{
2077			unsigned char *sourceRow = sourceSlice;
2078			unsigned char *destinationRow = destinationSlice;
2079
2080			for(int y = 0; y < destination.height && y < source.height; y++)
2081			{
2082				unsigned char *sourceElement = sourceRow;
2083				unsigned char *destinationElement = destinationRow;
2084
2085				for(int x = 0; x < destination.width && x < source.width; x++)
2086				{
2087					unsigned int abgr = palette[*(unsigned char*)sourceElement];
2088
2089					unsigned int r = (abgr & 0x000000FF) << 16;
2090					unsigned int g = (abgr & 0x0000FF00) << 0;
2091					unsigned int b = (abgr & 0x00FF0000) >> 16;
2092					unsigned int a = (abgr & 0xFF000000) >> 0;
2093
2094					*(unsigned int*)destinationElement = a | r | g | b;
2095
2096					sourceElement += source.bytes;
2097					destinationElement += destination.bytes;
2098				}
2099
2100				sourceRow += source.pitchB;
2101				destinationRow += destination.pitchB;
2102			}
2103
2104			sourceSlice += source.sliceB;
2105			destinationSlice += destination.sliceB;
2106		}
2107	}
2108
2109#if S3TC_SUPPORT
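	// DXT1 (BC1): each 8-byte block stores two RGB565 endpoints and sixteen 2-bit indices.
	// When c0 > c1 the two remaining palette entries are interpolated at 1/3 and 2/3 between
	// the endpoints; otherwise entry 2 is their average and entry 3 is transparent black.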
2110	void Surface::decodeDXT1(Buffer &internal, const Buffer &external)
2111	{
2112		unsigned int *destSlice = (unsigned int*)internal.buffer;
2113		const DXT1 *source = (const DXT1*)external.buffer;
2114
2115		for(int z = 0; z < external.depth; z++)
2116		{
2117			unsigned int *dest = destSlice;
2118
2119			for(int y = 0; y < external.height; y += 4)
2120			{
2121				for(int x = 0; x < external.width; x += 4)
2122				{
2123					Color<byte> c[4];
2124
2125					c[0] = source->c0;
2126					c[1] = source->c1;
2127
2128					if(source->c0 > source->c1)   // No transparency
2129					{
2130						// c2 = 2 / 3 * c0 + 1 / 3 * c1
2131						c[2].r = (byte)((2 * (word)c[0].r + (word)c[1].r + 1) / 3);
2132						c[2].g = (byte)((2 * (word)c[0].g + (word)c[1].g + 1) / 3);
2133						c[2].b = (byte)((2 * (word)c[0].b + (word)c[1].b + 1) / 3);
2134						c[2].a = 0xFF;
2135
2136						// c3 = 1 / 3 * c0 + 2 / 3 * c1
2137						c[3].r = (byte)(((word)c[0].r + 2 * (word)c[1].r + 1) / 3);
2138						c[3].g = (byte)(((word)c[0].g + 2 * (word)c[1].g + 1) / 3);
2139						c[3].b = (byte)(((word)c[0].b + 2 * (word)c[1].b + 1) / 3);
2140						c[3].a = 0xFF;
2141					}
2142					else   // c3 transparent
2143					{
2144						// c2 = 1 / 2 * c0 + 1 / 2 * c1
2145						c[2].r = (byte)(((word)c[0].r + (word)c[1].r) / 2);
2146						c[2].g = (byte)(((word)c[0].g + (word)c[1].g) / 2);
2147						c[2].b = (byte)(((word)c[0].b + (word)c[1].b) / 2);
2148						c[2].a = 0xFF;
2149
2150						c[3].r = 0;
2151						c[3].g = 0;
2152						c[3].b = 0;
2153						c[3].a = 0;
2154					}
2155
2156					for(int j = 0; j < 4 && (y + j) < internal.height; j++)
2157					{
2158						for(int i = 0; i < 4 && (x + i) < internal.width; i++)
2159						{
2160							dest[(x + i) + (y + j) * internal.width] = c[(unsigned int)(source->lut >> 2 * (i + j * 4)) % 4];
2161						}
2162					}
2163
2164					source++;
2165				}
2166			}
2167
2168			(byte*&)destSlice += internal.sliceB;
2169		}
2170	}
2171
2172	void Surface::decodeDXT3(Buffer &internal, const Buffer &external)
2173	{
2174		unsigned int *destSlice = (unsigned int*)internal.buffer;
2175		const DXT3 *source = (const DXT3*)external.buffer;
2176
2177		for(int z = 0; z < external.depth; z++)
2178		{
2179			unsigned int *dest = destSlice;
2180
2181			for(int y = 0; y < external.height; y += 4)
2182			{
2183				for(int x = 0; x < external.width; x += 4)
2184				{
2185					Color<byte> c[4];
2186
2187					c[0] = source->c0;
2188					c[1] = source->c1;
2189
2190					// c2 = 2 / 3 * c0 + 1 / 3 * c1
2191					c[2].r = (byte)((2 * (word)c[0].r + (word)c[1].r + 1) / 3);
2192					c[2].g = (byte)((2 * (word)c[0].g + (word)c[1].g + 1) / 3);
2193					c[2].b = (byte)((2 * (word)c[0].b + (word)c[1].b + 1) / 3);
2194
2195					// c3 = 1 / 3 * c0 + 2 / 3 * c1
2196					c[3].r = (byte)(((word)c[0].r + 2 * (word)c[1].r + 1) / 3);
2197					c[3].g = (byte)(((word)c[0].g + 2 * (word)c[1].g + 1) / 3);
2198					c[3].b = (byte)(((word)c[0].b + 2 * (word)c[1].b + 1) / 3);
2199
2200					for(int j = 0; j < 4 && (y + j) < internal.height; j++)
2201					{
2202						for(int i = 0; i < 4 && (x + i) < internal.width; i++)
2203						{
2204							unsigned int a = (unsigned int)(source->a >> 4 * (i + j * 4)) & 0x0F;
2205							unsigned int color = (c[(unsigned int)(source->lut >> 2 * (i + j * 4)) % 4] & 0x00FFFFFF) | ((a << 28) + (a << 24));
2206
2207							dest[(x + i) + (y + j) * internal.width] = color;
2208						}
2209					}
2210
2211					source++;
2212				}
2213			}
2214
2215			(byte*&)destSlice += internal.sliceB;
2216		}
2217	}
2218
2219	void Surface::decodeDXT5(Buffer &internal, const Buffer &external)
2220	{
2221		unsigned int *destSlice = (unsigned int*)internal.buffer;
2222		const DXT5 *source = (const DXT5*)external.buffer;
2223
2224		for(int z = 0; z < external.depth; z++)
2225		{
2226			unsigned int *dest = destSlice;
2227
2228			for(int y = 0; y < external.height; y += 4)
2229			{
2230				for(int x = 0; x < external.width; x += 4)
2231				{
2232					Color<byte> c[4];
2233
2234					c[0] = source->c0;
2235					c[1] = source->c1;
2236
2237					// c2 = 2 / 3 * c0 + 1 / 3 * c1
2238					c[2].r = (byte)((2 * (word)c[0].r + (word)c[1].r + 1) / 3);
2239					c[2].g = (byte)((2 * (word)c[0].g + (word)c[1].g + 1) / 3);
2240					c[2].b = (byte)((2 * (word)c[0].b + (word)c[1].b + 1) / 3);
2241
2242					// c3 = 1 / 3 * c0 + 2 / 3 * c1
2243					c[3].r = (byte)(((word)c[0].r + 2 * (word)c[1].r + 1) / 3);
2244					c[3].g = (byte)(((word)c[0].g + 2 * (word)c[1].g + 1) / 3);
2245					c[3].b = (byte)(((word)c[0].b + 2 * (word)c[1].b + 1) / 3);
2246
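					// Eight-entry alpha palette: six interpolated values when a0 > a1,
					// otherwise four interpolated values plus 0 and 0xFF.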
2247					byte a[8];
2248
2249					a[0] = source->a0;
2250					a[1] = source->a1;
2251
2252					if(a[0] > a[1])
2253					{
2254						a[2] = (byte)((6 * (word)a[0] + 1 * (word)a[1] + 3) / 7);
2255						a[3] = (byte)((5 * (word)a[0] + 2 * (word)a[1] + 3) / 7);
2256						a[4] = (byte)((4 * (word)a[0] + 3 * (word)a[1] + 3) / 7);
2257						a[5] = (byte)((3 * (word)a[0] + 4 * (word)a[1] + 3) / 7);
2258						a[6] = (byte)((2 * (word)a[0] + 5 * (word)a[1] + 3) / 7);
2259						a[7] = (byte)((1 * (word)a[0] + 6 * (word)a[1] + 3) / 7);
2260					}
2261					else
2262					{
2263						a[2] = (byte)((4 * (word)a[0] + 1 * (word)a[1] + 2) / 5);
2264						a[3] = (byte)((3 * (word)a[0] + 2 * (word)a[1] + 2) / 5);
2265						a[4] = (byte)((2 * (word)a[0] + 3 * (word)a[1] + 2) / 5);
2266						a[5] = (byte)((1 * (word)a[0] + 4 * (word)a[1] + 2) / 5);
2267						a[6] = 0;
2268						a[7] = 0xFF;
2269					}
2270
2271					for(int j = 0; j < 4 && (y + j) < internal.height; j++)
2272					{
2273						for(int i = 0; i < 4 && (x + i) < internal.width; i++)
2274						{
2275							unsigned int alpha = (unsigned int)a[(unsigned int)(source->alut >> (16 + 3 * (i + j * 4))) % 8] << 24;
2276							unsigned int color = (c[(source->clut >> 2 * (i + j * 4)) % 4] & 0x00FFFFFF) | alpha;
2277
2278							dest[(x + i) + (y + j) * internal.width] = color;
2279						}
2280					}
2281
2282					source++;
2283				}
2284			}
2285
2286			(byte*&)destSlice += internal.sliceB;
2287		}
2288	}
2289#endif
2290
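	// ATI1 (BC4) stores a single channel per 8-byte block, using the same eight-entry
	// interpolation scheme as the DXT5 alpha block; ATI2 (BC5) below is the two-channel variant.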
2291	void Surface::decodeATI1(Buffer &internal, const Buffer &external)
2292	{
2293		byte *destSlice = (byte*)internal.buffer;
2294		const ATI1 *source = (const ATI1*)external.buffer;
2295
2296		for(int z = 0; z < external.depth; z++)
2297		{
2298			byte *dest = destSlice;
2299
2300			for(int y = 0; y < external.height; y += 4)
2301			{
2302				for(int x = 0; x < external.width; x += 4)
2303				{
2304					byte r[8];
2305
2306					r[0] = source->r0;
2307					r[1] = source->r1;
2308
2309					if(r[0] > r[1])
2310					{
2311						r[2] = (byte)((6 * (word)r[0] + 1 * (word)r[1] + 3) / 7);
2312						r[3] = (byte)((5 * (word)r[0] + 2 * (word)r[1] + 3) / 7);
2313						r[4] = (byte)((4 * (word)r[0] + 3 * (word)r[1] + 3) / 7);
2314						r[5] = (byte)((3 * (word)r[0] + 4 * (word)r[1] + 3) / 7);
2315						r[6] = (byte)((2 * (word)r[0] + 5 * (word)r[1] + 3) / 7);
2316						r[7] = (byte)((1 * (word)r[0] + 6 * (word)r[1] + 3) / 7);
2317					}
2318					else
2319					{
2320						r[2] = (byte)((4 * (word)r[0] + 1 * (word)r[1] + 2) / 5);
2321						r[3] = (byte)((3 * (word)r[0] + 2 * (word)r[1] + 2) / 5);
2322						r[4] = (byte)((2 * (word)r[0] + 3 * (word)r[1] + 2) / 5);
2323						r[5] = (byte)((1 * (word)r[0] + 4 * (word)r[1] + 2) / 5);
2324						r[6] = 0;
2325						r[7] = 0xFF;
2326					}
2327
2328					for(int j = 0; j < 4 && (y + j) < internal.height; j++)
2329					{
2330						for(int i = 0; i < 4 && (x + i) < internal.width; i++)
2331						{
2332							dest[(x + i) + (y + j) * internal.width] = r[(unsigned int)(source->rlut >> (16 + 3 * (i + j * 4))) % 8];
2333						}
2334					}
2335
2336					source++;
2337				}
2338			}
2339
2340			destSlice += internal.sliceB;
2341		}
2342	}
2343
2344	void Surface::decodeATI2(Buffer &internal, const Buffer &external)
2345	{
2346		word *destSlice = (word*)internal.buffer;
2347		const ATI2 *source = (const ATI2*)external.buffer;
2348
2349		for(int z = 0; z < external.depth; z++)
2350		{
2351			word *dest = destSlice;
2352
2353			for(int y = 0; y < external.height; y += 4)
2354			{
2355				for(int x = 0; x < external.width; x += 4)
2356				{
2357					byte X[8];
2358
2359					X[0] = source->x0;
2360					X[1] = source->x1;
2361
2362					if(X[0] > X[1])
2363					{
2364						X[2] = (byte)((6 * (word)X[0] + 1 * (word)X[1] + 3) / 7);
2365						X[3] = (byte)((5 * (word)X[0] + 2 * (word)X[1] + 3) / 7);
2366						X[4] = (byte)((4 * (word)X[0] + 3 * (word)X[1] + 3) / 7);
2367						X[5] = (byte)((3 * (word)X[0] + 4 * (word)X[1] + 3) / 7);
2368						X[6] = (byte)((2 * (word)X[0] + 5 * (word)X[1] + 3) / 7);
2369						X[7] = (byte)((1 * (word)X[0] + 6 * (word)X[1] + 3) / 7);
2370					}
2371					else
2372					{
2373						X[2] = (byte)((4 * (word)X[0] + 1 * (word)X[1] + 2) / 5);
2374						X[3] = (byte)((3 * (word)X[0] + 2 * (word)X[1] + 2) / 5);
2375						X[4] = (byte)((2 * (word)X[0] + 3 * (word)X[1] + 2) / 5);
2376						X[5] = (byte)((1 * (word)X[0] + 4 * (word)X[1] + 2) / 5);
2377						X[6] = 0;
2378						X[7] = 0xFF;
2379					}
2380
2381					byte Y[8];
2382
2383					Y[0] = source->y0;
2384					Y[1] = source->y1;
2385
2386					if(Y[0] > Y[1])
2387					{
2388						Y[2] = (byte)((6 * (word)Y[0] + 1 * (word)Y[1] + 3) / 7);
2389						Y[3] = (byte)((5 * (word)Y[0] + 2 * (word)Y[1] + 3) / 7);
2390						Y[4] = (byte)((4 * (word)Y[0] + 3 * (word)Y[1] + 3) / 7);
2391						Y[5] = (byte)((3 * (word)Y[0] + 4 * (word)Y[1] + 3) / 7);
2392						Y[6] = (byte)((2 * (word)Y[0] + 5 * (word)Y[1] + 3) / 7);
2393						Y[7] = (byte)((1 * (word)Y[0] + 6 * (word)Y[1] + 3) / 7);
2394					}
2395					else
2396					{
2397						Y[2] = (byte)((4 * (word)Y[0] + 1 * (word)Y[1] + 2) / 5);
2398						Y[3] = (byte)((3 * (word)Y[0] + 2 * (word)Y[1] + 2) / 5);
2399						Y[4] = (byte)((2 * (word)Y[0] + 3 * (word)Y[1] + 2) / 5);
2400						Y[5] = (byte)((1 * (word)Y[0] + 4 * (word)Y[1] + 2) / 5);
2401						Y[6] = 0;
2402						Y[7] = 0xFF;
2403					}
2404
2405					for(int j = 0; j < 4 && (y + j) < internal.height; j++)
2406					{
2407						for(int i = 0; i < 4 && (x + i) < internal.width; i++)
2408						{
2409							word r = X[(unsigned int)(source->xlut >> (16 + 3 * (i + j * 4))) % 8];
2410							word g = Y[(unsigned int)(source->ylut >> (16 + 3 * (i + j * 4))) % 8];
2411
2412							dest[(x + i) + (y + j) * internal.width] = (g << 8) + r;
2413						}
2414					}
2415
2416					source++;
2417				}
2418			}
2419
2420			(byte*&)destSlice += internal.sliceB;
2421		}
2422	}
2423
2424	void Surface::decodeETC2(Buffer &internal, const Buffer &external, int nbAlphaBits, bool isSRGB)
2425	{
2426		ETC_Decoder::Decode((const byte*)external.buffer, (byte*)internal.buffer, external.width, external.height, internal.width, internal.height, internal.pitchB, internal.bytes,
2427		                    (nbAlphaBits == 8) ? ETC_Decoder::ETC_RGBA : ((nbAlphaBits == 1) ? ETC_Decoder::ETC_RGB_PUNCHTHROUGH_ALPHA : ETC_Decoder::ETC_RGB));
2428
2429		if(isSRGB)
2430		{
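			// Lazily build a 256-entry sRGB-to-linear table, then convert the decoded image
			// in place. Only the first three (color) channels are converted.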
2431			static byte sRGBtoLinearTable[256];
2432			static bool sRGBtoLinearTableDirty = true;
2433			if(sRGBtoLinearTableDirty)
2434			{
2435				for(int i = 0; i < 256; i++)
2436				{
2437					sRGBtoLinearTable[i] = static_cast<byte>(sRGBtoLinear(static_cast<float>(i) / 255.0f) * 255.0f + 0.5f);
2438				}
2439				sRGBtoLinearTableDirty = false;
2440			}
2441
2442			// Perform sRGB conversion in place after decoding
2443			byte* src = (byte*)internal.buffer;
2444			for(int y = 0; y < internal.height; y++)
2445			{
2446				byte* srcRow = src + y * internal.pitchB;
2447				for(int x = 0; x < internal.width; x++)
2448				{
2449					byte* srcPix = srcRow + x * internal.bytes;
2450					for(int i = 0; i < 3; i++)
2451					{
2452						srcPix[i] = sRGBtoLinearTable[srcPix[i]];
2453					}
2454				}
2455			}
2456		}
2457	}
2458
2459	void Surface::decodeEAC(Buffer &internal, const Buffer &external, int nbChannels, bool isSigned)
2460	{
2461		ASSERT(nbChannels == 1 || nbChannels == 2);
2462
2463		ETC_Decoder::Decode((const byte*)external.buffer, (byte*)internal.buffer, external.width, external.height, internal.width, internal.height, internal.pitchB, internal.bytes,
2464		                    (nbChannels == 1) ? (isSigned ? ETC_Decoder::ETC_R_SIGNED : ETC_Decoder::ETC_R_UNSIGNED) : (isSigned ? ETC_Decoder::ETC_RG_SIGNED : ETC_Decoder::ETC_RG_UNSIGNED));
2465
2466		// FIXME: We convert signed data to float, until signed integer internal formats are supported
2467		//        This code can be removed if signed ETC2 images are decoded to internal 8 bit signed R/RG formats
2468		if(isSigned)
2469		{
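			// Widen the decoded signed bytes to normalized floats in place. Rows are processed
			// from right to left so source bytes are read before the wider results overwrite them.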
2470			sbyte* src = (sbyte*)internal.buffer;
2471
2472			for(int y = 0; y < internal.height; y++)
2473			{
2474				sbyte* srcRow = src + y * internal.pitchB;
2475				for(int x = internal.width - 1; x >= 0; x--)
2476				{
2477					int dx = x & 0xFFFFFFFC;
2478					int mx = x - dx;
2479					sbyte* srcPix = srcRow + dx * internal.bytes + mx * nbChannels;
2480					float* dstPix = (float*)(srcRow + x * internal.bytes);
2481					for(int c = nbChannels - 1; c >= 0; c--)
2482					{
2483						static const float normalization = 1.0f / 127.875f;
2484						dstPix[c] = clamp(static_cast<float>(srcPix[c]) * normalization, -1.0f, 1.0f);
2485					}
2486				}
2487			}
2488		}
2489	}
2490
2491	void Surface::decodeASTC(Buffer &internal, const Buffer &external, int xBlockSize, int yBlockSize, int zBlockSize, bool isSRGB)
2492	{
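		// ASTC decoding is not implemented; the destination buffer is left untouched.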
2493	}
2494
2495	unsigned int Surface::size(int width, int height, int depth, Format format)
2496	{
2497		// Dimensions rounded up to multiples of 4, used for compressed formats
2498		int width4 = align(width, 4);
2499		int height4 = align(height, 4);
2500
2501		switch(format)
2502		{
2503		#if S3TC_SUPPORT
2504		case FORMAT_DXT1:
2505		#endif
2506		case FORMAT_ATI1:
2507		case FORMAT_ETC1:
2508		case FORMAT_R11_EAC:
2509		case FORMAT_SIGNED_R11_EAC:
2510		case FORMAT_RGB8_ETC2:
2511		case FORMAT_SRGB8_ETC2:
2512		case FORMAT_RGB8_PUNCHTHROUGH_ALPHA1_ETC2:
2513		case FORMAT_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2:
2514			return width4 * height4 * depth / 2;
2515		#if S3TC_SUPPORT
2516		case FORMAT_DXT3:
2517		case FORMAT_DXT5:
2518		#endif
2519		case FORMAT_ATI2:
2520		case FORMAT_RG11_EAC:
2521		case FORMAT_SIGNED_RG11_EAC:
2522		case FORMAT_RGBA8_ETC2_EAC:
2523		case FORMAT_SRGB8_ALPHA8_ETC2_EAC:
2524		case FORMAT_RGBA_ASTC_4x4_KHR:
2525		case FORMAT_SRGB8_ALPHA8_ASTC_4x4_KHR:
2526			return width4 * height4 * depth;
2527		case FORMAT_RGBA_ASTC_5x4_KHR:
2528		case FORMAT_SRGB8_ALPHA8_ASTC_5x4_KHR:
2529			return align(width, 5) * height4 * depth;
2530		case FORMAT_RGBA_ASTC_5x5_KHR:
2531		case FORMAT_SRGB8_ALPHA8_ASTC_5x5_KHR:
2532			return align(width, 5) * align(height, 5) * depth;
2533		case FORMAT_RGBA_ASTC_6x5_KHR:
2534		case FORMAT_SRGB8_ALPHA8_ASTC_6x5_KHR:
2535			return align(width, 6) * align(height, 5) * depth;
2536		case FORMAT_RGBA_ASTC_6x6_KHR:
2537		case FORMAT_SRGB8_ALPHA8_ASTC_6x6_KHR:
2538			return align(width, 6) * align(height, 6) * depth;
2539		case FORMAT_RGBA_ASTC_8x5_KHR:
2540		case FORMAT_SRGB8_ALPHA8_ASTC_8x5_KHR:
2541			return align(width, 8) * align(height, 5) * depth;
2542		case FORMAT_RGBA_ASTC_8x6_KHR:
2543		case FORMAT_SRGB8_ALPHA8_ASTC_8x6_KHR:
2544			return align(width, 8) * align(height, 6) * depth;
2545		case FORMAT_RGBA_ASTC_8x8_KHR:
2546		case FORMAT_SRGB8_ALPHA8_ASTC_8x8_KHR:
2547			return align(width, 8) * align(height, 8) * depth;
2548		case FORMAT_RGBA_ASTC_10x5_KHR:
2549		case FORMAT_SRGB8_ALPHA8_ASTC_10x5_KHR:
2550			return align(width, 10) * align(height, 5) * depth;
2551		case FORMAT_RGBA_ASTC_10x6_KHR:
2552		case FORMAT_SRGB8_ALPHA8_ASTC_10x6_KHR:
2553			return align(width, 10) * align(height, 6) * depth;
2554		case FORMAT_RGBA_ASTC_10x8_KHR:
2555		case FORMAT_SRGB8_ALPHA8_ASTC_10x8_KHR:
2556			return align(width, 10) * align(height, 8) * depth;
2557		case FORMAT_RGBA_ASTC_10x10_KHR:
2558		case FORMAT_SRGB8_ALPHA8_ASTC_10x10_KHR:
2559			return align(width, 10) * align(height, 10) * depth;
2560		case FORMAT_RGBA_ASTC_12x10_KHR:
2561		case FORMAT_SRGB8_ALPHA8_ASTC_12x10_KHR:
2562			return align(width, 12) * align(height, 10) * depth;
2563		case FORMAT_RGBA_ASTC_12x12_KHR:
2564		case FORMAT_SRGB8_ALPHA8_ASTC_12x12_KHR:
2565			return align(width, 12) * align(height, 12) * depth;
2566		case FORMAT_YV12_BT601:
2567		case FORMAT_YV12_BT709:
2568		case FORMAT_YV12_JFIF:
2569			{
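				// Planar YV12: a full-resolution Y plane followed by two chroma planes at half
				// the width and height, with each plane's rows aligned to 16 bytes.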
2570				unsigned int YStride = align(width, 16);
2571				unsigned int YSize = YStride * height;
2572				unsigned int CStride = align(YStride / 2, 16);
2573				unsigned int CSize = CStride * height / 2;
2574
2575				return YSize + 2 * CSize;
2576			}
2577		default:
2578			return bytes(format) * width * height * depth;
2579		}
2580
2581		return 0;
2582	}
2583
2584	bool Surface::isStencil(Format format)
2585	{
2586		switch(format)
2587		{
2588		case FORMAT_D32:
2589		case FORMAT_D16:
2590		case FORMAT_D24X8:
2591		case FORMAT_D32F:
2592		case FORMAT_D32F_COMPLEMENTARY:
2593		case FORMAT_D32F_LOCKABLE:
2594			return false;
2595		case FORMAT_D24S8:
2596		case FORMAT_D24FS8:
2597		case FORMAT_S8:
2598		case FORMAT_DF24S8:
2599		case FORMAT_DF16S8:
2600		case FORMAT_D32FS8_TEXTURE:
2601		case FORMAT_D32FS8_SHADOW:
2602		case FORMAT_INTZ:
2603			return true;
2604		default:
2605			return false;
2606		}
2607	}
2608
2609	bool Surface::isDepth(Format format)
2610	{
2611		switch(format)
2612		{
2613		case FORMAT_D32:
2614		case FORMAT_D16:
2615		case FORMAT_D24X8:
2616		case FORMAT_D24S8:
2617		case FORMAT_D24FS8:
2618		case FORMAT_D32F:
2619		case FORMAT_D32F_COMPLEMENTARY:
2620		case FORMAT_D32F_LOCKABLE:
2621		case FORMAT_DF24S8:
2622		case FORMAT_DF16S8:
2623		case FORMAT_D32FS8_TEXTURE:
2624		case FORMAT_D32FS8_SHADOW:
2625		case FORMAT_INTZ:
2626			return true;
2627		case FORMAT_S8:
2628			return false;
2629		default:
2630			return false;
2631		}
2632	}
2633
2634	bool Surface::isPalette(Format format)
2635	{
2636		switch(format)
2637		{
2638		case FORMAT_P8:
2639		case FORMAT_A8P8:
2640			return true;
2641		default:
2642			return false;
2643		}
2644	}
2645
2646	bool Surface::isFloatFormat(Format format)
2647	{
2648		switch(format)
2649		{
2650		case FORMAT_R5G6B5:
2651		case FORMAT_R8G8B8:
2652		case FORMAT_B8G8R8:
2653		case FORMAT_X8R8G8B8:
2654		case FORMAT_X8B8G8R8I:
2655		case FORMAT_X8B8G8R8:
2656		case FORMAT_A8R8G8B8:
2657		case FORMAT_SRGB8_X8:
2658		case FORMAT_SRGB8_A8:
2659		case FORMAT_A8B8G8R8I:
2660		case FORMAT_R8UI:
2661		case FORMAT_G8R8UI:
2662		case FORMAT_X8B8G8R8UI:
2663		case FORMAT_A8B8G8R8UI:
2664		case FORMAT_A8B8G8R8:
2665		case FORMAT_G8R8I:
2666		case FORMAT_G8R8:
2667		case FORMAT_A2B10G10R10:
2668		case FORMAT_R8I_SNORM:
2669		case FORMAT_G8R8I_SNORM:
2670		case FORMAT_X8B8G8R8I_SNORM:
2671		case FORMAT_A8B8G8R8I_SNORM:
2672		case FORMAT_R16I:
2673		case FORMAT_R16UI:
2674		case FORMAT_G16R16I:
2675		case FORMAT_G16R16UI:
2676		case FORMAT_G16R16:
2677		case FORMAT_X16B16G16R16I:
2678		case FORMAT_X16B16G16R16UI:
2679		case FORMAT_A16B16G16R16I:
2680		case FORMAT_A16B16G16R16UI:
2681		case FORMAT_A16B16G16R16:
2682		case FORMAT_V8U8:
2683		case FORMAT_Q8W8V8U8:
2684		case FORMAT_X8L8V8U8:
2685		case FORMAT_V16U16:
2686		case FORMAT_A16W16V16U16:
2687		case FORMAT_Q16W16V16U16:
2688		case FORMAT_A8:
2689		case FORMAT_R8I:
2690		case FORMAT_R8:
2691		case FORMAT_L8:
2692		case FORMAT_L16:
2693		case FORMAT_A8L8:
2694		case FORMAT_YV12_BT601:
2695		case FORMAT_YV12_BT709:
2696		case FORMAT_YV12_JFIF:
2697		case FORMAT_R32I:
2698		case FORMAT_R32UI:
2699		case FORMAT_G32R32I:
2700		case FORMAT_G32R32UI:
2701		case FORMAT_X32B32G32R32I:
2702		case FORMAT_X32B32G32R32UI:
2703		case FORMAT_A32B32G32R32I:
2704		case FORMAT_A32B32G32R32UI:
2705			return false;
2706		case FORMAT_R32F:
2707		case FORMAT_G32R32F:
2708		case FORMAT_X32B32G32R32F:
2709		case FORMAT_A32B32G32R32F:
2710		case FORMAT_D32F:
2711		case FORMAT_D32F_COMPLEMENTARY:
2712		case FORMAT_D32F_LOCKABLE:
2713		case FORMAT_D32FS8_TEXTURE:
2714		case FORMAT_D32FS8_SHADOW:
2715		case FORMAT_L16F:
2716		case FORMAT_A16L16F:
2717		case FORMAT_L32F:
2718		case FORMAT_A32L32F:
2719			return true;
2720		default:
2721			ASSERT(false);
2722		}
2723
2724		return false;
2725	}
2726
2727	bool Surface::isUnsignedComponent(Format format, int component)
2728	{
2729		switch(format)
2730		{
2731		case FORMAT_NULL:
2732		case FORMAT_R5G6B5:
2733		case FORMAT_R8G8B8:
2734		case FORMAT_B8G8R8:
2735		case FORMAT_X8R8G8B8:
2736		case FORMAT_X8B8G8R8:
2737		case FORMAT_A8R8G8B8:
2738		case FORMAT_A8B8G8R8:
2739		case FORMAT_SRGB8_X8:
2740		case FORMAT_SRGB8_A8:
2741		case FORMAT_G8R8:
2742		case FORMAT_A2B10G10R10:
2743		case FORMAT_R16UI:
2744		case FORMAT_G16R16:
2745		case FORMAT_G16R16UI:
2746		case FORMAT_X16B16G16R16UI:
2747		case FORMAT_A16B16G16R16:
2748		case FORMAT_A16B16G16R16UI:
2749		case FORMAT_R32UI:
2750		case FORMAT_G32R32UI:
2751		case FORMAT_X32B32G32R32UI:
2752		case FORMAT_A32B32G32R32UI:
2753		case FORMAT_R8UI:
2754		case FORMAT_G8R8UI:
2755		case FORMAT_X8B8G8R8UI:
2756		case FORMAT_A8B8G8R8UI:
2757		case FORMAT_D32F:
2758		case FORMAT_D32F_COMPLEMENTARY:
2759		case FORMAT_D32F_LOCKABLE:
2760		case FORMAT_D32FS8_TEXTURE:
2761		case FORMAT_D32FS8_SHADOW:
2762		case FORMAT_A8:
2763		case FORMAT_R8:
2764		case FORMAT_L8:
2765		case FORMAT_L16:
2766		case FORMAT_A8L8:
2767		case FORMAT_YV12_BT601:
2768		case FORMAT_YV12_BT709:
2769		case FORMAT_YV12_JFIF:
2770			return true;
2771		case FORMAT_A8B8G8R8I:
2772		case FORMAT_A16B16G16R16I:
2773		case FORMAT_A32B32G32R32I:
2774		case FORMAT_A8B8G8R8I_SNORM:
2775		case FORMAT_Q8W8V8U8:
2776		case FORMAT_Q16W16V16U16:
2777		case FORMAT_A32B32G32R32F:
2778			return false;
2779		case FORMAT_R32F:
2780		case FORMAT_R8I:
2781		case FORMAT_R16I:
2782		case FORMAT_R32I:
2783		case FORMAT_R8I_SNORM:
2784			return component >= 1;
2785		case FORMAT_V8U8:
2786		case FORMAT_X8L8V8U8:
2787		case FORMAT_V16U16:
2788		case FORMAT_G32R32F:
2789		case FORMAT_G8R8I:
2790		case FORMAT_G16R16I:
2791		case FORMAT_G32R32I:
2792		case FORMAT_G8R8I_SNORM:
2793			return component >= 2;
2794		case FORMAT_A16W16V16U16:
2795		case FORMAT_X32B32G32R32F:
2796		case FORMAT_X8B8G8R8I:
2797		case FORMAT_X16B16G16R16I:
2798		case FORMAT_X32B32G32R32I:
2799		case FORMAT_X8B8G8R8I_SNORM:
2800			return component >= 3;
2801		default:
2802			ASSERT(false);
2803		}
2804
2805		return false;
2806	}
2807
2808	bool Surface::isSRGBreadable(Format format)
2809	{
2810		// Keep in sync with Capabilities::isSRGBreadable
2811		switch(format)
2812		{
2813		case FORMAT_L8:
2814		case FORMAT_A8L8:
2815		case FORMAT_R8G8B8:
2816		case FORMAT_A8R8G8B8:
2817		case FORMAT_X8R8G8B8:
2818		case FORMAT_A8B8G8R8:
2819		case FORMAT_X8B8G8R8:
2820		case FORMAT_SRGB8_X8:
2821		case FORMAT_SRGB8_A8:
2822		case FORMAT_R5G6B5:
2823		case FORMAT_X1R5G5B5:
2824		case FORMAT_A1R5G5B5:
2825		case FORMAT_A4R4G4B4:
2826		#if S3TC_SUPPORT
2827		case FORMAT_DXT1:
2828		case FORMAT_DXT3:
2829		case FORMAT_DXT5:
2830		#endif
2831		case FORMAT_ATI1:
2832		case FORMAT_ATI2:
2833			return true;
2834		default:
2835			return false;
2836		}
2837
2838		return false;
2839	}
2840
2841	bool Surface::isSRGBwritable(Format format)
2842	{
2843		// Keep in sync with Capabilities::isSRGBwritable
2844		switch(format)
2845		{
2846		case FORMAT_NULL:
2847		case FORMAT_A8R8G8B8:
2848		case FORMAT_X8R8G8B8:
2849		case FORMAT_A8B8G8R8:
2850		case FORMAT_X8B8G8R8:
2851		case FORMAT_SRGB8_X8:
2852		case FORMAT_SRGB8_A8:
2853		case FORMAT_R5G6B5:
2854			return true;
2855		default:
2856			return false;
2857		}
2858	}
2859
2860	bool Surface::isCompressed(Format format)
2861	{
2862		switch(format)
2863		{
2864		#if S3TC_SUPPORT
2865		case FORMAT_DXT1:
2866		case FORMAT_DXT3:
2867		case FORMAT_DXT5:
2868		#endif
2869		case FORMAT_ATI1:
2870		case FORMAT_ATI2:
2871		case FORMAT_ETC1:
2872		case FORMAT_R11_EAC:
2873		case FORMAT_SIGNED_R11_EAC:
2874		case FORMAT_RG11_EAC:
2875		case FORMAT_SIGNED_RG11_EAC:
2876		case FORMAT_RGB8_ETC2:
2877		case FORMAT_SRGB8_ETC2:
2878		case FORMAT_RGB8_PUNCHTHROUGH_ALPHA1_ETC2:
2879		case FORMAT_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2:
2880		case FORMAT_RGBA8_ETC2_EAC:
2881		case FORMAT_SRGB8_ALPHA8_ETC2_EAC:
2882		case FORMAT_RGBA_ASTC_4x4_KHR:
2883		case FORMAT_RGBA_ASTC_5x4_KHR:
2884		case FORMAT_RGBA_ASTC_5x5_KHR:
2885		case FORMAT_RGBA_ASTC_6x5_KHR:
2886		case FORMAT_RGBA_ASTC_6x6_KHR:
2887		case FORMAT_RGBA_ASTC_8x5_KHR:
2888		case FORMAT_RGBA_ASTC_8x6_KHR:
2889		case FORMAT_RGBA_ASTC_8x8_KHR:
2890		case FORMAT_RGBA_ASTC_10x5_KHR:
2891		case FORMAT_RGBA_ASTC_10x6_KHR:
2892		case FORMAT_RGBA_ASTC_10x8_KHR:
2893		case FORMAT_RGBA_ASTC_10x10_KHR:
2894		case FORMAT_RGBA_ASTC_12x10_KHR:
2895		case FORMAT_RGBA_ASTC_12x12_KHR:
2896		case FORMAT_SRGB8_ALPHA8_ASTC_4x4_KHR:
2897		case FORMAT_SRGB8_ALPHA8_ASTC_5x4_KHR:
2898		case FORMAT_SRGB8_ALPHA8_ASTC_5x5_KHR:
2899		case FORMAT_SRGB8_ALPHA8_ASTC_6x5_KHR:
2900		case FORMAT_SRGB8_ALPHA8_ASTC_6x6_KHR:
2901		case FORMAT_SRGB8_ALPHA8_ASTC_8x5_KHR:
2902		case FORMAT_SRGB8_ALPHA8_ASTC_8x6_KHR:
2903		case FORMAT_SRGB8_ALPHA8_ASTC_8x8_KHR:
2904		case FORMAT_SRGB8_ALPHA8_ASTC_10x5_KHR:
2905		case FORMAT_SRGB8_ALPHA8_ASTC_10x6_KHR:
2906		case FORMAT_SRGB8_ALPHA8_ASTC_10x8_KHR:
2907		case FORMAT_SRGB8_ALPHA8_ASTC_10x10_KHR:
2908		case FORMAT_SRGB8_ALPHA8_ASTC_12x10_KHR:
2909		case FORMAT_SRGB8_ALPHA8_ASTC_12x12_KHR:
2910			return true;
2911		default:
2912			return false;
2913		}
2914	}
2915
2916	bool Surface::isNonNormalizedInteger(Format format)
2917	{
2918		switch(format)
2919		{
2920		case FORMAT_A8B8G8R8I:
2921		case FORMAT_X8B8G8R8I:
2922		case FORMAT_G8R8I:
2923		case FORMAT_R8I:
2924		case FORMAT_A8B8G8R8UI:
2925		case FORMAT_X8B8G8R8UI:
2926		case FORMAT_G8R8UI:
2927		case FORMAT_R8UI:
2928		case FORMAT_A16B16G16R16I:
2929		case FORMAT_X16B16G16R16I:
2930		case FORMAT_G16R16I:
2931		case FORMAT_R16I:
2932		case FORMAT_A16B16G16R16UI:
2933		case FORMAT_X16B16G16R16UI:
2934		case FORMAT_G16R16UI:
2935		case FORMAT_R16UI:
2936		case FORMAT_A32B32G32R32I:
2937		case FORMAT_X32B32G32R32I:
2938		case FORMAT_G32R32I:
2939		case FORMAT_R32I:
2940		case FORMAT_A32B32G32R32UI:
2941		case FORMAT_X32B32G32R32UI:
2942		case FORMAT_G32R32UI:
2943		case FORMAT_R32UI:
2944			return true;
2945		default:
2946			return false;
2947		}
2948	}
2949
2950	int Surface::componentCount(Format format)
2951	{
2952		switch(format)
2953		{
2954		case FORMAT_R5G6B5:         return 3;
2955		case FORMAT_X8R8G8B8:       return 3;
2956		case FORMAT_X8B8G8R8I:      return 3;
2957		case FORMAT_X8B8G8R8:       return 3;
2958		case FORMAT_A8R8G8B8:       return 4;
2959		case FORMAT_SRGB8_X8:       return 3;
2960		case FORMAT_SRGB8_A8:       return 4;
2961		case FORMAT_A8B8G8R8I:      return 4;
2962		case FORMAT_A8B8G8R8:       return 4;
2963		case FORMAT_G8R8I:          return 2;
2964		case FORMAT_G8R8:           return 2;
2965		case FORMAT_R8I_SNORM:      return 1;
2966		case FORMAT_G8R8I_SNORM:    return 2;
2967		case FORMAT_X8B8G8R8I_SNORM:return 3;
2968		case FORMAT_A8B8G8R8I_SNORM:return 4;
2969		case FORMAT_R8UI:           return 1;
2970		case FORMAT_G8R8UI:         return 2;
2971		case FORMAT_X8B8G8R8UI:     return 3;
2972		case FORMAT_A8B8G8R8UI:     return 4;
2973		case FORMAT_A2B10G10R10:    return 4;
2974		case FORMAT_G16R16I:        return 2;
2975		case FORMAT_G16R16UI:       return 2;
2976		case FORMAT_G16R16:         return 2;
2977		case FORMAT_G32R32I:        return 2;
2978		case FORMAT_G32R32UI:       return 2;
2979		case FORMAT_X16B16G16R16I:  return 3;
2980		case FORMAT_X16B16G16R16UI: return 3;
2981		case FORMAT_A16B16G16R16I:  return 4;
2982		case FORMAT_A16B16G16R16UI: return 4;
2983		case FORMAT_A16B16G16R16:   return 4;
2984		case FORMAT_X32B32G32R32I:  return 3;
2985		case FORMAT_X32B32G32R32UI: return 3;
2986		case FORMAT_A32B32G32R32I:  return 4;
2987		case FORMAT_A32B32G32R32UI: return 4;
2988		case FORMAT_V8U8:           return 2;
2989		case FORMAT_Q8W8V8U8:       return 4;
2990		case FORMAT_X8L8V8U8:       return 3;
2991		case FORMAT_V16U16:         return 2;
2992		case FORMAT_A16W16V16U16:   return 4;
2993		case FORMAT_Q16W16V16U16:   return 4;
2994		case FORMAT_R32F:           return 1;
2995		case FORMAT_G32R32F:        return 2;
2996		case FORMAT_X32B32G32R32F:  return 3;
2997		case FORMAT_A32B32G32R32F:  return 4;
2998		case FORMAT_D32F:           return 1;
2999		case FORMAT_D32F_LOCKABLE:  return 1;
3000		case FORMAT_D32FS8_TEXTURE: return 1;
3001		case FORMAT_D32FS8_SHADOW:  return 1;
3002		case FORMAT_A8:             return 1;
3003		case FORMAT_R8I:            return 1;
3004		case FORMAT_R8:             return 1;
3005		case FORMAT_R16I:           return 1;
3006		case FORMAT_R16UI:          return 1;
3007		case FORMAT_R32I:           return 1;
3008		case FORMAT_R32UI:          return 1;
3009		case FORMAT_L8:             return 1;
3010		case FORMAT_L16:            return 1;
3011		case FORMAT_A8L8:           return 2;
3012		case FORMAT_YV12_BT601:     return 3;
3013		case FORMAT_YV12_BT709:     return 3;
3014		case FORMAT_YV12_JFIF:      return 3;
3015		default:
3016			ASSERT(false);
3017		}
3018
3019		return 1;
3020	}
3021
3022	void *Surface::allocateBuffer(int width, int height, int depth, Format format)
3023	{
3024		// Render targets require 2x2 quads
3025		int width2 = (width + 1) & ~1;
3026		int height2 = (height + 1) & ~1;
3027
3028		// FIXME: Unpacking byte4 to short4 in the sampler currently involves reading 8 bytes,
3029		// so we have to allocate 4 extra bytes to avoid buffer overruns.
3030		return allocateZero(size(width2, height2, depth, format) + 4);
3031	}
3032
3033	void Surface::memfill4(void *buffer, int pattern, int bytes)
3034	{
3035		while((size_t)buffer & 0x1 && bytes >= 1)
3036		{
3037			*(char*)buffer = (char)pattern;
3038			(char*&)buffer += 1;
3039			bytes -= 1;
3040		}
3041
3042		while((size_t)buffer & 0x3 && bytes >= 2)
3043		{
3044			*(short*)buffer = (short)pattern;
3045			(short*&)buffer += 1;
3046			bytes -= 2;
3047		}
3048
3049		if(CPUID::supportsSSE())
3050		{
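			// Align the destination to 16 bytes, then fill 64 bytes per iteration using
			// non-temporal streaming stores.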
3051			while((size_t)buffer & 0xF && bytes >= 4)
3052			{
3053				*(int*)buffer = pattern;
3054				(int*&)buffer += 1;
3055				bytes -= 4;
3056			}
3057
3058			__m128 quad = _mm_set_ps1((float&)pattern);
3059
3060			float *pointer = (float*)buffer;
3061			int qxwords = bytes / 64;
3062			bytes -= qxwords * 64;
3063
3064			while(qxwords--)
3065			{
3066				_mm_stream_ps(pointer + 0, quad);
3067				_mm_stream_ps(pointer + 4, quad);
3068				_mm_stream_ps(pointer + 8, quad);
3069				_mm_stream_ps(pointer + 12, quad);
3070
3071				pointer += 16;
3072			}
3073
3074			buffer = pointer;
3075		}
3076
3077		while(bytes >= 4)
3078		{
3079			*(int*)buffer = (int)pattern;
3080			(int*&)buffer += 1;
3081			bytes -= 4;
3082		}
3083
3084		while(bytes >= 2)
3085		{
3086			*(short*)buffer = (short)pattern;
3087			(short*&)buffer += 1;
3088			bytes -= 2;
3089		}
3090
3091		while(bytes >= 1)
3092		{
3093			*(char*)buffer = (char)pattern;
3094			(char*&)buffer += 1;
3095			bytes -= 1;
3096		}
3097	}
3098
3099	bool Surface::isEntire(const SliceRect& rect) const
3100	{
3101		return (rect.x0 == 0 && rect.y0 == 0 && rect.x1 == internal.width && rect.y1 == internal.height && internal.depth == 1);
3102	}
3103
3104	SliceRect Surface::getRect() const
3105	{
3106		return SliceRect(0, 0, internal.width, internal.height, 0);
3107	}
3108
3109	void Surface::clearDepth(float depth, int x0, int y0, int width, int height)
3110	{
3111		if(width == 0 || height == 0) return;
3112
3113		// Not overlapping
3114		if(x0 > internal.width) return;
3115		if(y0 > internal.height) return;
3116		if(x0 + width < 0) return;
3117		if(y0 + height < 0) return;
3118
3119		// Clip against dimensions
3120		if(x0 < 0) {width += x0; x0 = 0;}
3121		if(x0 + width > internal.width) width = internal.width - x0;
3122		if(y0 < 0) {height += y0; y0 = 0;}
3123		if(y0 + height > internal.height) height = internal.height - y0;
3124
3125		const bool entire = x0 == 0 && y0 == 0 && width == internal.width && height == internal.height;
3126		const Lock lock = entire ? LOCK_DISCARD : LOCK_WRITEONLY;
3127
3128		int width2 = (internal.width + 1) & ~1;
3129
3130		int x1 = x0 + width;
3131		int y1 = y0 + height;
3132
3133		if(internal.format == FORMAT_D32F_LOCKABLE ||
3134		   internal.format == FORMAT_D32FS8_TEXTURE ||
3135		   internal.format == FORMAT_D32FS8_SHADOW)
3136		{
3137			float *target = (float*)lockInternal(0, 0, 0, lock, PUBLIC) + x0 + width2 * y0;
3138
3139			for(int z = 0; z < internal.depth; z++)
3140			{
3141				for(int y = y0; y < y1; y++)
3142				{
3143					memfill4(target, (int&)depth, 4 * width);
3144					target += width2;
3145				}
3146			}
3147
3148			unlockInternal();
3149		}
3150		else   // Quad layout
3151		{
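			// Quad layout stores each 2x2 block of pixels in four consecutive elements, so
			// pixel (x, y) maps to element (y & ~1) * width2 + (x & ~1) * 2 + (y & 1) * 2 + (x & 1).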
3152			if(complementaryDepthBuffer)
3153			{
3154				depth = 1 - depth;
3155			}
3156
3157			float *buffer = (float*)lockInternal(0, 0, 0, lock, PUBLIC);
3158
3159			int oddX0 = (x0 & ~1) * 2 + (x0 & 1);
3160			int oddX1 = (x1 & ~1) * 2;
3161			int evenX0 = ((x0 + 1) & ~1) * 2;
3162			int evenBytes = (oddX1 - evenX0) * sizeof(float);
3163
3164			for(int z = 0; z < internal.depth; z++)
3165			{
3166				for(int y = y0; y < y1; y++)
3167				{
3168					float *target = buffer + (y & ~1) * width2 + (y & 1) * 2;
3169
3170					if((y & 1) == 0 && y + 1 < y1)   // Fill quad line at once
3171					{
3172						if((x0 & 1) != 0)
3173						{
3174							target[oddX0 + 0] = depth;
3175							target[oddX0 + 2] = depth;
3176						}
3177
3178					//	for(int x2 = evenX0; x2 < x1 * 2; x2 += 4)
3179					//	{
3180					//		target[x2 + 0] = depth;
3181					//		target[x2 + 1] = depth;
3182					//		target[x2 + 2] = depth;
3183					//		target[x2 + 3] = depth;
3184					//	}
3185
3186					//	__asm
3187					//	{
3188					//		movss xmm0, depth
3189					//		shufps xmm0, xmm0, 0x00
3190					//
3191					//		mov eax, x0
3192					//		add eax, 1
3193					//		and eax, 0xFFFFFFFE
3194					//		cmp eax, x1
3195					//		jge qEnd
3196					//
3197					//		mov edi, target
3198					//
3199					//	qLoop:
3200					//		movntps [edi+8*eax], xmm0
3201					//
3202					//		add eax, 2
3203					//		cmp eax, x1
3204					//		jl qLoop
3205					//	qEnd:
3206					//	}
3207
3208						memfill4(&target[evenX0], (int&)depth, evenBytes);
3209
3210						if((x1 & 1) != 0)
3211						{
3212							target[oddX1 + 0] = depth;
3213							target[oddX1 + 2] = depth;
3214						}
3215
3216						y++;
3217					}
3218					else
3219					{
3220						for(int x = x0, i = oddX0; x < x1; x++, i = (x & ~1) * 2 + (x & 1))
3221						{
3222							target[i] = depth;
3223						}
3224					}
3225				}
3226
3227				buffer += internal.sliceP;
3228			}
3229
3230			unlockInternal();
3231		}
3232	}
3233
3234	void Surface::clearStencil(unsigned char s, unsigned char mask, int x0, int y0, int width, int height)
3235	{
3236		if(mask == 0 || width == 0 || height == 0) return;
3237
3238		// Not overlapping
3239		if(x0 > internal.width) return;
3240		if(y0 > internal.height) return;
3241		if(x0 + width < 0) return;
3242		if(y0 + height < 0) return;
3243
3244		// Clip against dimensions
3245		if(x0 < 0) {width += x0; x0 = 0;}
3246		if(x0 + width > internal.width) width = internal.width - x0;
3247		if(y0 < 0) {height += y0; y0 = 0;}
3248		if(y0 + height > internal.height) height = internal.height - y0;
3249
3250		int width2 = (internal.width + 1) & ~1;
3251
3252		int x1 = x0 + width;
3253		int y1 = y0 + height;
3254
3255		int oddX0 = (x0 & ~1) * 2 + (x0 & 1);
3256		int oddX1 = (x1 & ~1) * 2;
3257		int evenX0 = ((x0 + 1) & ~1) * 2;
3258		int evenBytes = oddX1 - evenX0;
3259
3260		unsigned char maskedS = s & mask;
3261		unsigned char invMask = ~mask;
3262		unsigned int fill = maskedS;
3263		fill = fill | (fill << 8) | (fill << 16) | (fill << 24);
3264
3265		char *buffer = (char*)lockStencil(0, PUBLIC);
3266
3267		// Stencil buffers are assumed to use quad layout
3268		for(int z = 0; z < stencil.depth; z++)
3269		{
3270			for(int y = y0; y < y1; y++)
3271			{
3272				char *target = buffer + (y & ~1) * width2 + (y & 1) * 2;
3273
3274				if((y & 1) == 0 && y + 1 < y1 && mask == 0xFF)   // Fill quad line at once
3275				{
3276					if((x0 & 1) != 0)
3277					{
3278						target[oddX0 + 0] = fill;
3279						target[oddX0 + 2] = fill;
3280					}
3281
3282					memfill4(&target[evenX0], fill, evenBytes);
3283
3284					if((x1 & 1) != 0)
3285					{
3286						target[oddX1 + 0] = fill;
3287						target[oddX1 + 2] = fill;
3288					}
3289
3290					y++;
3291				}
3292				else
3293				{
3294					for(int x = x0, i = oddX0; x < x1; x++, i = (x & ~1) * 2 + (x & 1))
3295					{
3296						target[i] = maskedS | (target[i] & invMask);
3297					}
3298				}
3299			}
3300
3301			buffer += stencil.sliceP;
3302		}
3303
3304		unlockStencil();
3305	}
3306
3307	void Surface::fill(const Color<float> &color, int x0, int y0, int width, int height)
3308	{
3309		unsigned char *row;
3310		Buffer *buffer;
3311
3312		if(internal.dirty)
3313		{
3314			row = (unsigned char*)lockInternal(x0, y0, 0, LOCK_WRITEONLY, PUBLIC);
3315			buffer = &internal;
3316		}
3317		else
3318		{
3319			row = (unsigned char*)lockExternal(x0, y0, 0, LOCK_WRITEONLY, PUBLIC);
3320			buffer = &external;
3321		}
3322
3323		if(buffer->bytes <= 4)
3324		{
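			// Formats of at most 4 bytes per pixel: encode the color once, replicate it into a
			// 32-bit pattern and fill whole rows with memfill4().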
3325			int c;
3326			buffer->write(&c, color);
3327
3328			if(buffer->bytes <= 1) c = (c << 8)  | c;
3329			if(buffer->bytes <= 2) c = (c << 16) | c;
3330
3331			for(int y = 0; y < height; y++)
3332			{
3333				memfill4(row, c, width * buffer->bytes);
3334
3335				row += buffer->pitchB;
3336			}
3337		}
3338		else   // Generic
3339		{
3340			for(int y = 0; y < height; y++)
3341			{
3342				unsigned char *element = row;
3343
3344				for(int x = 0; x < width; x++)
3345				{
3346					buffer->write(element, color);
3347
3348					element += buffer->bytes;
3349				}
3350
3351				row += buffer->pitchB;
3352			}
3353		}
3354
3355		if(buffer == &internal)
3356		{
3357			unlockInternal();
3358		}
3359		else
3360		{
3361			unlockExternal();
3362		}
3363	}
3364
3365	void Surface::copyInternal(const Surface* source, int x, int y, float srcX, float srcY, bool filter)
3366	{
3367		ASSERT(internal.lock != LOCK_UNLOCKED && source && source->internal.lock != LOCK_UNLOCKED);
3368
3369		sw::Color<float> color;
3370
3371		if(!filter)
3372		{
3373			color = source->internal.read((int)srcX, (int)srcY);
3374		}
3375		else   // Bilinear filtering
3376		{
3377			color = source->internal.sample(srcX, srcY);
3378		}
3379
3380		internal.write(x, y, color);
3381	}
3382
3383	void Surface::copyInternal(const Surface* source, int x, int y, int z, float srcX, float srcY, float srcZ, bool filter)
3384	{
3385		ASSERT(internal.lock != LOCK_UNLOCKED && source && source->internal.lock != LOCK_UNLOCKED);
3386
3387		sw::Color<float> color;
3388
3389		if(!filter)
3390		{
3391			color = source->internal.read((int)srcX, (int)srcY, (int)srcZ);
3392		}
3393		else   // Bilinear filtering
3394		{
3395			color = source->internal.sample(srcX, srcY, srcZ);
3396		}
3397
3398		internal.write(x, y, z, color);
3399	}
3400
3401	bool Surface::hasStencil() const
3402	{
3403		return isStencil(external.format);
3404	}
3405
3406	bool Surface::hasDepth() const
3407	{
3408		return isDepth(external.format);
3409	}
3410
3411	bool Surface::hasPalette() const
3412	{
3413		return isPalette(external.format);
3414	}
3415
3416	bool Surface::isRenderTarget() const
3417	{
3418		return renderTarget;
3419	}
3420
3421	bool Surface::hasDirtyMipmaps() const
3422	{
3423		return dirtyMipmaps;
3424	}
3425
3426	void Surface::cleanMipmaps()
3427	{
3428		dirtyMipmaps = false;
3429	}
3430
3431	Resource *Surface::getResource()
3432	{
3433		return resource;
3434	}
3435
3436	bool Surface::identicalFormats() const
3437	{
3438		return external.format == internal.format &&
3439		       external.width  == internal.width &&
3440		       external.height == internal.height &&
3441		       external.depth  == internal.depth &&
3442		       external.pitchB == internal.pitchB &&
3443		       external.sliceB == internal.sliceB;
3444	}
3445
3446	Format Surface::selectInternalFormat(Format format) const
3447	{
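		// Map the external (API-visible) format to the format used for internal storage.
		// Packed, paletted and compressed formats are widened to a renderable equivalent;
		// formats with a direct internal representation map to themselves.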
3448		switch(format)
3449		{
3450		case FORMAT_NULL:
3451			return FORMAT_NULL;
3452		case FORMAT_P8:
3453		case FORMAT_A8P8:
3454		case FORMAT_A4R4G4B4:
3455		case FORMAT_A1R5G5B5:
3456		case FORMAT_A8R3G3B2:
3457			return FORMAT_A8R8G8B8;
3458		case FORMAT_A8:
3459			return FORMAT_A8;
3460		case FORMAT_R8I:
3461			return FORMAT_R8I;
3462		case FORMAT_R8UI:
3463			return FORMAT_R8UI;
3464		case FORMAT_R8I_SNORM:
3465			return FORMAT_R8I_SNORM;
3466		case FORMAT_R8:
3467			return FORMAT_R8;
3468		case FORMAT_R16I:
3469			return FORMAT_R16I;
3470		case FORMAT_R16UI:
3471			return FORMAT_R16UI;
3472		case FORMAT_R32I:
3473			return FORMAT_R32I;
3474		case FORMAT_R32UI:
3475			return FORMAT_R32UI;
3476		case FORMAT_X16B16G16R16I:
3477		case FORMAT_A16B16G16R16I:
3478			return FORMAT_A16B16G16R16I;
3479		case FORMAT_X16B16G16R16UI:
3480		case FORMAT_A16B16G16R16UI:
3481			return FORMAT_A16B16G16R16UI;
3482		case FORMAT_A2R10G10B10:
3483		case FORMAT_A2B10G10R10:
3484		case FORMAT_A16B16G16R16:
3485			return FORMAT_A16B16G16R16;
3486		case FORMAT_X32B32G32R32I:
3487		case FORMAT_A32B32G32R32I:
3488			return FORMAT_A32B32G32R32I;
3489		case FORMAT_X32B32G32R32UI:
3490		case FORMAT_A32B32G32R32UI:
3491			return FORMAT_A32B32G32R32UI;
3492		case FORMAT_G8R8I:
3493			return FORMAT_G8R8I;
3494		case FORMAT_G8R8UI:
3495			return FORMAT_G8R8UI;
3496		case FORMAT_G8R8I_SNORM:
3497			return FORMAT_G8R8I_SNORM;
3498		case FORMAT_G8R8:
3499			return FORMAT_G8R8;
3500		case FORMAT_G16R16I:
3501			return FORMAT_G16R16I;
3502		case FORMAT_G16R16UI:
3503			return FORMAT_G16R16UI;
3504		case FORMAT_G16R16:
3505			return FORMAT_G16R16;
3506		case FORMAT_G32R32I:
3507			return FORMAT_G32R32I;
3508		case FORMAT_G32R32UI:
3509			return FORMAT_G32R32UI;
3510		case FORMAT_A8R8G8B8:
3511			if(lockable || !quadLayoutEnabled)
3512			{
3513				return FORMAT_A8R8G8B8;
3514			}
3515			else
3516			{
3517				return FORMAT_A8G8R8B8Q;
3518			}
3519		case FORMAT_A8B8G8R8I:
3520			return FORMAT_A8B8G8R8I;
3521		case FORMAT_A8B8G8R8UI:
3522			return FORMAT_A8B8G8R8UI;
3523		case FORMAT_A8B8G8R8I_SNORM:
3524			return FORMAT_A8B8G8R8I_SNORM;
3525		case FORMAT_R5G5B5A1:
3526		case FORMAT_R4G4B4A4:
3527		case FORMAT_A8B8G8R8:
3528			return FORMAT_A8B8G8R8;
3529		case FORMAT_R5G6B5:
3530			return FORMAT_R5G6B5;
3531		case FORMAT_R3G3B2:
3532		case FORMAT_R8G8B8:
3533		case FORMAT_X4R4G4B4:
3534		case FORMAT_X1R5G5B5:
3535		case FORMAT_X8R8G8B8:
3536			if(lockable || !quadLayoutEnabled)
3537			{
3538				return FORMAT_X8R8G8B8;
3539			}
3540			else
3541			{
3542				return FORMAT_X8G8R8B8Q;
3543			}
3544		case FORMAT_X8B8G8R8I:
3545			return FORMAT_X8B8G8R8I;
3546		case FORMAT_X8B8G8R8UI:
3547			return FORMAT_X8B8G8R8UI;
3548		case FORMAT_X8B8G8R8I_SNORM:
3549			return FORMAT_X8B8G8R8I_SNORM;
3550		case FORMAT_B8G8R8:
3551		case FORMAT_X8B8G8R8:
3552			return FORMAT_X8B8G8R8;
3553		case FORMAT_SRGB8_X8:
3554			return FORMAT_SRGB8_X8;
3555		case FORMAT_SRGB8_A8:
3556			return FORMAT_SRGB8_A8;
3557		// Compressed formats
3558		#if S3TC_SUPPORT
3559		case FORMAT_DXT1:
3560		case FORMAT_DXT3:
3561		case FORMAT_DXT5:
3562		#endif
3563		case FORMAT_RGB8_PUNCHTHROUGH_ALPHA1_ETC2:
3564		case FORMAT_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2:
3565		case FORMAT_RGBA8_ETC2_EAC:
3566		case FORMAT_SRGB8_ALPHA8_ETC2_EAC:
3567		case FORMAT_SRGB8_ALPHA8_ASTC_4x4_KHR:
3568		case FORMAT_SRGB8_ALPHA8_ASTC_5x4_KHR:
3569		case FORMAT_SRGB8_ALPHA8_ASTC_5x5_KHR:
3570		case FORMAT_SRGB8_ALPHA8_ASTC_6x5_KHR:
3571		case FORMAT_SRGB8_ALPHA8_ASTC_6x6_KHR:
3572		case FORMAT_SRGB8_ALPHA8_ASTC_8x5_KHR:
3573		case FORMAT_SRGB8_ALPHA8_ASTC_8x6_KHR:
3574		case FORMAT_SRGB8_ALPHA8_ASTC_8x8_KHR:
3575		case FORMAT_SRGB8_ALPHA8_ASTC_10x5_KHR:
3576		case FORMAT_SRGB8_ALPHA8_ASTC_10x6_KHR:
3577		case FORMAT_SRGB8_ALPHA8_ASTC_10x8_KHR:
3578		case FORMAT_SRGB8_ALPHA8_ASTC_10x10_KHR:
3579		case FORMAT_SRGB8_ALPHA8_ASTC_12x10_KHR:
3580		case FORMAT_SRGB8_ALPHA8_ASTC_12x12_KHR:
3581			return FORMAT_A8R8G8B8;
3582		case FORMAT_RGBA_ASTC_4x4_KHR:
3583		case FORMAT_RGBA_ASTC_5x4_KHR:
3584		case FORMAT_RGBA_ASTC_5x5_KHR:
3585		case FORMAT_RGBA_ASTC_6x5_KHR:
3586		case FORMAT_RGBA_ASTC_6x6_KHR:
3587		case FORMAT_RGBA_ASTC_8x5_KHR:
3588		case FORMAT_RGBA_ASTC_8x6_KHR:
3589		case FORMAT_RGBA_ASTC_8x8_KHR:
3590		case FORMAT_RGBA_ASTC_10x5_KHR:
3591		case FORMAT_RGBA_ASTC_10x6_KHR:
3592		case FORMAT_RGBA_ASTC_10x8_KHR:
3593		case FORMAT_RGBA_ASTC_10x10_KHR:
3594		case FORMAT_RGBA_ASTC_12x10_KHR:
3595		case FORMAT_RGBA_ASTC_12x12_KHR:
3596			// ASTC supports HDR, so a floating point format is required to represent it properly
3597			return FORMAT_A32B32G32R32F; // FIXME: 16FP is probably sufficient, but it's currently unsupported
3598		case FORMAT_ATI1:
3599		case FORMAT_R11_EAC:
3600			return FORMAT_R8;
3601		case FORMAT_SIGNED_R11_EAC:
3602			return FORMAT_R32F; // FIXME: A signed 8-bit format would be sufficient
3603		case FORMAT_ATI2:
3604		case FORMAT_RG11_EAC:
3605			return FORMAT_G8R8;
3606		case FORMAT_SIGNED_RG11_EAC:
3607			return FORMAT_G32R32F; // FIXME: A signed 8-bit format would be sufficient
3608		case FORMAT_ETC1:
3609		case FORMAT_RGB8_ETC2:
3610		case FORMAT_SRGB8_ETC2:
3611			return FORMAT_X8R8G8B8;
3612		// Bumpmap formats
3613		case FORMAT_V8U8:			return FORMAT_V8U8;
3614		case FORMAT_L6V5U5:			return FORMAT_X8L8V8U8;
3615		case FORMAT_Q8W8V8U8:		return FORMAT_Q8W8V8U8;
3616		case FORMAT_X8L8V8U8:		return FORMAT_X8L8V8U8;
3617		case FORMAT_V16U16:			return FORMAT_V16U16;
3618		case FORMAT_A2W10V10U10:	return FORMAT_A16W16V16U16;
3619		case FORMAT_Q16W16V16U16:	return FORMAT_Q16W16V16U16;
3620		// Floating-point formats
3621		case FORMAT_A16F:			return FORMAT_A32B32G32R32F;
3622		case FORMAT_R16F:			return FORMAT_R32F;
3623		case FORMAT_G16R16F:		return FORMAT_G32R32F;
3624		case FORMAT_B16G16R16F:     return FORMAT_X32B32G32R32F;
3625		case FORMAT_A16B16G16R16F:	return FORMAT_A32B32G32R32F;
3626		case FORMAT_A32F:			return FORMAT_A32B32G32R32F;
3627		case FORMAT_R32F:			return FORMAT_R32F;
3628		case FORMAT_G32R32F:		return FORMAT_G32R32F;
3629		case FORMAT_B32G32R32F:     return FORMAT_X32B32G32R32F;
3630		case FORMAT_X32B32G32R32F:  return FORMAT_X32B32G32R32F;
3631		case FORMAT_A32B32G32R32F:	return FORMAT_A32B32G32R32F;
3632		// Luminance formats
3633		case FORMAT_L8:				return FORMAT_L8;
3634		case FORMAT_A4L4:			return FORMAT_A8L8;
3635		case FORMAT_L16:			return FORMAT_L16;
3636		case FORMAT_A8L8:			return FORMAT_A8L8;
3637		case FORMAT_L16F:           return FORMAT_X32B32G32R32F;
3638		case FORMAT_A16L16F:        return FORMAT_A32B32G32R32F;
3639		case FORMAT_L32F:           return FORMAT_X32B32G32R32F;
3640		case FORMAT_A32L32F:        return FORMAT_A32B32G32R32F;
3641		// Depth/stencil formats
3642		case FORMAT_D16:
3643		case FORMAT_D32:
3644		case FORMAT_D24X8:
3645		case FORMAT_D24S8:
3646		case FORMAT_D24FS8:
3647			if(hasParent)   // Texture
3648			{
3649				return FORMAT_D32FS8_SHADOW;
3650			}
3651			else if(complementaryDepthBuffer)
3652			{
3653				return FORMAT_D32F_COMPLEMENTARY;
3654			}
3655			else
3656			{
3657				return FORMAT_D32F;
3658			}
3659		case FORMAT_D32F:           return FORMAT_D32F;
3660		case FORMAT_D32F_LOCKABLE:  return FORMAT_D32F_LOCKABLE;
3661		case FORMAT_D32FS8_TEXTURE: return FORMAT_D32FS8_TEXTURE;
3662		case FORMAT_INTZ:           return FORMAT_D32FS8_TEXTURE;
3663		case FORMAT_DF24S8:         return FORMAT_D32FS8_SHADOW;
3664		case FORMAT_DF16S8:         return FORMAT_D32FS8_SHADOW;
3665		case FORMAT_YV12_BT601:     return FORMAT_YV12_BT601;
3666		case FORMAT_YV12_BT709:     return FORMAT_YV12_BT709;
3667		case FORMAT_YV12_JFIF:      return FORMAT_YV12_JFIF;
3668		default:
3669			ASSERT(false);
3670		}
3671
3672		return FORMAT_NULL;
3673	}
3674
3675	void Surface::setTexturePalette(unsigned int *palette)
3676	{
3677		Surface::palette = palette;
3678		Surface::paletteID++;
3679	}
3680
3681	void Surface::resolve()
3682	{
3683		if(internal.depth <= 1 || !internal.dirty || !renderTarget || internal.format == FORMAT_NULL)
3684		{
3685			return;
3686		}
3687
3688		void *source = internal.lockRect(0, 0, 0, LOCK_READWRITE);
3689
3690		int quality = internal.depth;
3691		int width = internal.width;
3692		int height = internal.height;
3693		int pitch = internal.pitchB;
3694		int slice = internal.sliceB;
3695
3696		unsigned char *source0 = (unsigned char*)source;
3697		unsigned char *source1 = source0 + slice;
3698		unsigned char *source2 = source1 + slice;
3699		unsigned char *source3 = source2 + slice;
3700		unsigned char *source4 = source3 + slice;
3701		unsigned char *source5 = source4 + slice;
3702		unsigned char *source6 = source5 + slice;
3703		unsigned char *source7 = source6 + slice;
3704		unsigned char *source8 = source7 + slice;
3705		unsigned char *source9 = source8 + slice;
3706		unsigned char *sourceA = source9 + slice;
3707		unsigned char *sourceB = sourceA + slice;
3708		unsigned char *sourceC = sourceB + slice;
3709		unsigned char *sourceD = sourceC + slice;
3710		unsigned char *sourceE = sourceD + slice;
3711		unsigned char *sourceF = sourceE + slice;
3712
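		// Resolve the 2, 4, 8 or 16 multisample slices into slice 0 using a pairwise
		// reduction tree; SSE paths are used when a row is a whole number of 16-byte vectors.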
3713		if(internal.format == FORMAT_X8R8G8B8 || internal.format == FORMAT_A8R8G8B8 ||
3714		   internal.format == FORMAT_X8B8G8R8 || internal.format == FORMAT_A8B8G8R8 ||
3715		   internal.format == FORMAT_SRGB8_X8 || internal.format == FORMAT_SRGB8_A8)
3716		{
3717			if(CPUID::supportsSSE2() && (width % 4) == 0)
3718			{
3719				if(internal.depth == 2)
3720				{
3721					for(int y = 0; y < height; y++)
3722					{
3723						for(int x = 0; x < width; x += 4)
3724						{
3725							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
3726							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
3727
3728							c0 = _mm_avg_epu8(c0, c1);
3729
3730							_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
3731						}
3732
3733						source0 += pitch;
3734						source1 += pitch;
3735					}
3736				}
3737				else if(internal.depth == 4)
3738				{
3739					for(int y = 0; y < height; y++)
3740					{
3741						for(int x = 0; x < width; x += 4)
3742						{
3743							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
3744							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
3745							__m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x));
3746							__m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x));
3747
3748							c0 = _mm_avg_epu8(c0, c1);
3749							c2 = _mm_avg_epu8(c2, c3);
3750							c0 = _mm_avg_epu8(c0, c2);
3751
3752							_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
3753						}
3754
3755						source0 += pitch;
3756						source1 += pitch;
3757						source2 += pitch;
3758						source3 += pitch;
3759					}
3760				}
3761				else if(internal.depth == 8)
3762				{
3763					for(int y = 0; y < height; y++)
3764					{
3765						for(int x = 0; x < width; x += 4)
3766						{
3767							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
3768							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
3769							__m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x));
3770							__m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x));
3771							__m128i c4 = _mm_load_si128((__m128i*)(source4 + 4 * x));
3772							__m128i c5 = _mm_load_si128((__m128i*)(source5 + 4 * x));
3773							__m128i c6 = _mm_load_si128((__m128i*)(source6 + 4 * x));
3774							__m128i c7 = _mm_load_si128((__m128i*)(source7 + 4 * x));
3775
3776							c0 = _mm_avg_epu8(c0, c1);
3777							c2 = _mm_avg_epu8(c2, c3);
3778							c4 = _mm_avg_epu8(c4, c5);
3779							c6 = _mm_avg_epu8(c6, c7);
3780							c0 = _mm_avg_epu8(c0, c2);
3781							c4 = _mm_avg_epu8(c4, c6);
3782							c0 = _mm_avg_epu8(c0, c4);
3783
3784							_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
3785						}
3786
3787						source0 += pitch;
3788						source1 += pitch;
3789						source2 += pitch;
3790						source3 += pitch;
3791						source4 += pitch;
3792						source5 += pitch;
3793						source6 += pitch;
3794						source7 += pitch;
3795					}
3796				}
3797				else if(internal.depth == 16)
3798				{
3799					for(int y = 0; y < height; y++)
3800					{
3801						for(int x = 0; x < width; x += 4)
3802						{
3803							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
3804							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
3805							__m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x));
3806							__m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x));
3807							__m128i c4 = _mm_load_si128((__m128i*)(source4 + 4 * x));
3808							__m128i c5 = _mm_load_si128((__m128i*)(source5 + 4 * x));
3809							__m128i c6 = _mm_load_si128((__m128i*)(source6 + 4 * x));
3810							__m128i c7 = _mm_load_si128((__m128i*)(source7 + 4 * x));
3811							__m128i c8 = _mm_load_si128((__m128i*)(source8 + 4 * x));
3812							__m128i c9 = _mm_load_si128((__m128i*)(source9 + 4 * x));
3813							__m128i cA = _mm_load_si128((__m128i*)(sourceA + 4 * x));
3814							__m128i cB = _mm_load_si128((__m128i*)(sourceB + 4 * x));
3815							__m128i cC = _mm_load_si128((__m128i*)(sourceC + 4 * x));
3816							__m128i cD = _mm_load_si128((__m128i*)(sourceD + 4 * x));
3817							__m128i cE = _mm_load_si128((__m128i*)(sourceE + 4 * x));
3818							__m128i cF = _mm_load_si128((__m128i*)(sourceF + 4 * x));
3819
3820							c0 = _mm_avg_epu8(c0, c1);
3821							c2 = _mm_avg_epu8(c2, c3);
3822							c4 = _mm_avg_epu8(c4, c5);
3823							c6 = _mm_avg_epu8(c6, c7);
3824							c8 = _mm_avg_epu8(c8, c9);
3825							cA = _mm_avg_epu8(cA, cB);
3826							cC = _mm_avg_epu8(cC, cD);
3827							cE = _mm_avg_epu8(cE, cF);
3828							c0 = _mm_avg_epu8(c0, c2);
3829							c4 = _mm_avg_epu8(c4, c6);
3830							c8 = _mm_avg_epu8(c8, cA);
3831							cC = _mm_avg_epu8(cC, cE);
3832							c0 = _mm_avg_epu8(c0, c4);
3833							c8 = _mm_avg_epu8(c8, cC);
3834							c0 = _mm_avg_epu8(c0, c8);
3835
3836							_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
3837						}
3838
3839						source0 += pitch;
3840						source1 += pitch;
3841						source2 += pitch;
3842						source3 += pitch;
3843						source4 += pitch;
3844						source5 += pitch;
3845						source6 += pitch;
3846						source7 += pitch;
3847						source8 += pitch;
3848						source9 += pitch;
3849						sourceA += pitch;
3850						sourceB += pitch;
3851						sourceC += pitch;
3852						sourceD += pitch;
3853						sourceE += pitch;
3854						sourceF += pitch;
3855					}
3856				}
3857				else ASSERT(false);
3858			}
3859			else
3860			{
3861				#define AVERAGE(x, y) (((x) & (y)) + ((((x) ^ (y)) >> 1) & 0x7F7F7F7F) + (((x) ^ (y)) & 0x01010101))
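				// Scalar fallback: AVERAGE computes a per-byte rounded average of two packed
				// 32-bit pixels without carries between bytes (same rounding as _mm_avg_epu8).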
3862
3863				if(internal.depth == 2)
3864				{
3865					for(int y = 0; y < height; y++)
3866					{
3867						for(int x = 0; x < width; x++)
3868						{
3869							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
3870							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
3871
3872							c0 = AVERAGE(c0, c1);
3873
3874							*(unsigned int*)(source0 + 4 * x) = c0;
3875						}
3876
3877						source0 += pitch;
3878						source1 += pitch;
3879					}
3880				}
3881				else if(internal.depth == 4)
3882				{
3883					for(int y = 0; y < height; y++)
3884					{
3885						for(int x = 0; x < width; x++)
3886						{
3887							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
3888							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
3889							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
3890							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
3891
3892							c0 = AVERAGE(c0, c1);
3893							c2 = AVERAGE(c2, c3);
3894							c0 = AVERAGE(c0, c2);
3895
3896							*(unsigned int*)(source0 + 4 * x) = c0;
3897						}
3898
3899						source0 += pitch;
3900						source1 += pitch;
3901						source2 += pitch;
3902						source3 += pitch;
3903					}
3904				}
3905				else if(internal.depth == 8)
3906				{
3907					for(int y = 0; y < height; y++)
3908					{
3909						for(int x = 0; x < width; x++)
3910						{
3911							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
3912							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
3913							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
3914							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
3915							unsigned int c4 = *(unsigned int*)(source4 + 4 * x);
3916							unsigned int c5 = *(unsigned int*)(source5 + 4 * x);
3917							unsigned int c6 = *(unsigned int*)(source6 + 4 * x);
3918							unsigned int c7 = *(unsigned int*)(source7 + 4 * x);
3919
3920							c0 = AVERAGE(c0, c1);
3921							c2 = AVERAGE(c2, c3);
3922							c4 = AVERAGE(c4, c5);
3923							c6 = AVERAGE(c6, c7);
3924							c0 = AVERAGE(c0, c2);
3925							c4 = AVERAGE(c4, c6);
3926							c0 = AVERAGE(c0, c4);
3927
3928							*(unsigned int*)(source0 + 4 * x) = c0;
3929						}
3930
3931						source0 += pitch;
3932						source1 += pitch;
3933						source2 += pitch;
3934						source3 += pitch;
3935						source4 += pitch;
3936						source5 += pitch;
3937						source6 += pitch;
3938						source7 += pitch;
3939					}
3940				}
3941				else if(internal.depth == 16)
3942				{
3943					for(int y = 0; y < height; y++)
3944					{
3945						for(int x = 0; x < width; x++)
3946						{
3947							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
3948							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
3949							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
3950							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
3951							unsigned int c4 = *(unsigned int*)(source4 + 4 * x);
3952							unsigned int c5 = *(unsigned int*)(source5 + 4 * x);
3953							unsigned int c6 = *(unsigned int*)(source6 + 4 * x);
3954							unsigned int c7 = *(unsigned int*)(source7 + 4 * x);
3955							unsigned int c8 = *(unsigned int*)(source8 + 4 * x);
3956							unsigned int c9 = *(unsigned int*)(source9 + 4 * x);
3957							unsigned int cA = *(unsigned int*)(sourceA + 4 * x);
3958							unsigned int cB = *(unsigned int*)(sourceB + 4 * x);
3959							unsigned int cC = *(unsigned int*)(sourceC + 4 * x);
3960							unsigned int cD = *(unsigned int*)(sourceD + 4 * x);
3961							unsigned int cE = *(unsigned int*)(sourceE + 4 * x);
3962							unsigned int cF = *(unsigned int*)(sourceF + 4 * x);
3963
3964							c0 = AVERAGE(c0, c1);
3965							c2 = AVERAGE(c2, c3);
3966							c4 = AVERAGE(c4, c5);
3967							c6 = AVERAGE(c6, c7);
3968							c8 = AVERAGE(c8, c9);
3969							cA = AVERAGE(cA, cB);
3970							cC = AVERAGE(cC, cD);
3971							cE = AVERAGE(cE, cF);
3972							c0 = AVERAGE(c0, c2);
3973							c4 = AVERAGE(c4, c6);
3974							c8 = AVERAGE(c8, cA);
3975							cC = AVERAGE(cC, cE);
3976							c0 = AVERAGE(c0, c4);
3977							c8 = AVERAGE(c8, cC);
3978							c0 = AVERAGE(c0, c8);
3979
3980							*(unsigned int*)(source0 + 4 * x) = c0;
3981						}
3982
3983						source0 += pitch;
3984						source1 += pitch;
3985						source2 += pitch;
3986						source3 += pitch;
3987						source4 += pitch;
3988						source5 += pitch;
3989						source6 += pitch;
3990						source7 += pitch;
3991						source8 += pitch;
3992						source9 += pitch;
3993						sourceA += pitch;
3994						sourceB += pitch;
3995						sourceC += pitch;
3996						sourceD += pitch;
3997						sourceE += pitch;
3998						sourceF += pitch;
3999					}
4000				}
4001				else ASSERT(false);
4002
4003				#undef AVERAGE
4004			}
4005		}
4006		else if(internal.format == FORMAT_G16R16)
4007		{
4008			if(CPUID::supportsSSE2() && (width % 4) == 0)
4009			{
4010				if(internal.depth == 2)
4011				{
4012					for(int y = 0; y < height; y++)
4013					{
4014						for(int x = 0; x < width; x += 4)
4015						{
4016							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
4017							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
4018
4019							c0 = _mm_avg_epu16(c0, c1);
4020
4021							_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
4022						}
4023
4024						source0 += pitch;
4025						source1 += pitch;
4026					}
4027				}
4028				else if(internal.depth == 4)
4029				{
4030					for(int y = 0; y < height; y++)
4031					{
4032						for(int x = 0; x < width; x += 4)
4033						{
4034							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
4035							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
4036							__m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x));
4037							__m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x));
4038
4039							c0 = _mm_avg_epu16(c0, c1);
4040							c2 = _mm_avg_epu16(c2, c3);
4041							c0 = _mm_avg_epu16(c0, c2);
4042
4043							_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
4044						}
4045
4046						source0 += pitch;
4047						source1 += pitch;
4048						source2 += pitch;
4049						source3 += pitch;
4050					}
4051				}
4052				else if(internal.depth == 8)
4053				{
4054					for(int y = 0; y < height; y++)
4055					{
4056						for(int x = 0; x < width; x += 4)
4057						{
4058							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
4059							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
4060							__m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x));
4061							__m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x));
4062							__m128i c4 = _mm_load_si128((__m128i*)(source4 + 4 * x));
4063							__m128i c5 = _mm_load_si128((__m128i*)(source5 + 4 * x));
4064							__m128i c6 = _mm_load_si128((__m128i*)(source6 + 4 * x));
4065							__m128i c7 = _mm_load_si128((__m128i*)(source7 + 4 * x));
4066
4067							c0 = _mm_avg_epu16(c0, c1);
4068							c2 = _mm_avg_epu16(c2, c3);
4069							c4 = _mm_avg_epu16(c4, c5);
4070							c6 = _mm_avg_epu16(c6, c7);
4071							c0 = _mm_avg_epu16(c0, c2);
4072							c4 = _mm_avg_epu16(c4, c6);
4073							c0 = _mm_avg_epu16(c0, c4);
4074
4075							_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
4076						}
4077
4078						source0 += pitch;
4079						source1 += pitch;
4080						source2 += pitch;
4081						source3 += pitch;
4082						source4 += pitch;
4083						source5 += pitch;
4084						source6 += pitch;
4085						source7 += pitch;
4086					}
4087				}
4088				else if(internal.depth == 16)
4089				{
4090					for(int y = 0; y < height; y++)
4091					{
4092						for(int x = 0; x < width; x += 4)
4093						{
4094							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
4095							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
4096							__m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x));
4097							__m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x));
4098							__m128i c4 = _mm_load_si128((__m128i*)(source4 + 4 * x));
4099							__m128i c5 = _mm_load_si128((__m128i*)(source5 + 4 * x));
4100							__m128i c6 = _mm_load_si128((__m128i*)(source6 + 4 * x));
4101							__m128i c7 = _mm_load_si128((__m128i*)(source7 + 4 * x));
4102							__m128i c8 = _mm_load_si128((__m128i*)(source8 + 4 * x));
4103							__m128i c9 = _mm_load_si128((__m128i*)(source9 + 4 * x));
4104							__m128i cA = _mm_load_si128((__m128i*)(sourceA + 4 * x));
4105							__m128i cB = _mm_load_si128((__m128i*)(sourceB + 4 * x));
4106							__m128i cC = _mm_load_si128((__m128i*)(sourceC + 4 * x));
4107							__m128i cD = _mm_load_si128((__m128i*)(sourceD + 4 * x));
4108							__m128i cE = _mm_load_si128((__m128i*)(sourceE + 4 * x));
4109							__m128i cF = _mm_load_si128((__m128i*)(sourceF + 4 * x));
4110
4111							c0 = _mm_avg_epu16(c0, c1);
4112							c2 = _mm_avg_epu16(c2, c3);
4113							c4 = _mm_avg_epu16(c4, c5);
4114							c6 = _mm_avg_epu16(c6, c7);
4115							c8 = _mm_avg_epu16(c8, c9);
4116							cA = _mm_avg_epu16(cA, cB);
4117							cC = _mm_avg_epu16(cC, cD);
4118							cE = _mm_avg_epu16(cE, cF);
4119							c0 = _mm_avg_epu16(c0, c2);
4120							c4 = _mm_avg_epu16(c4, c6);
4121							c8 = _mm_avg_epu16(c8, cA);
4122							cC = _mm_avg_epu16(cC, cE);
4123							c0 = _mm_avg_epu16(c0, c4);
4124							c8 = _mm_avg_epu16(c8, cC);
4125							c0 = _mm_avg_epu16(c0, c8);
4126
4127							_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
4128						}
4129
4130						source0 += pitch;
4131						source1 += pitch;
4132						source2 += pitch;
4133						source3 += pitch;
4134						source4 += pitch;
4135						source5 += pitch;
4136						source6 += pitch;
4137						source7 += pitch;
4138						source8 += pitch;
4139						source9 += pitch;
4140						sourceA += pitch;
4141						sourceB += pitch;
4142						sourceC += pitch;
4143						sourceD += pitch;
4144						sourceE += pitch;
4145						sourceF += pitch;
4146					}
4147				}
4148				else ASSERT(false);
4149			}
4150			else
4151			{
4152				#define AVERAGE(x, y) (((x) & (y)) + ((((x) ^ (y)) >> 1) & 0x7FFF7FFF) + (((x) ^ (y)) & 0x00010001))
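				// Scalar fallback: per-16-bit-channel rounded average of two packed pixels,
				// the counterpart of _mm_avg_epu16 used in the SSE2 path above.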
4153
4154				if(internal.depth == 2)
4155				{
4156					for(int y = 0; y < height; y++)
4157					{
4158						for(int x = 0; x < width; x++)
4159						{
4160							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
4161							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
4162
4163							c0 = AVERAGE(c0, c1);
4164
4165							*(unsigned int*)(source0 + 4 * x) = c0;
4166						}
4167
4168						source0 += pitch;
4169						source1 += pitch;
4170					}
4171				}
4172				else if(internal.depth == 4)
4173				{
4174					for(int y = 0; y < height; y++)
4175					{
4176						for(int x = 0; x < width; x++)
4177						{
4178							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
4179							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
4180							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
4181							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
4182
4183							c0 = AVERAGE(c0, c1);
4184							c2 = AVERAGE(c2, c3);
4185							c0 = AVERAGE(c0, c2);
4186
4187							*(unsigned int*)(source0 + 4 * x) = c0;
4188						}
4189
4190						source0 += pitch;
4191						source1 += pitch;
4192						source2 += pitch;
4193						source3 += pitch;
4194					}
4195				}
4196				else if(internal.depth == 8)
4197				{
4198					for(int y = 0; y < height; y++)
4199					{
4200						for(int x = 0; x < width; x++)
4201						{
4202							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
4203							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
4204							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
4205							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
4206							unsigned int c4 = *(unsigned int*)(source4 + 4 * x);
4207							unsigned int c5 = *(unsigned int*)(source5 + 4 * x);
4208							unsigned int c6 = *(unsigned int*)(source6 + 4 * x);
4209							unsigned int c7 = *(unsigned int*)(source7 + 4 * x);
4210
4211							c0 = AVERAGE(c0, c1);
4212							c2 = AVERAGE(c2, c3);
4213							c4 = AVERAGE(c4, c5);
4214							c6 = AVERAGE(c6, c7);
4215							c0 = AVERAGE(c0, c2);
4216							c4 = AVERAGE(c4, c6);
4217							c0 = AVERAGE(c0, c4);
4218
4219							*(unsigned int*)(source0 + 4 * x) = c0;
4220						}
4221
4222						source0 += pitch;
4223						source1 += pitch;
4224						source2 += pitch;
4225						source3 += pitch;
4226						source4 += pitch;
4227						source5 += pitch;
4228						source6 += pitch;
4229						source7 += pitch;
4230					}
4231				}
4232				else if(internal.depth == 16)
4233				{
4234					for(int y = 0; y < height; y++)
4235					{
4236						for(int x = 0; x < width; x++)
4237						{
4238							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
4239							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
4240							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
4241							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
4242							unsigned int c4 = *(unsigned int*)(source4 + 4 * x);
4243							unsigned int c5 = *(unsigned int*)(source5 + 4 * x);
4244							unsigned int c6 = *(unsigned int*)(source6 + 4 * x);
4245							unsigned int c7 = *(unsigned int*)(source7 + 4 * x);
4246							unsigned int c8 = *(unsigned int*)(source8 + 4 * x);
4247							unsigned int c9 = *(unsigned int*)(source9 + 4 * x);
4248							unsigned int cA = *(unsigned int*)(sourceA + 4 * x);
4249							unsigned int cB = *(unsigned int*)(sourceB + 4 * x);
4250							unsigned int cC = *(unsigned int*)(sourceC + 4 * x);
4251							unsigned int cD = *(unsigned int*)(sourceD + 4 * x);
4252							unsigned int cE = *(unsigned int*)(sourceE + 4 * x);
4253							unsigned int cF = *(unsigned int*)(sourceF + 4 * x);
4254
4255							c0 = AVERAGE(c0, c1);
4256							c2 = AVERAGE(c2, c3);
4257							c4 = AVERAGE(c4, c5);
4258							c6 = AVERAGE(c6, c7);
4259							c8 = AVERAGE(c8, c9);
4260							cA = AVERAGE(cA, cB);
4261							cC = AVERAGE(cC, cD);
4262							cE = AVERAGE(cE, cF);
4263							c0 = AVERAGE(c0, c2);
4264							c4 = AVERAGE(c4, c6);
4265							c8 = AVERAGE(c8, cA);
4266							cC = AVERAGE(cC, cE);
4267							c0 = AVERAGE(c0, c4);
4268							c8 = AVERAGE(c8, cC);
4269							c0 = AVERAGE(c0, c8);
4270
4271							*(unsigned int*)(source0 + 4 * x) = c0;
4272						}
4273
4274						source0 += pitch;
4275						source1 += pitch;
4276						source2 += pitch;
4277						source3 += pitch;
4278						source4 += pitch;
4279						source5 += pitch;
4280						source6 += pitch;
4281						source7 += pitch;
4282						source8 += pitch;
4283						source9 += pitch;
4284						sourceA += pitch;
4285						sourceB += pitch;
4286						sourceC += pitch;
4287						sourceD += pitch;
4288						sourceE += pitch;
4289						sourceF += pitch;
4290					}
4291				}
4292				else ASSERT(false);
4293
4294				#undef AVERAGE
4295			}
4296		}
4297		else if(internal.format == FORMAT_A16B16G16R16)
4298		{
4299			if(CPUID::supportsSSE2() && (width % 2) == 0)
4300			{
4301				if(internal.depth == 2)
4302				{
4303					for(int y = 0; y < height; y++)
4304					{
4305						for(int x = 0; x < width; x += 2)
4306						{
4307							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 8 * x));
4308							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 8 * x));
4309
4310							c0 = _mm_avg_epu16(c0, c1);
4311
4312							_mm_store_si128((__m128i*)(source0 + 8 * x), c0);
4313						}
4314
4315						source0 += pitch;
4316						source1 += pitch;
4317					}
4318				}
4319				else if(internal.depth == 4)
4320				{
4321					for(int y = 0; y < height; y++)
4322					{
4323						for(int x = 0; x < width; x += 2)
4324						{
4325							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 8 * x));
4326							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 8 * x));
4327							__m128i c2 = _mm_load_si128((__m128i*)(source2 + 8 * x));
4328							__m128i c3 = _mm_load_si128((__m128i*)(source3 + 8 * x));
4329
4330							c0 = _mm_avg_epu16(c0, c1);
4331							c2 = _mm_avg_epu16(c2, c3);
4332							c0 = _mm_avg_epu16(c0, c2);
4333
4334							_mm_store_si128((__m128i*)(source0 + 8 * x), c0);
4335						}
4336
4337						source0 += pitch;
4338						source1 += pitch;
4339						source2 += pitch;
4340						source3 += pitch;
4341					}
4342				}
4343				else if(internal.depth == 8)
4344				{
4345					for(int y = 0; y < height; y++)
4346					{
4347						for(int x = 0; x < width; x += 2)
4348						{
4349							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 8 * x));
4350							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 8 * x));
4351							__m128i c2 = _mm_load_si128((__m128i*)(source2 + 8 * x));
4352							__m128i c3 = _mm_load_si128((__m128i*)(source3 + 8 * x));
4353							__m128i c4 = _mm_load_si128((__m128i*)(source4 + 8 * x));
4354							__m128i c5 = _mm_load_si128((__m128i*)(source5 + 8 * x));
4355							__m128i c6 = _mm_load_si128((__m128i*)(source6 + 8 * x));
4356							__m128i c7 = _mm_load_si128((__m128i*)(source7 + 8 * x));
4357
4358							c0 = _mm_avg_epu16(c0, c1);
4359							c2 = _mm_avg_epu16(c2, c3);
4360							c4 = _mm_avg_epu16(c4, c5);
4361							c6 = _mm_avg_epu16(c6, c7);
4362							c0 = _mm_avg_epu16(c0, c2);
4363							c4 = _mm_avg_epu16(c4, c6);
4364							c0 = _mm_avg_epu16(c0, c4);
4365
4366							_mm_store_si128((__m128i*)(source0 + 8 * x), c0);
4367						}
4368
4369						source0 += pitch;
4370						source1 += pitch;
4371						source2 += pitch;
4372						source3 += pitch;
4373						source4 += pitch;
4374						source5 += pitch;
4375						source6 += pitch;
4376						source7 += pitch;
4377					}
4378				}
4379				else if(internal.depth == 16)
4380				{
4381					for(int y = 0; y < height; y++)
4382					{
4383						for(int x = 0; x < width; x += 2)
4384						{
4385							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 8 * x));
4386							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 8 * x));
4387							__m128i c2 = _mm_load_si128((__m128i*)(source2 + 8 * x));
4388							__m128i c3 = _mm_load_si128((__m128i*)(source3 + 8 * x));
4389							__m128i c4 = _mm_load_si128((__m128i*)(source4 + 8 * x));
4390							__m128i c5 = _mm_load_si128((__m128i*)(source5 + 8 * x));
4391							__m128i c6 = _mm_load_si128((__m128i*)(source6 + 8 * x));
4392							__m128i c7 = _mm_load_si128((__m128i*)(source7 + 8 * x));
4393							__m128i c8 = _mm_load_si128((__m128i*)(source8 + 8 * x));
4394							__m128i c9 = _mm_load_si128((__m128i*)(source9 + 8 * x));
4395							__m128i cA = _mm_load_si128((__m128i*)(sourceA + 8 * x));
4396							__m128i cB = _mm_load_si128((__m128i*)(sourceB + 8 * x));
4397							__m128i cC = _mm_load_si128((__m128i*)(sourceC + 8 * x));
4398							__m128i cD = _mm_load_si128((__m128i*)(sourceD + 8 * x));
4399							__m128i cE = _mm_load_si128((__m128i*)(sourceE + 8 * x));
4400							__m128i cF = _mm_load_si128((__m128i*)(sourceF + 8 * x));
4401
4402							c0 = _mm_avg_epu16(c0, c1);
4403							c2 = _mm_avg_epu16(c2, c3);
4404							c4 = _mm_avg_epu16(c4, c5);
4405							c6 = _mm_avg_epu16(c6, c7);
4406							c8 = _mm_avg_epu16(c8, c9);
4407							cA = _mm_avg_epu16(cA, cB);
4408							cC = _mm_avg_epu16(cC, cD);
4409							cE = _mm_avg_epu16(cE, cF);
4410							c0 = _mm_avg_epu16(c0, c2);
4411							c4 = _mm_avg_epu16(c4, c6);
4412							c8 = _mm_avg_epu16(c8, cA);
4413							cC = _mm_avg_epu16(cC, cE);
4414							c0 = _mm_avg_epu16(c0, c4);
4415							c8 = _mm_avg_epu16(c8, cC);
4416							c0 = _mm_avg_epu16(c0, c8);
4417
4418							_mm_store_si128((__m128i*)(source0 + 8 * x), c0);
4419						}
4420
4421						source0 += pitch;
4422						source1 += pitch;
4423						source2 += pitch;
4424						source3 += pitch;
4425						source4 += pitch;
4426						source5 += pitch;
4427						source6 += pitch;
4428						source7 += pitch;
4429						source8 += pitch;
4430						source9 += pitch;
4431						sourceA += pitch;
4432						sourceB += pitch;
4433						sourceC += pitch;
4434						sourceD += pitch;
4435						sourceE += pitch;
4436						sourceF += pitch;
4437					}
4438				}
4439				else ASSERT(false);
4440			}
4441			else
4442			{
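				// Same per-16-bit-channel average as above; each 8-byte pixel is processed
				// as two 32-bit words, hence the 2 * width loop bound below.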
4443				#define AVERAGE(x, y) (((x) & (y)) + ((((x) ^ (y)) >> 1) & 0x7FFF7FFF) + (((x) ^ (y)) & 0x00010001))
4444
4445				if(internal.depth == 2)
4446				{
4447					for(int y = 0; y < height; y++)
4448					{
4449						for(int x = 0; x < 2 * width; x++)
4450						{
4451							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
4452							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
4453
4454							c0 = AVERAGE(c0, c1);
4455
4456							*(unsigned int*)(source0 + 4 * x) = c0;
4457						}
4458
4459						source0 += pitch;
4460						source1 += pitch;
4461					}
4462				}
4463				else if(internal.depth == 4)
4464				{
4465					for(int y = 0; y < height; y++)
4466					{
4467						for(int x = 0; x < 2 * width; x++)
4468						{
4469							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
4470							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
4471							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
4472							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
4473
4474							c0 = AVERAGE(c0, c1);
4475							c2 = AVERAGE(c2, c3);
4476							c0 = AVERAGE(c0, c2);
4477
4478							*(unsigned int*)(source0 + 4 * x) = c0;
4479						}
4480
4481						source0 += pitch;
4482						source1 += pitch;
4483						source2 += pitch;
4484						source3 += pitch;
4485					}
4486				}
4487				else if(internal.depth == 8)
4488				{
4489					for(int y = 0; y < height; y++)
4490					{
4491						for(int x = 0; x < 2 * width; x++)
4492						{
4493							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
4494							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
4495							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
4496							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
4497							unsigned int c4 = *(unsigned int*)(source4 + 4 * x);
4498							unsigned int c5 = *(unsigned int*)(source5 + 4 * x);
4499							unsigned int c6 = *(unsigned int*)(source6 + 4 * x);
4500							unsigned int c7 = *(unsigned int*)(source7 + 4 * x);
4501
4502							c0 = AVERAGE(c0, c1);
4503							c2 = AVERAGE(c2, c3);
4504							c4 = AVERAGE(c4, c5);
4505							c6 = AVERAGE(c6, c7);
4506							c0 = AVERAGE(c0, c2);
4507							c4 = AVERAGE(c4, c6);
4508							c0 = AVERAGE(c0, c4);
4509
4510							*(unsigned int*)(source0 + 4 * x) = c0;
4511						}
4512
4513						source0 += pitch;
4514						source1 += pitch;
4515						source2 += pitch;
4516						source3 += pitch;
4517						source4 += pitch;
4518						source5 += pitch;
4519						source6 += pitch;
4520						source7 += pitch;
4521					}
4522				}
4523				else if(internal.depth == 16)
4524				{
4525					for(int y = 0; y < height; y++)
4526					{
4527						for(int x = 0; x < 2 * width; x++)
4528						{
4529							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
4530							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
4531							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
4532							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
4533							unsigned int c4 = *(unsigned int*)(source4 + 4 * x);
4534							unsigned int c5 = *(unsigned int*)(source5 + 4 * x);
4535							unsigned int c6 = *(unsigned int*)(source6 + 4 * x);
4536							unsigned int c7 = *(unsigned int*)(source7 + 4 * x);
4537							unsigned int c8 = *(unsigned int*)(source8 + 4 * x);
4538							unsigned int c9 = *(unsigned int*)(source9 + 4 * x);
4539							unsigned int cA = *(unsigned int*)(sourceA + 4 * x);
4540							unsigned int cB = *(unsigned int*)(sourceB + 4 * x);
4541							unsigned int cC = *(unsigned int*)(sourceC + 4 * x);
4542							unsigned int cD = *(unsigned int*)(sourceD + 4 * x);
4543							unsigned int cE = *(unsigned int*)(sourceE + 4 * x);
4544							unsigned int cF = *(unsigned int*)(sourceF + 4 * x);
4545
4546							c0 = AVERAGE(c0, c1);
4547							c2 = AVERAGE(c2, c3);
4548							c4 = AVERAGE(c4, c5);
4549							c6 = AVERAGE(c6, c7);
4550							c8 = AVERAGE(c8, c9);
4551							cA = AVERAGE(cA, cB);
4552							cC = AVERAGE(cC, cD);
4553							cE = AVERAGE(cE, cF);
4554							c0 = AVERAGE(c0, c2);
4555							c4 = AVERAGE(c4, c6);
4556							c8 = AVERAGE(c8, cA);
4557							cC = AVERAGE(cC, cE);
4558							c0 = AVERAGE(c0, c4);
4559							c8 = AVERAGE(c8, cC);
4560							c0 = AVERAGE(c0, c8);
4561
4562							*(unsigned int*)(source0 + 4 * x) = c0;
4563						}
4564
4565						source0 += pitch;
4566						source1 += pitch;
4567						source2 += pitch;
4568						source3 += pitch;
4569						source4 += pitch;
4570						source5 += pitch;
4571						source6 += pitch;
4572						source7 += pitch;
4573						source8 += pitch;
4574						source9 += pitch;
4575						sourceA += pitch;
4576						sourceB += pitch;
4577						sourceC += pitch;
4578						sourceD += pitch;
4579						sourceE += pitch;
4580						sourceF += pitch;
4581					}
4582				}
4583				else ASSERT(false);
4584
4585				#undef AVERAGE
4586			}
4587		}
4588		else if(internal.format == FORMAT_R32F)
4589		{
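			// Floating-point formats are resolved by summing the samples and scaling by 1/N
			// instead of the pairwise integer average used above.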
4590			if(CPUID::supportsSSE() && (width % 4) == 0)
4591			{
4592				if(internal.depth == 2)
4593				{
4594					for(int y = 0; y < height; y++)
4595					{
4596						for(int x = 0; x < width; x += 4)
4597						{
4598							__m128 c0 = _mm_load_ps((float*)(source0 + 4 * x));
4599							__m128 c1 = _mm_load_ps((float*)(source1 + 4 * x));
4600
4601							c0 = _mm_add_ps(c0, c1);
4602							c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 2.0f));
4603
4604							_mm_store_ps((float*)(source0 + 4 * x), c0);
4605						}
4606
4607						source0 += pitch;
4608						source1 += pitch;
4609					}
4610				}
4611				else if(internal.depth == 4)
4612				{
4613					for(int y = 0; y < height; y++)
4614					{
4615						for(int x = 0; x < width; x += 4)
4616						{
4617							__m128 c0 = _mm_load_ps((float*)(source0 + 4 * x));
4618							__m128 c1 = _mm_load_ps((float*)(source1 + 4 * x));
4619							__m128 c2 = _mm_load_ps((float*)(source2 + 4 * x));
4620							__m128 c3 = _mm_load_ps((float*)(source3 + 4 * x));
4621
4622							c0 = _mm_add_ps(c0, c1);
4623							c2 = _mm_add_ps(c2, c3);
4624							c0 = _mm_add_ps(c0, c2);
4625							c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 4.0f));
4626
4627							_mm_store_ps((float*)(source0 + 4 * x), c0);
4628						}
4629
4630						source0 += pitch;
4631						source1 += pitch;
4632						source2 += pitch;
4633						source3 += pitch;
4634					}
4635				}
4636				else if(internal.depth == 8)
4637				{
4638					for(int y = 0; y < height; y++)
4639					{
4640						for(int x = 0; x < width; x += 4)
4641						{
4642							__m128 c0 = _mm_load_ps((float*)(source0 + 4 * x));
4643							__m128 c1 = _mm_load_ps((float*)(source1 + 4 * x));
4644							__m128 c2 = _mm_load_ps((float*)(source2 + 4 * x));
4645							__m128 c3 = _mm_load_ps((float*)(source3 + 4 * x));
4646							__m128 c4 = _mm_load_ps((float*)(source4 + 4 * x));
4647							__m128 c5 = _mm_load_ps((float*)(source5 + 4 * x));
4648							__m128 c6 = _mm_load_ps((float*)(source6 + 4 * x));
4649							__m128 c7 = _mm_load_ps((float*)(source7 + 4 * x));
4650
4651							c0 = _mm_add_ps(c0, c1);
4652							c2 = _mm_add_ps(c2, c3);
4653							c4 = _mm_add_ps(c4, c5);
4654							c6 = _mm_add_ps(c6, c7);
4655							c0 = _mm_add_ps(c0, c2);
4656							c4 = _mm_add_ps(c4, c6);
4657							c0 = _mm_add_ps(c0, c4);
4658							c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 8.0f));
4659
4660							_mm_store_ps((float*)(source0 + 4 * x), c0);
4661						}
4662
4663						source0 += pitch;
4664						source1 += pitch;
4665						source2 += pitch;
4666						source3 += pitch;
4667						source4 += pitch;
4668						source5 += pitch;
4669						source6 += pitch;
4670						source7 += pitch;
4671					}
4672				}
4673				else if(internal.depth == 16)
4674				{
4675					for(int y = 0; y < height; y++)
4676					{
4677						for(int x = 0; x < width; x += 4)
4678						{
4679							__m128 c0 = _mm_load_ps((float*)(source0 + 4 * x));
4680							__m128 c1 = _mm_load_ps((float*)(source1 + 4 * x));
4681							__m128 c2 = _mm_load_ps((float*)(source2 + 4 * x));
4682							__m128 c3 = _mm_load_ps((float*)(source3 + 4 * x));
4683							__m128 c4 = _mm_load_ps((float*)(source4 + 4 * x));
4684							__m128 c5 = _mm_load_ps((float*)(source5 + 4 * x));
4685							__m128 c6 = _mm_load_ps((float*)(source6 + 4 * x));
4686							__m128 c7 = _mm_load_ps((float*)(source7 + 4 * x));
4687							__m128 c8 = _mm_load_ps((float*)(source8 + 4 * x));
4688							__m128 c9 = _mm_load_ps((float*)(source9 + 4 * x));
4689							__m128 cA = _mm_load_ps((float*)(sourceA + 4 * x));
4690							__m128 cB = _mm_load_ps((float*)(sourceB + 4 * x));
4691							__m128 cC = _mm_load_ps((float*)(sourceC + 4 * x));
4692							__m128 cD = _mm_load_ps((float*)(sourceD + 4 * x));
4693							__m128 cE = _mm_load_ps((float*)(sourceE + 4 * x));
4694							__m128 cF = _mm_load_ps((float*)(sourceF + 4 * x));
4695
4696							c0 = _mm_add_ps(c0, c1);
4697							c2 = _mm_add_ps(c2, c3);
4698							c4 = _mm_add_ps(c4, c5);
4699							c6 = _mm_add_ps(c6, c7);
4700							c8 = _mm_add_ps(c8, c9);
4701							cA = _mm_add_ps(cA, cB);
4702							cC = _mm_add_ps(cC, cD);
4703							cE = _mm_add_ps(cE, cF);
4704							c0 = _mm_add_ps(c0, c2);
4705							c4 = _mm_add_ps(c4, c6);
4706							c8 = _mm_add_ps(c8, cA);
4707							cC = _mm_add_ps(cC, cE);
4708							c0 = _mm_add_ps(c0, c4);
4709							c8 = _mm_add_ps(c8, cC);
4710							c0 = _mm_add_ps(c0, c8);
4711							c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 16.0f));
4712
4713							_mm_store_ps((float*)(source0 + 4 * x), c0);
4714						}
4715
4716						source0 += pitch;
4717						source1 += pitch;
4718						source2 += pitch;
4719						source3 += pitch;
4720						source4 += pitch;
4721						source5 += pitch;
4722						source6 += pitch;
4723						source7 += pitch;
4724						source8 += pitch;
4725						source9 += pitch;
4726						sourceA += pitch;
4727						sourceB += pitch;
4728						sourceC += pitch;
4729						sourceD += pitch;
4730						sourceE += pitch;
4731						sourceF += pitch;
4732					}
4733				}
4734				else ASSERT(false);
4735			}
4736			else
4737			{
4738				if(internal.depth == 2)
4739				{
4740					for(int y = 0; y < height; y++)
4741					{
4742						for(int x = 0; x < width; x++)
4743						{
4744							float c0 = *(float*)(source0 + 4 * x);
4745							float c1 = *(float*)(source1 + 4 * x);
4746
4747							c0 = c0 + c1;
4748							c0 *= 1.0f / 2.0f;
4749
4750							*(float*)(source0 + 4 * x) = c0;
4751						}
4752
4753						source0 += pitch;
4754						source1 += pitch;
4755					}
4756				}
4757				else if(internal.depth == 4)
4758				{
4759					for(int y = 0; y < height; y++)
4760					{
4761						for(int x = 0; x < width; x++)
4762						{
4763							float c0 = *(float*)(source0 + 4 * x);
4764							float c1 = *(float*)(source1 + 4 * x);
4765							float c2 = *(float*)(source2 + 4 * x);
4766							float c3 = *(float*)(source3 + 4 * x);
4767
4768							c0 = c0 + c1;
4769							c2 = c2 + c3;
4770							c0 = c0 + c2;
4771							c0 *= 1.0f / 4.0f;
4772
4773							*(float*)(source0 + 4 * x) = c0;
4774						}
4775
4776						source0 += pitch;
4777						source1 += pitch;
4778						source2 += pitch;
4779						source3 += pitch;
4780					}
4781				}
4782				else if(internal.depth == 8)
4783				{
4784					for(int y = 0; y < height; y++)
4785					{
4786						for(int x = 0; x < width; x++)
4787						{
4788							float c0 = *(float*)(source0 + 4 * x);
4789							float c1 = *(float*)(source1 + 4 * x);
4790							float c2 = *(float*)(source2 + 4 * x);
4791							float c3 = *(float*)(source3 + 4 * x);
4792							float c4 = *(float*)(source4 + 4 * x);
4793							float c5 = *(float*)(source5 + 4 * x);
4794							float c6 = *(float*)(source6 + 4 * x);
4795							float c7 = *(float*)(source7 + 4 * x);
4796
4797							c0 = c0 + c1;
4798							c2 = c2 + c3;
4799							c4 = c4 + c5;
4800							c6 = c6 + c7;
4801							c0 = c0 + c2;
4802							c4 = c4 + c6;
4803							c0 = c0 + c4;
4804							c0 *= 1.0f / 8.0f;
4805
4806							*(float*)(source0 + 4 * x) = c0;
4807						}
4808
4809						source0 += pitch;
4810						source1 += pitch;
4811						source2 += pitch;
4812						source3 += pitch;
4813						source4 += pitch;
4814						source5 += pitch;
4815						source6 += pitch;
4816						source7 += pitch;
4817					}
4818				}
4819				else if(internal.depth == 16)
4820				{
4821					for(int y = 0; y < height; y++)
4822					{
4823						for(int x = 0; x < width; x++)
4824						{
4825							float c0 = *(float*)(source0 + 4 * x);
4826							float c1 = *(float*)(source1 + 4 * x);
4827							float c2 = *(float*)(source2 + 4 * x);
4828							float c3 = *(float*)(source3 + 4 * x);
4829							float c4 = *(float*)(source4 + 4 * x);
4830							float c5 = *(float*)(source5 + 4 * x);
4831							float c6 = *(float*)(source6 + 4 * x);
4832							float c7 = *(float*)(source7 + 4 * x);
4833							float c8 = *(float*)(source8 + 4 * x);
4834							float c9 = *(float*)(source9 + 4 * x);
4835							float cA = *(float*)(sourceA + 4 * x);
4836							float cB = *(float*)(sourceB + 4 * x);
4837							float cC = *(float*)(sourceC + 4 * x);
4838							float cD = *(float*)(sourceD + 4 * x);
4839							float cE = *(float*)(sourceE + 4 * x);
4840							float cF = *(float*)(sourceF + 4 * x);
4841
4842							c0 = c0 + c1;
4843							c2 = c2 + c3;
4844							c4 = c4 + c5;
4845							c6 = c6 + c7;
4846							c8 = c8 + c9;
4847							cA = cA + cB;
4848							cC = cC + cD;
4849							cE = cE + cF;
4850							c0 = c0 + c2;
4851							c4 = c4 + c6;
4852							c8 = c8 + cA;
4853							cC = cC + cE;
4854							c0 = c0 + c4;
4855							c8 = c8 + cC;
4856							c0 = c0 + c8;
4857							c0 *= 1.0f / 16.0f;
4858
4859							*(float*)(source0 + 4 * x) = c0;
4860						}
4861
4862						source0 += pitch;
4863						source1 += pitch;
4864						source2 += pitch;
4865						source3 += pitch;
4866						source4 += pitch;
4867						source5 += pitch;
4868						source6 += pitch;
4869						source7 += pitch;
4870						source8 += pitch;
4871						source9 += pitch;
4872						sourceA += pitch;
4873						sourceB += pitch;
4874						sourceC += pitch;
4875						sourceD += pitch;
4876						sourceE += pitch;
4877						sourceF += pitch;
4878					}
4879				}
4880				else ASSERT(false);
4881			}
4882		}
4883		else if(internal.format == FORMAT_G32R32F)
4884		{
4885			if(CPUID::supportsSSE() && (width % 2) == 0)
4886			{
4887				if(internal.depth == 2)
4888				{
4889					for(int y = 0; y < height; y++)
4890					{
4891						for(int x = 0; x < width; x += 2)
4892						{
4893							__m128 c0 = _mm_load_ps((float*)(source0 + 8 * x));
4894							__m128 c1 = _mm_load_ps((float*)(source1 + 8 * x));
4895
4896							c0 = _mm_add_ps(c0, c1);
4897							c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 2.0f));
4898
4899							_mm_store_ps((float*)(source0 + 8 * x), c0);
4900						}
4901
4902						source0 += pitch;
4903						source1 += pitch;
4904					}
4905				}
4906				else if(internal.depth == 4)
4907				{
4908					for(int y = 0; y < height; y++)
4909					{
4910						for(int x = 0; x < width; x += 2)
4911						{
4912							__m128 c0 = _mm_load_ps((float*)(source0 + 8 * x));
4913							__m128 c1 = _mm_load_ps((float*)(source1 + 8 * x));
4914							__m128 c2 = _mm_load_ps((float*)(source2 + 8 * x));
4915							__m128 c3 = _mm_load_ps((float*)(source3 + 8 * x));
4916
4917							c0 = _mm_add_ps(c0, c1);
4918							c2 = _mm_add_ps(c2, c3);
4919							c0 = _mm_add_ps(c0, c2);
4920							c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 4.0f));
4921
4922							_mm_store_ps((float*)(source0 + 8 * x), c0);
4923						}
4924
4925						source0 += pitch;
4926						source1 += pitch;
4927						source2 += pitch;
4928						source3 += pitch;
4929					}
4930				}
4931				else if(internal.depth == 8)
4932				{
4933					for(int y = 0; y < height; y++)
4934					{
4935						for(int x = 0; x < width; x += 2)
4936						{
4937							__m128 c0 = _mm_load_ps((float*)(source0 + 8 * x));
4938							__m128 c1 = _mm_load_ps((float*)(source1 + 8 * x));
4939							__m128 c2 = _mm_load_ps((float*)(source2 + 8 * x));
4940							__m128 c3 = _mm_load_ps((float*)(source3 + 8 * x));
4941							__m128 c4 = _mm_load_ps((float*)(source4 + 8 * x));
4942							__m128 c5 = _mm_load_ps((float*)(source5 + 8 * x));
4943							__m128 c6 = _mm_load_ps((float*)(source6 + 8 * x));
4944							__m128 c7 = _mm_load_ps((float*)(source7 + 8 * x));
4945
4946							c0 = _mm_add_ps(c0, c1);
4947							c2 = _mm_add_ps(c2, c3);
4948							c4 = _mm_add_ps(c4, c5);
4949							c6 = _mm_add_ps(c6, c7);
4950							c0 = _mm_add_ps(c0, c2);
4951							c4 = _mm_add_ps(c4, c6);
4952							c0 = _mm_add_ps(c0, c4);
4953							c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 8.0f));
4954
4955							_mm_store_ps((float*)(source0 + 8 * x), c0);
4956						}
4957
4958						source0 += pitch;
4959						source1 += pitch;
4960						source2 += pitch;
4961						source3 += pitch;
4962						source4 += pitch;
4963						source5 += pitch;
4964						source6 += pitch;
4965						source7 += pitch;
4966					}
4967				}
4968				else if(internal.depth == 16)
4969				{
4970					for(int y = 0; y < height; y++)
4971					{
4972						for(int x = 0; x < width; x += 2)
4973						{
4974							__m128 c0 = _mm_load_ps((float*)(source0 + 8 * x));
4975							__m128 c1 = _mm_load_ps((float*)(source1 + 8 * x));
4976							__m128 c2 = _mm_load_ps((float*)(source2 + 8 * x));
4977							__m128 c3 = _mm_load_ps((float*)(source3 + 8 * x));
4978							__m128 c4 = _mm_load_ps((float*)(source4 + 8 * x));
4979							__m128 c5 = _mm_load_ps((float*)(source5 + 8 * x));
4980							__m128 c6 = _mm_load_ps((float*)(source6 + 8 * x));
4981							__m128 c7 = _mm_load_ps((float*)(source7 + 8 * x));
4982							__m128 c8 = _mm_load_ps((float*)(source8 + 8 * x));
4983							__m128 c9 = _mm_load_ps((float*)(source9 + 8 * x));
4984							__m128 cA = _mm_load_ps((float*)(sourceA + 8 * x));
4985							__m128 cB = _mm_load_ps((float*)(sourceB + 8 * x));
4986							__m128 cC = _mm_load_ps((float*)(sourceC + 8 * x));
4987							__m128 cD = _mm_load_ps((float*)(sourceD + 8 * x));
4988							__m128 cE = _mm_load_ps((float*)(sourceE + 8 * x));
4989							__m128 cF = _mm_load_ps((float*)(sourceF + 8 * x));
4990
4991							c0 = _mm_add_ps(c0, c1);
4992							c2 = _mm_add_ps(c2, c3);
4993							c4 = _mm_add_ps(c4, c5);
4994							c6 = _mm_add_ps(c6, c7);
4995							c8 = _mm_add_ps(c8, c9);
4996							cA = _mm_add_ps(cA, cB);
4997							cC = _mm_add_ps(cC, cD);
4998							cE = _mm_add_ps(cE, cF);
4999							c0 = _mm_add_ps(c0, c2);
5000							c4 = _mm_add_ps(c4, c6);
5001							c8 = _mm_add_ps(c8, cA);
5002							cC = _mm_add_ps(cC, cE);
5003							c0 = _mm_add_ps(c0, c4);
5004							c8 = _mm_add_ps(c8, cC);
5005							c0 = _mm_add_ps(c0, c8);
5006							c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 16.0f));
5007
5008							_mm_store_ps((float*)(source0 + 8 * x), c0);
5009						}
5010
5011						source0 += pitch;
5012						source1 += pitch;
5013						source2 += pitch;
5014						source3 += pitch;
5015						source4 += pitch;
5016						source5 += pitch;
5017						source6 += pitch;
5018						source7 += pitch;
5019						source8 += pitch;
5020						source9 += pitch;
5021						sourceA += pitch;
5022						sourceB += pitch;
5023						sourceC += pitch;
5024						sourceD += pitch;
5025						sourceE += pitch;
5026						sourceF += pitch;
5027					}
5028				}
5029				else ASSERT(false);
5030			}
5031			else
5032			{
5033				if(internal.depth == 2)
5034				{
5035					for(int y = 0; y < height; y++)
5036					{
5037						for(int x = 0; x < 2 * width; x++)
5038						{
5039							float c0 = *(float*)(source0 + 4 * x);
5040							float c1 = *(float*)(source1 + 4 * x);
5041
5042							c0 = c0 + c1;
5043							c0 *= 1.0f / 2.0f;
5044
5045							*(float*)(source0 + 4 * x) = c0;
5046						}
5047
5048						source0 += pitch;
5049						source1 += pitch;
5050					}
5051				}
5052				else if(internal.depth == 4)
5053				{
5054					for(int y = 0; y < height; y++)
5055					{
5056						for(int x = 0; x < 2 * width; x++)
5057						{
5058							float c0 = *(float*)(source0 + 4 * x);
5059							float c1 = *(float*)(source1 + 4 * x);
5060							float c2 = *(float*)(source2 + 4 * x);
5061							float c3 = *(float*)(source3 + 4 * x);
5062
5063							c0 = c0 + c1;
5064							c2 = c2 + c3;
5065							c0 = c0 + c2;
5066							c0 *= 1.0f / 4.0f;
5067
5068							*(float*)(source0 + 4 * x) = c0;
5069						}
5070
5071						source0 += pitch;
5072						source1 += pitch;
5073						source2 += pitch;
5074						source3 += pitch;
5075					}
5076				}
5077				else if(internal.depth == 8)
5078				{
5079					for(int y = 0; y < height; y++)
5080					{
5081						for(int x = 0; x < 2 * width; x++)
5082						{
5083							float c0 = *(float*)(source0 + 4 * x);
5084							float c1 = *(float*)(source1 + 4 * x);
5085							float c2 = *(float*)(source2 + 4 * x);
5086							float c3 = *(float*)(source3 + 4 * x);
5087							float c4 = *(float*)(source4 + 4 * x);
5088							float c5 = *(float*)(source5 + 4 * x);
5089							float c6 = *(float*)(source6 + 4 * x);
5090							float c7 = *(float*)(source7 + 4 * x);
5091
5092							c0 = c0 + c1;
5093							c2 = c2 + c3;
5094							c4 = c4 + c5;
5095							c6 = c6 + c7;
5096							c0 = c0 + c2;
5097							c4 = c4 + c6;
5098							c0 = c0 + c4;
5099							c0 *= 1.0f / 8.0f;
5100
5101							*(float*)(source0 + 4 * x) = c0;
5102						}
5103
5104						source0 += pitch;
5105						source1 += pitch;
5106						source2 += pitch;
5107						source3 += pitch;
5108						source4 += pitch;
5109						source5 += pitch;
5110						source6 += pitch;
5111						source7 += pitch;
5112					}
5113				}
5114				else if(internal.depth == 16)
5115				{
5116					for(int y = 0; y < height; y++)
5117					{
5118						for(int x = 0; x < 2 * width; x++)
5119						{
5120							float c0 = *(float*)(source0 + 4 * x);
5121							float c1 = *(float*)(source1 + 4 * x);
5122							float c2 = *(float*)(source2 + 4 * x);
5123							float c3 = *(float*)(source3 + 4 * x);
5124							float c4 = *(float*)(source4 + 4 * x);
5125							float c5 = *(float*)(source5 + 4 * x);
5126							float c6 = *(float*)(source6 + 4 * x);
5127							float c7 = *(float*)(source7 + 4 * x);
5128							float c8 = *(float*)(source8 + 4 * x);
5129							float c9 = *(float*)(source9 + 4 * x);
5130							float cA = *(float*)(sourceA + 4 * x);
5131							float cB = *(float*)(sourceB + 4 * x);
5132							float cC = *(float*)(sourceC + 4 * x);
5133							float cD = *(float*)(sourceD + 4 * x);
5134							float cE = *(float*)(sourceE + 4 * x);
5135							float cF = *(float*)(sourceF + 4 * x);
5136
5137							c0 = c0 + c1;
5138							c2 = c2 + c3;
5139							c4 = c4 + c5;
5140							c6 = c6 + c7;
5141							c8 = c8 + c9;
5142							cA = cA + cB;
5143							cC = cC + cD;
5144							cE = cE + cF;
5145							c0 = c0 + c2;
5146							c4 = c4 + c6;
5147							c8 = c8 + cA;
5148							cC = cC + cE;
5149							c0 = c0 + c4;
5150							c8 = c8 + cC;
5151							c0 = c0 + c8;
5152							c0 *= 1.0f / 16.0f;
5153
5154							*(float*)(source0 + 4 * x) = c0;
5155						}
5156
5157						source0 += pitch;
5158						source1 += pitch;
5159						source2 += pitch;
5160						source3 += pitch;
5161						source4 += pitch;
5162						source5 += pitch;
5163						source6 += pitch;
5164						source7 += pitch;
5165						source8 += pitch;
5166						source9 += pitch;
5167						sourceA += pitch;
5168						sourceB += pitch;
5169						sourceC += pitch;
5170						sourceD += pitch;
5171						sourceE += pitch;
5172						sourceF += pitch;
5173					}
5174				}
5175				else ASSERT(false);
5176			}
5177		}
5178		else if(internal.format == FORMAT_A32B32G32R32F || internal.format == FORMAT_X32B32G32R32F)
5179		{
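			// 128-bit float formats: a pixel is four 32-bit floats, so one __m128 holds
			// exactly one pixel. The samples for each pixel are summed in a pairwise
			// reduction tree and scaled by 1 / sample count, with the result written back
			// over the first sample (source0).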
5180			if(CPUID::supportsSSE())
5181			{
5182				if(internal.depth == 2)
5183				{
5184					for(int y = 0; y < height; y++)
5185					{
5186						for(int x = 0; x < width; x++)
5187						{
5188							__m128 c0 = _mm_load_ps((float*)(source0 + 16 * x));
5189							__m128 c1 = _mm_load_ps((float*)(source1 + 16 * x));
5190
5191							c0 = _mm_add_ps(c0, c1);
5192							c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 2.0f));
5193
5194							_mm_store_ps((float*)(source0 + 16 * x), c0);
5195						}
5196
5197						source0 += pitch;
5198						source1 += pitch;
5199					}
5200				}
5201				else if(internal.depth == 4)
5202				{
5203					for(int y = 0; y < height; y++)
5204					{
5205						for(int x = 0; x < width; x++)
5206						{
5207							__m128 c0 = _mm_load_ps((float*)(source0 + 16 * x));
5208							__m128 c1 = _mm_load_ps((float*)(source1 + 16 * x));
5209							__m128 c2 = _mm_load_ps((float*)(source2 + 16 * x));
5210							__m128 c3 = _mm_load_ps((float*)(source3 + 16 * x));
5211
5212							c0 = _mm_add_ps(c0, c1);
5213							c2 = _mm_add_ps(c2, c3);
5214							c0 = _mm_add_ps(c0, c2);
5215							c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 4.0f));
5216
5217							_mm_store_ps((float*)(source0 + 16 * x), c0);
5218						}
5219
5220						source0 += pitch;
5221						source1 += pitch;
5222						source2 += pitch;
5223						source3 += pitch;
5224					}
5225				}
5226				else if(internal.depth == 8)
5227				{
5228					for(int y = 0; y < height; y++)
5229					{
5230						for(int x = 0; x < width; x++)
5231						{
5232							__m128 c0 = _mm_load_ps((float*)(source0 + 16 * x));
5233							__m128 c1 = _mm_load_ps((float*)(source1 + 16 * x));
5234							__m128 c2 = _mm_load_ps((float*)(source2 + 16 * x));
5235							__m128 c3 = _mm_load_ps((float*)(source3 + 16 * x));
5236							__m128 c4 = _mm_load_ps((float*)(source4 + 16 * x));
5237							__m128 c5 = _mm_load_ps((float*)(source5 + 16 * x));
5238							__m128 c6 = _mm_load_ps((float*)(source6 + 16 * x));
5239							__m128 c7 = _mm_load_ps((float*)(source7 + 16 * x));
5240
5241							c0 = _mm_add_ps(c0, c1);
5242							c2 = _mm_add_ps(c2, c3);
5243							c4 = _mm_add_ps(c4, c5);
5244							c6 = _mm_add_ps(c6, c7);
5245							c0 = _mm_add_ps(c0, c2);
5246							c4 = _mm_add_ps(c4, c6);
5247							c0 = _mm_add_ps(c0, c4);
5248							c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 8.0f));
5249
5250							_mm_store_ps((float*)(source0 + 16 * x), c0);
5251						}
5252
5253						source0 += pitch;
5254						source1 += pitch;
5255						source2 += pitch;
5256						source3 += pitch;
5257						source4 += pitch;
5258						source5 += pitch;
5259						source6 += pitch;
5260						source7 += pitch;
5261					}
5262				}
5263				else if(internal.depth == 16)
5264				{
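					// 16 samples per pixel: four rounds of pairwise _mm_add_ps collapse the
					// sixteen vectors into one sum, which is then scaled by 1/16.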
5265					for(int y = 0; y < height; y++)
5266					{
5267						for(int x = 0; x < width; x++)
5268						{
5269							__m128 c0 = _mm_load_ps((float*)(source0 + 16 * x));
5270							__m128 c1 = _mm_load_ps((float*)(source1 + 16 * x));
5271							__m128 c2 = _mm_load_ps((float*)(source2 + 16 * x));
5272							__m128 c3 = _mm_load_ps((float*)(source3 + 16 * x));
5273							__m128 c4 = _mm_load_ps((float*)(source4 + 16 * x));
5274							__m128 c5 = _mm_load_ps((float*)(source5 + 16 * x));
5275							__m128 c6 = _mm_load_ps((float*)(source6 + 16 * x));
5276							__m128 c7 = _mm_load_ps((float*)(source7 + 16 * x));
5277							__m128 c8 = _mm_load_ps((float*)(source8 + 16 * x));
5278							__m128 c9 = _mm_load_ps((float*)(source9 + 16 * x));
5279							__m128 cA = _mm_load_ps((float*)(sourceA + 16 * x));
5280							__m128 cB = _mm_load_ps((float*)(sourceB + 16 * x));
5281							__m128 cC = _mm_load_ps((float*)(sourceC + 16 * x));
5282							__m128 cD = _mm_load_ps((float*)(sourceD + 16 * x));
5283							__m128 cE = _mm_load_ps((float*)(sourceE + 16 * x));
5284							__m128 cF = _mm_load_ps((float*)(sourceF + 16 * x));
5285
5286							c0 = _mm_add_ps(c0, c1);
5287							c2 = _mm_add_ps(c2, c3);
5288							c4 = _mm_add_ps(c4, c5);
5289							c6 = _mm_add_ps(c6, c7);
5290							c8 = _mm_add_ps(c8, c9);
5291							cA = _mm_add_ps(cA, cB);
5292							cC = _mm_add_ps(cC, cD);
5293							cE = _mm_add_ps(cE, cF);
5294							c0 = _mm_add_ps(c0, c2);
5295							c4 = _mm_add_ps(c4, c6);
5296							c8 = _mm_add_ps(c8, cA);
5297							cC = _mm_add_ps(cC, cE);
5298							c0 = _mm_add_ps(c0, c4);
5299							c8 = _mm_add_ps(c8, cC);
5300							c0 = _mm_add_ps(c0, c8);
5301							c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 16.0f));
5302
5303							_mm_store_ps((float*)(source0 + 16 * x), c0);
5304						}
5305
5306						source0 += pitch;
5307						source1 += pitch;
5308						source2 += pitch;
5309						source3 += pitch;
5310						source4 += pitch;
5311						source5 += pitch;
5312						source6 += pitch;
5313						source7 += pitch;
5314						source8 += pitch;
5315						source9 += pitch;
5316						sourceA += pitch;
5317						sourceB += pitch;
5318						sourceC += pitch;
5319						sourceD += pitch;
5320						sourceE += pitch;
5321						sourceF += pitch;
5322					}
5323				}
5324				else ASSERT(false);
5325			}
5326			else
5327			{
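				// Scalar fallback: the same reduction, performed one float at a time over
				// the 4 * width floats of each row.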
5328				if(internal.depth == 2)
5329				{
5330					for(int y = 0; y < height; y++)
5331					{
5332						for(int x = 0; x < 4 * width; x++)
5333						{
5334							float c0 = *(float*)(source0 + 4 * x);
5335							float c1 = *(float*)(source1 + 4 * x);
5336
5337							c0 = c0 + c1;
5338							c0 *= 1.0f / 2.0f;
5339
5340							*(float*)(source0 + 4 * x) = c0;
5341						}
5342
5343						source0 += pitch;
5344						source1 += pitch;
5345					}
5346				}
5347				else if(internal.depth == 4)
5348				{
5349					for(int y = 0; y < height; y++)
5350					{
5351						for(int x = 0; x < 4 * width; x++)
5352						{
5353							float c0 = *(float*)(source0 + 4 * x);
5354							float c1 = *(float*)(source1 + 4 * x);
5355							float c2 = *(float*)(source2 + 4 * x);
5356							float c3 = *(float*)(source3 + 4 * x);
5357
5358							c0 = c0 + c1;
5359							c2 = c2 + c3;
5360							c0 = c0 + c2;
5361							c0 *= 1.0f / 4.0f;
5362
5363							*(float*)(source0 + 4 * x) = c0;
5364						}
5365
5366						source0 += pitch;
5367						source1 += pitch;
5368						source2 += pitch;
5369						source3 += pitch;
5370					}
5371				}
5372				else if(internal.depth == 8)
5373				{
5374					for(int y = 0; y < height; y++)
5375					{
5376						for(int x = 0; x < 4 * width; x++)
5377						{
5378							float c0 = *(float*)(source0 + 4 * x);
5379							float c1 = *(float*)(source1 + 4 * x);
5380							float c2 = *(float*)(source2 + 4 * x);
5381							float c3 = *(float*)(source3 + 4 * x);
5382							float c4 = *(float*)(source4 + 4 * x);
5383							float c5 = *(float*)(source5 + 4 * x);
5384							float c6 = *(float*)(source6 + 4 * x);
5385							float c7 = *(float*)(source7 + 4 * x);
5386
5387							c0 = c0 + c1;
5388							c2 = c2 + c3;
5389							c4 = c4 + c5;
5390							c6 = c6 + c7;
5391							c0 = c0 + c2;
5392							c4 = c4 + c6;
5393							c0 = c0 + c4;
5394							c0 *= 1.0f / 8.0f;
5395
5396							*(float*)(source0 + 4 * x) = c0;
5397						}
5398
5399						source0 += pitch;
5400						source1 += pitch;
5401						source2 += pitch;
5402						source3 += pitch;
5403						source4 += pitch;
5404						source5 += pitch;
5405						source6 += pitch;
5406						source7 += pitch;
5407					}
5408				}
5409				else if(internal.depth == 16)
5410				{
5411					for(int y = 0; y < height; y++)
5412					{
5413						for(int x = 0; x < 4 * width; x++)
5414						{
5415							float c0 = *(float*)(source0 + 4 * x);
5416							float c1 = *(float*)(source1 + 4 * x);
5417							float c2 = *(float*)(source2 + 4 * x);
5418							float c3 = *(float*)(source3 + 4 * x);
5419							float c4 = *(float*)(source4 + 4 * x);
5420							float c5 = *(float*)(source5 + 4 * x);
5421							float c6 = *(float*)(source6 + 4 * x);
5422							float c7 = *(float*)(source7 + 4 * x);
5423							float c8 = *(float*)(source8 + 4 * x);
5424							float c9 = *(float*)(source9 + 4 * x);
5425							float cA = *(float*)(sourceA + 4 * x);
5426							float cB = *(float*)(sourceB + 4 * x);
5427							float cC = *(float*)(sourceC + 4 * x);
5428							float cD = *(float*)(sourceD + 4 * x);
5429							float cE = *(float*)(sourceE + 4 * x);
5430							float cF = *(float*)(sourceF + 4 * x);
5431
5432							c0 = c0 + c1;
5433							c2 = c2 + c3;
5434							c4 = c4 + c5;
5435							c6 = c6 + c7;
5436							c8 = c8 + c9;
5437							cA = cA + cB;
5438							cC = cC + cD;
5439							cE = cE + cF;
5440							c0 = c0 + c2;
5441							c4 = c4 + c6;
5442							c8 = c8 + cA;
5443							cC = cC + cE;
5444							c0 = c0 + c4;
5445							c8 = c8 + cC;
5446							c0 = c0 + c8;
5447							c0 *= 1.0f / 16.0f;
5448
5449							*(float*)(source0 + 4 * x) = c0;
5450						}
5451
5452						source0 += pitch;
5453						source1 += pitch;
5454						source2 += pitch;
5455						source3 += pitch;
5456						source4 += pitch;
5457						source5 += pitch;
5458						source6 += pitch;
5459						source7 += pitch;
5460						source8 += pitch;
5461						source9 += pitch;
5462						sourceA += pitch;
5463						sourceB += pitch;
5464						sourceC += pitch;
5465						sourceD += pitch;
5466						sourceE += pitch;
5467						sourceF += pitch;
5468					}
5469				}
5470				else ASSERT(false);
5471			}
5472		}
5473		else if(internal.format == FORMAT_R5G6B5)
5474		{
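			// R5G6B5 is resolved with channel-masked averaging. The SSE2 path handles
			// eight pixels per __m128i (hence the width % 8 requirement): red|blue is
			// isolated with 0xF81F and green with 0x07E0, each group is averaged with the
			// rounding pavgb/pavgw instructions, re-masked and recombined. The 6-bit green
			// field straddles a byte boundary, so every green stage must use the 16-bit
			// _mm_avg_epu16; red and blue each fit within a single byte, so _mm_avg_epu8
			// is sufficient for them. Chaining rounding averages biases the result slightly
			// upward compared to a true 1/N sum, a small bias inherent to this approach.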
5475			if(CPUID::supportsSSE2() && (width % 8) == 0)
5476			{
5477				if(internal.depth == 2)
5478				{
5479					for(int y = 0; y < height; y++)
5480					{
5481						for(int x = 0; x < width; x += 8)
5482						{
5483							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 2 * x));
5484							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 2 * x));
5485
5486							static const ushort8 r_b = {0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F};
5487							static const ushort8 _g_ = {0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0};
5488							__m128i c0_r_b = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
5489							__m128i c0__g_ = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(_g_));
5490							__m128i c1_r_b = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(r_b));
5491							__m128i c1__g_ = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
5492
5493							c0 = _mm_avg_epu8(c0_r_b, c1_r_b);
5494							c0 = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
5495							c1 = _mm_avg_epu16(c0__g_, c1__g_);
5496							c1 = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
5497							c0 = _mm_or_si128(c0, c1);
5498
5499							_mm_store_si128((__m128i*)(source0 + 2 * x), c0);
5500						}
5501
5502						source0 += pitch;
5503						source1 += pitch;
5504					}
5505				}
5506				else if(internal.depth == 4)
5507				{
5508					for(int y = 0; y < height; y++)
5509					{
5510						for(int x = 0; x < width; x += 8)
5511						{
5512							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 2 * x));
5513							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 2 * x));
5514							__m128i c2 = _mm_load_si128((__m128i*)(source2 + 2 * x));
5515							__m128i c3 = _mm_load_si128((__m128i*)(source3 + 2 * x));
5516
5517							static const ushort8 r_b = {0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F};
5518							static const ushort8 _g_ = {0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0};
5519							__m128i c0_r_b = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
5520							__m128i c0__g_ = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(_g_));
5521							__m128i c1_r_b = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(r_b));
5522							__m128i c1__g_ = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
5523							__m128i c2_r_b = _mm_and_si128(c2, reinterpret_cast<const __m128i&>(r_b));
5524							__m128i c2__g_ = _mm_and_si128(c2, reinterpret_cast<const __m128i&>(_g_));
5525							__m128i c3_r_b = _mm_and_si128(c3, reinterpret_cast<const __m128i&>(r_b));
5526							__m128i c3__g_ = _mm_and_si128(c3, reinterpret_cast<const __m128i&>(_g_));
5527
5528							c0 = _mm_avg_epu8(c0_r_b, c1_r_b);
5529							c2 = _mm_avg_epu8(c2_r_b, c3_r_b);
5530							c0 = _mm_avg_epu8(c0, c2);
5531							c0 = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
5532							c1 = _mm_avg_epu16(c0__g_, c1__g_);
5533							c3 = _mm_avg_epu16(c2__g_, c3__g_);
5534							c1 = _mm_avg_epu16(c1, c3);
5535							c1 = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
5536							c0 = _mm_or_si128(c0, c1);
5537
5538							_mm_store_si128((__m128i*)(source0 + 2 * x), c0);
5539						}
5540
5541						source0 += pitch;
5542						source1 += pitch;
5543						source2 += pitch;
5544						source3 += pitch;
5545					}
5546				}
5547				else if(internal.depth == 8)
5548				{
5549					for(int y = 0; y < height; y++)
5550					{
5551						for(int x = 0; x < width; x += 8)
5552						{
5553							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 2 * x));
5554							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 2 * x));
5555							__m128i c2 = _mm_load_si128((__m128i*)(source2 + 2 * x));
5556							__m128i c3 = _mm_load_si128((__m128i*)(source3 + 2 * x));
5557							__m128i c4 = _mm_load_si128((__m128i*)(source4 + 2 * x));
5558							__m128i c5 = _mm_load_si128((__m128i*)(source5 + 2 * x));
5559							__m128i c6 = _mm_load_si128((__m128i*)(source6 + 2 * x));
5560							__m128i c7 = _mm_load_si128((__m128i*)(source7 + 2 * x));
5561
5562							static const ushort8 r_b = {0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F};
5563							static const ushort8 _g_ = {0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0};
5564							__m128i c0_r_b = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
5565							__m128i c0__g_ = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(_g_));
5566							__m128i c1_r_b = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(r_b));
5567							__m128i c1__g_ = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
5568							__m128i c2_r_b = _mm_and_si128(c2, reinterpret_cast<const __m128i&>(r_b));
5569							__m128i c2__g_ = _mm_and_si128(c2, reinterpret_cast<const __m128i&>(_g_));
5570							__m128i c3_r_b = _mm_and_si128(c3, reinterpret_cast<const __m128i&>(r_b));
5571							__m128i c3__g_ = _mm_and_si128(c3, reinterpret_cast<const __m128i&>(_g_));
5572							__m128i c4_r_b = _mm_and_si128(c4, reinterpret_cast<const __m128i&>(r_b));
5573							__m128i c4__g_ = _mm_and_si128(c4, reinterpret_cast<const __m128i&>(_g_));
5574							__m128i c5_r_b = _mm_and_si128(c5, reinterpret_cast<const __m128i&>(r_b));
5575							__m128i c5__g_ = _mm_and_si128(c5, reinterpret_cast<const __m128i&>(_g_));
5576							__m128i c6_r_b = _mm_and_si128(c6, reinterpret_cast<const __m128i&>(r_b));
5577							__m128i c6__g_ = _mm_and_si128(c6, reinterpret_cast<const __m128i&>(_g_));
5578							__m128i c7_r_b = _mm_and_si128(c7, reinterpret_cast<const __m128i&>(r_b));
5579							__m128i c7__g_ = _mm_and_si128(c7, reinterpret_cast<const __m128i&>(_g_));
5580
5581							c0 = _mm_avg_epu8(c0_r_b, c1_r_b);
5582							c2 = _mm_avg_epu8(c2_r_b, c3_r_b);
5583							c4 = _mm_avg_epu8(c4_r_b, c5_r_b);
5584							c6 = _mm_avg_epu8(c6_r_b, c7_r_b);
5585							c0 = _mm_avg_epu8(c0, c2);
5586							c4 = _mm_avg_epu8(c4, c6);
5587							c0 = _mm_avg_epu8(c0, c4);
5588							c0 = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
5589							c1 = _mm_avg_epu16(c0__g_, c1__g_);
5590							c3 = _mm_avg_epu16(c2__g_, c3__g_);
5591							c5 = _mm_avg_epu16(c4__g_, c5__g_);
5592							c7 = _mm_avg_epu16(c6__g_, c7__g_);
5593							c1 = _mm_avg_epu16(c1, c3);
5594							c5 = _mm_avg_epu16(c5, c7);
5595							c1 = _mm_avg_epu16(c1, c5);
5596							c1 = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
5597							c0 = _mm_or_si128(c0, c1);
5598
5599							_mm_store_si128((__m128i*)(source0 + 2 * x), c0);
5600						}
5601
5602						source0 += pitch;
5603						source1 += pitch;
5604						source2 += pitch;
5605						source3 += pitch;
5606						source4 += pitch;
5607						source5 += pitch;
5608						source6 += pitch;
5609						source7 += pitch;
5610					}
5611				}
5612				else if(internal.depth == 16)
5613				{
5614					for(int y = 0; y < height; y++)
5615					{
5616						for(int x = 0; x < width; x += 8)
5617						{
5618							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 2 * x));
5619							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 2 * x));
5620							__m128i c2 = _mm_load_si128((__m128i*)(source2 + 2 * x));
5621							__m128i c3 = _mm_load_si128((__m128i*)(source3 + 2 * x));
5622							__m128i c4 = _mm_load_si128((__m128i*)(source4 + 2 * x));
5623							__m128i c5 = _mm_load_si128((__m128i*)(source5 + 2 * x));
5624							__m128i c6 = _mm_load_si128((__m128i*)(source6 + 2 * x));
5625							__m128i c7 = _mm_load_si128((__m128i*)(source7 + 2 * x));
5626							__m128i c8 = _mm_load_si128((__m128i*)(source8 + 2 * x));
5627							__m128i c9 = _mm_load_si128((__m128i*)(source9 + 2 * x));
5628							__m128i cA = _mm_load_si128((__m128i*)(sourceA + 2 * x));
5629							__m128i cB = _mm_load_si128((__m128i*)(sourceB + 2 * x));
5630							__m128i cC = _mm_load_si128((__m128i*)(sourceC + 2 * x));
5631							__m128i cD = _mm_load_si128((__m128i*)(sourceD + 2 * x));
5632							__m128i cE = _mm_load_si128((__m128i*)(sourceE + 2 * x));
5633							__m128i cF = _mm_load_si128((__m128i*)(sourceF + 2 * x));
5634
5635							static const ushort8 r_b = {0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F};
5636							static const ushort8 _g_ = {0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0};
5637							__m128i c0_r_b = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
5638							__m128i c0__g_ = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(_g_));
5639							__m128i c1_r_b = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(r_b));
5640							__m128i c1__g_ = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
5641							__m128i c2_r_b = _mm_and_si128(c2, reinterpret_cast<const __m128i&>(r_b));
5642							__m128i c2__g_ = _mm_and_si128(c2, reinterpret_cast<const __m128i&>(_g_));
5643							__m128i c3_r_b = _mm_and_si128(c3, reinterpret_cast<const __m128i&>(r_b));
5644							__m128i c3__g_ = _mm_and_si128(c3, reinterpret_cast<const __m128i&>(_g_));
5645							__m128i c4_r_b = _mm_and_si128(c4, reinterpret_cast<const __m128i&>(r_b));
5646							__m128i c4__g_ = _mm_and_si128(c4, reinterpret_cast<const __m128i&>(_g_));
5647							__m128i c5_r_b = _mm_and_si128(c5, reinterpret_cast<const __m128i&>(r_b));
5648							__m128i c5__g_ = _mm_and_si128(c5, reinterpret_cast<const __m128i&>(_g_));
5649							__m128i c6_r_b = _mm_and_si128(c6, reinterpret_cast<const __m128i&>(r_b));
5650							__m128i c6__g_ = _mm_and_si128(c6, reinterpret_cast<const __m128i&>(_g_));
5651							__m128i c7_r_b = _mm_and_si128(c7, reinterpret_cast<const __m128i&>(r_b));
5652							__m128i c7__g_ = _mm_and_si128(c7, reinterpret_cast<const __m128i&>(_g_));
5653							__m128i c8_r_b = _mm_and_si128(c8, reinterpret_cast<const __m128i&>(r_b));
5654							__m128i c8__g_ = _mm_and_si128(c8, reinterpret_cast<const __m128i&>(_g_));
5655							__m128i c9_r_b = _mm_and_si128(c9, reinterpret_cast<const __m128i&>(r_b));
5656							__m128i c9__g_ = _mm_and_si128(c9, reinterpret_cast<const __m128i&>(_g_));
5657							__m128i cA_r_b = _mm_and_si128(cA, reinterpret_cast<const __m128i&>(r_b));
5658							__m128i cA__g_ = _mm_and_si128(cA, reinterpret_cast<const __m128i&>(_g_));
5659							__m128i cB_r_b = _mm_and_si128(cB, reinterpret_cast<const __m128i&>(r_b));
5660							__m128i cB__g_ = _mm_and_si128(cB, reinterpret_cast<const __m128i&>(_g_));
5661							__m128i cC_r_b = _mm_and_si128(cC, reinterpret_cast<const __m128i&>(r_b));
5662							__m128i cC__g_ = _mm_and_si128(cC, reinterpret_cast<const __m128i&>(_g_));
5663							__m128i cD_r_b = _mm_and_si128(cD, reinterpret_cast<const __m128i&>(r_b));
5664							__m128i cD__g_ = _mm_and_si128(cD, reinterpret_cast<const __m128i&>(_g_));
5665							__m128i cE_r_b = _mm_and_si128(cE, reinterpret_cast<const __m128i&>(r_b));
5666							__m128i cE__g_ = _mm_and_si128(cE, reinterpret_cast<const __m128i&>(_g_));
5667							__m128i cF_r_b = _mm_and_si128(cF, reinterpret_cast<const __m128i&>(r_b));
5668							__m128i cF__g_ = _mm_and_si128(cF, reinterpret_cast<const __m128i&>(_g_));
5669
5670							c0 = _mm_avg_epu8(c0_r_b, c1_r_b);
5671							c2 = _mm_avg_epu8(c2_r_b, c3_r_b);
5672							c4 = _mm_avg_epu8(c4_r_b, c5_r_b);
5673							c6 = _mm_avg_epu8(c6_r_b, c7_r_b);
5674							c8 = _mm_avg_epu8(c8_r_b, c9_r_b);
5675							cA = _mm_avg_epu8(cA_r_b, cB_r_b);
5676							cC = _mm_avg_epu8(cC_r_b, cD_r_b);
5677							cE = _mm_avg_epu8(cE_r_b, cF_r_b);
5678							c0 = _mm_avg_epu8(c0, c2);
5679							c4 = _mm_avg_epu8(c4, c6);
5680							c8 = _mm_avg_epu8(c8, cA);
5681							cC = _mm_avg_epu8(cC, cE);
5682							c0 = _mm_avg_epu8(c0, c4);
5683							c8 = _mm_avg_epu8(c8, cC);
5684							c0 = _mm_avg_epu8(c0, c8);
5685							c0 = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
5686							c1 = _mm_avg_epu16(c0__g_, c1__g_);
5687							c3 = _mm_avg_epu16(c2__g_, c3__g_);
5688							c5 = _mm_avg_epu16(c4__g_, c5__g_);
5689							c7 = _mm_avg_epu16(c6__g_, c7__g_);
5690							c9 = _mm_avg_epu16(c8__g_, c9__g_);
5691							cB = _mm_avg_epu16(cA__g_, cB__g_);
5692							cD = _mm_avg_epu16(cC__g_, cD__g_);
5693							cF = _mm_avg_epu16(cE__g_, cF__g_);
5694							c1 = _mm_avg_epu16(c1, c3);
5695							c5 = _mm_avg_epu16(c5, c7);
5696							c9 = _mm_avg_epu16(c9, cB);
5697							cD = _mm_avg_epu16(cD, cF);
5698							c1 = _mm_avg_epu16(c1, c5);
5699							c9 = _mm_avg_epu16(c9, cD);
5700							c1 = _mm_avg_epu16(c1, c9);
5701							c1 = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
5702							c0 = _mm_or_si128(c0, c1);
5703
5704							_mm_store_si128((__m128i*)(source0 + 2 * x), c0);
5705						}
5706
5707						source0 += pitch;
5708						source1 += pitch;
5709						source2 += pitch;
5710						source3 += pitch;
5711						source4 += pitch;
5712						source5 += pitch;
5713						source6 += pitch;
5714						source7 += pitch;
5715						source8 += pitch;
5716						source9 += pitch;
5717						sourceA += pitch;
5718						sourceB += pitch;
5719						sourceC += pitch;
5720						sourceD += pitch;
5721						sourceE += pitch;
5722						sourceF += pitch;
5723					}
5724				}
5725				else ASSERT(false);
5726			}
5727			else
5728			{
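				// Scalar fallback: AVERAGE(x, y) is a per-channel average of two packed 565
				// pixels, rounding halves up, computed without unpacking:
				//   (x & y) + (((x ^ y) >> 1) & 0x7BEF)  - per-field floor average; 0x7BEF
				//                                           clears the bits shifted in from
				//                                           the neighbouring field,
				//   + ((x ^ y) & 0x0821)                  - adds each field's low bit when
				//                                           the operands differ, rounding up.
				// For example, averaging the green fields 0x07E0 and 0x0020 yields 0x0400.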
5729				#define AVERAGE(x, y) (((x) & (y)) + ((((x) ^ (y)) >> 1) & 0x7BEF) + (((x) ^ (y)) & 0x0821))
5730
5731				if(internal.depth == 2)
5732				{
5733					for(int y = 0; y < height; y++)
5734					{
5735						for(int x = 0; x < width; x++)
5736						{
5737							unsigned short c0 = *(unsigned short*)(source0 + 2 * x);
5738							unsigned short c1 = *(unsigned short*)(source1 + 2 * x);
5739
5740							c0 = AVERAGE(c0, c1);
5741
5742							*(unsigned short*)(source0 + 2 * x) = c0;
5743						}
5744
5745						source0 += pitch;
5746						source1 += pitch;
5747					}
5748				}
5749				else if(internal.depth == 4)
5750				{
5751					for(int y = 0; y < height; y++)
5752					{
5753						for(int x = 0; x < width; x++)
5754						{
5755							unsigned short c0 = *(unsigned short*)(source0 + 2 * x);
5756							unsigned short c1 = *(unsigned short*)(source1 + 2 * x);
5757							unsigned short c2 = *(unsigned short*)(source2 + 2 * x);
5758							unsigned short c3 = *(unsigned short*)(source3 + 2 * x);
5759
5760							c0 = AVERAGE(c0, c1);
5761							c2 = AVERAGE(c2, c3);
5762							c0 = AVERAGE(c0, c2);
5763
5764							*(unsigned short*)(source0 + 2 * x) = c0;
5765						}
5766
5767						source0 += pitch;
5768						source1 += pitch;
5769						source2 += pitch;
5770						source3 += pitch;
5771					}
5772				}
5773				else if(internal.depth == 8)
5774				{
5775					for(int y = 0; y < height; y++)
5776					{
5777						for(int x = 0; x < width; x++)
5778						{
5779							unsigned short c0 = *(unsigned short*)(source0 + 2 * x);
5780							unsigned short c1 = *(unsigned short*)(source1 + 2 * x);
5781							unsigned short c2 = *(unsigned short*)(source2 + 2 * x);
5782							unsigned short c3 = *(unsigned short*)(source3 + 2 * x);
5783							unsigned short c4 = *(unsigned short*)(source4 + 2 * x);
5784							unsigned short c5 = *(unsigned short*)(source5 + 2 * x);
5785							unsigned short c6 = *(unsigned short*)(source6 + 2 * x);
5786							unsigned short c7 = *(unsigned short*)(source7 + 2 * x);
5787
5788							c0 = AVERAGE(c0, c1);
5789							c2 = AVERAGE(c2, c3);
5790							c4 = AVERAGE(c4, c5);
5791							c6 = AVERAGE(c6, c7);
5792							c0 = AVERAGE(c0, c2);
5793							c4 = AVERAGE(c4, c6);
5794							c0 = AVERAGE(c0, c4);
5795
5796							*(unsigned short*)(source0 + 2 * x) = c0;
5797						}
5798
5799						source0 += pitch;
5800						source1 += pitch;
5801						source2 += pitch;
5802						source3 += pitch;
5803						source4 += pitch;
5804						source5 += pitch;
5805						source6 += pitch;
5806						source7 += pitch;
5807					}
5808				}
5809				else if(internal.depth == 16)
5810				{
5811					for(int y = 0; y < height; y++)
5812					{
5813						for(int x = 0; x < width; x++)
5814						{
5815							unsigned short c0 = *(unsigned short*)(source0 + 2 * x);
5816							unsigned short c1 = *(unsigned short*)(source1 + 2 * x);
5817							unsigned short c2 = *(unsigned short*)(source2 + 2 * x);
5818							unsigned short c3 = *(unsigned short*)(source3 + 2 * x);
5819							unsigned short c4 = *(unsigned short*)(source4 + 2 * x);
5820							unsigned short c5 = *(unsigned short*)(source5 + 2 * x);
5821							unsigned short c6 = *(unsigned short*)(source6 + 2 * x);
5822							unsigned short c7 = *(unsigned short*)(source7 + 2 * x);
5823							unsigned short c8 = *(unsigned short*)(source8 + 2 * x);
5824							unsigned short c9 = *(unsigned short*)(source9 + 2 * x);
5825							unsigned short cA = *(unsigned short*)(sourceA + 2 * x);
5826							unsigned short cB = *(unsigned short*)(sourceB + 2 * x);
5827							unsigned short cC = *(unsigned short*)(sourceC + 2 * x);
5828							unsigned short cD = *(unsigned short*)(sourceD + 2 * x);
5829							unsigned short cE = *(unsigned short*)(sourceE + 2 * x);
5830							unsigned short cF = *(unsigned short*)(sourceF + 2 * x);
5831
5832							c0 = AVERAGE(c0, c1);
5833							c2 = AVERAGE(c2, c3);
5834							c4 = AVERAGE(c4, c5);
5835							c6 = AVERAGE(c6, c7);
5836							c8 = AVERAGE(c8, c9);
5837							cA = AVERAGE(cA, cB);
5838							cC = AVERAGE(cC, cD);
5839							cE = AVERAGE(cE, cF);
5840							c0 = AVERAGE(c0, c2);
5841							c4 = AVERAGE(c4, c6);
5842							c8 = AVERAGE(c8, cA);
5843							cC = AVERAGE(cC, cE);
5844							c0 = AVERAGE(c0, c4);
5845							c8 = AVERAGE(c8, cC);
5846							c0 = AVERAGE(c0, c8);
5847
5848							*(unsigned short*)(source0 + 2 * x) = c0;
5849						}
5850
5851						source0 += pitch;
5852						source1 += pitch;
5853						source2 += pitch;
5854						source3 += pitch;
5855						source4 += pitch;
5856						source5 += pitch;
5857						source6 += pitch;
5858						source7 += pitch;
5859						source8 += pitch;
5860						source9 += pitch;
5861						sourceA += pitch;
5862						sourceB += pitch;
5863						sourceC += pitch;
5864						sourceD += pitch;
5865						sourceE += pitch;
5866						sourceF += pitch;
5867					}
5868				}
5869				else ASSERT(false);
5870
5871				#undef AVERAGE
5872			}
5873		}
5874		else
5875		{
5876		//	UNIMPLEMENTED();
5877		}
5878	}
5879}
5880