Surface.cpp revision 43577b8cc676a157ceab055ead33a441c23b2cf5
1// SwiftShader Software Renderer
2//
3// Copyright(c) 2005-2013 TransGaming Inc.
4//
5// All rights reserved. No part of this software may be copied, distributed, transmitted,
6// transcribed, stored in a retrieval system, translated into any human or computer
7// language by any means, or disclosed to third parties without the explicit written
8// agreement of TransGaming Inc. Without such an agreement, no rights or licenses, express
9// or implied, including but not limited to any patent rights, are granted to you.
10//
11
12#include "Surface.hpp"
13
14#include "Color.hpp"
15#include "Context.hpp"
16#include "ETC_Decoder.hpp"
17#include "Renderer.hpp"
18#include "Common/Half.hpp"
19#include "Common/Memory.hpp"
20#include "Common/CPUID.hpp"
21#include "Common/Resource.hpp"
22#include "Common/Debug.hpp"
23#include "Reactor/Reactor.hpp"
24
25#include <xmmintrin.h>
26#include <emmintrin.h>
27
28#undef min
29#undef max
30
31namespace sw
32{
33	extern bool quadLayoutEnabled;
34	extern bool complementaryDepthBuffer;
35	extern TranscendentalPrecision logPrecision;
36
37	unsigned int *Surface::palette = 0;
38	unsigned int Surface::paletteID = 0;
39
40	void Rect::clip(int minX, int minY, int maxX, int maxY)
41	{
42		x0 = clamp(x0, minX, maxX);
43		y0 = clamp(y0, minY, maxY);
44		x1 = clamp(x1, minX, maxX);
45		y1 = clamp(y1, minY, maxY);
46	}
47
48	void Surface::Buffer::write(int x, int y, int z, const Color<float> &color)
49	{
50		void *element = (unsigned char*)buffer + x * bytes + y * pitchB + z * sliceB;
51
52		write(element, color);
53	}
54
55	void Surface::Buffer::write(int x, int y, const Color<float> &color)
56	{
57		void *element = (unsigned char*)buffer + x * bytes + y * pitchB;
58
59		write(element, color);
60	}
61
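	// Packs a normalized floating-point color into a single texel of 'format'.
	// unorm<n>() / snorm<n>() quantize a float in [0,1] / [-1,1] to an n-bit
	// unsigned / signed field, which is then shifted into position.
	// Example (FORMAT_R5G6B5): red = 1.0f -> unorm<5>() == 31 -> bits 15..11.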
62	inline void Surface::Buffer::write(void *element, const Color<float> &color)
63	{
64		switch(format)
65		{
66		case FORMAT_A8:
67			*(unsigned char*)element = unorm<8>(color.a);
68			break;
69		case FORMAT_R8:
70			*(unsigned char*)element = unorm<8>(color.r);
71			break;
72		case FORMAT_R3G3B2:
73			*(unsigned char*)element = (unorm<3>(color.r) << 5) | (unorm<3>(color.g) << 2) | (unorm<2>(color.b) << 0);
74			break;
75		case FORMAT_A8R3G3B2:
76			*(unsigned short*)element = (unorm<8>(color.a) << 8) | (unorm<3>(color.r) << 5) | (unorm<3>(color.g) << 2) | (unorm<2>(color.b) << 0);
77			break;
78		case FORMAT_X4R4G4B4:
79			*(unsigned short*)element = 0xF000 | (unorm<4>(color.r) << 8) | (unorm<4>(color.g) << 4) | (unorm<4>(color.b) << 0);
80			break;
81		case FORMAT_A4R4G4B4:
82			*(unsigned short*)element = (unorm<4>(color.a) << 12) | (unorm<4>(color.r) << 8) | (unorm<4>(color.g) << 4) | (unorm<4>(color.b) << 0);
83			break;
84		case FORMAT_R4G4B4A4:
85			*(unsigned short*)element = (unorm<4>(color.r) << 12) | (unorm<4>(color.g) << 8) | (unorm<4>(color.b) << 4) | (unorm<4>(color.a) << 0);
86			break;
87		case FORMAT_R5G6B5:
88			*(unsigned short*)element = (unorm<5>(color.r) << 11) | (unorm<6>(color.g) << 5) | (unorm<5>(color.b) << 0);
89			break;
90		case FORMAT_A1R5G5B5:
91			*(unsigned short*)element = (unorm<1>(color.a) << 15) | (unorm<5>(color.r) << 10) | (unorm<5>(color.g) << 5) | (unorm<5>(color.b) << 0);
92			break;
93		case FORMAT_R5G5B5A1:
94			*(unsigned short*)element = (unorm<5>(color.r) << 11) | (unorm<5>(color.g) << 6) | (unorm<5>(color.b) << 1) | (unorm<1>(color.a) << 0);
95			break;
96		case FORMAT_X1R5G5B5:
97			*(unsigned short*)element = 0x8000 | (unorm<5>(color.r) << 10) | (unorm<5>(color.g) << 5) | (unorm<5>(color.b) << 0);
98			break;
99		case FORMAT_A8R8G8B8:
100			*(unsigned int*)element = (unorm<8>(color.a) << 24) | (unorm<8>(color.r) << 16) | (unorm<8>(color.g) << 8) | (unorm<8>(color.b) << 0);
101			break;
102		case FORMAT_X8R8G8B8:
103			*(unsigned int*)element = 0xFF000000 | (unorm<8>(color.r) << 16) | (unorm<8>(color.g) << 8) | (unorm<8>(color.b) << 0);
104			break;
105		case FORMAT_A8B8G8R8:
106			*(unsigned int*)element = (unorm<8>(color.a) << 24) | (unorm<8>(color.b) << 16) | (unorm<8>(color.g) << 8) | (unorm<8>(color.r) << 0);
107			break;
108		case FORMAT_X8B8G8R8:
109			*(unsigned int*)element = 0xFF000000 | (unorm<8>(color.b) << 16) | (unorm<8>(color.g) << 8) | (unorm<8>(color.r) << 0);
110			break;
111		case FORMAT_A2R10G10B10:
112			*(unsigned int*)element = (unorm<2>(color.a) << 30) | (unorm<10>(color.r) << 20) | (unorm<10>(color.g) << 10) | (unorm<10>(color.b) << 0);
113			break;
114		case FORMAT_A2B10G10R10:
115			*(unsigned int*)element = (unorm<2>(color.a) << 30) | (unorm<10>(color.b) << 20) | (unorm<10>(color.g) << 10) | (unorm<10>(color.r) << 0);
116			break;
117		case FORMAT_G8R8:
118			*(unsigned short*)element = (unorm<8>(color.g) << 8) | (unorm<8>(color.r) << 0);
119			break;
120		case FORMAT_G16R16:
121			*(unsigned int*)element = (unorm<16>(color.g) << 16) | (unorm<16>(color.r) << 0);
122			break;
123		case FORMAT_A16B16G16R16:
124			((unsigned short*)element)[0] = unorm<16>(color.r);
125			((unsigned short*)element)[1] = unorm<16>(color.g);
126			((unsigned short*)element)[2] = unorm<16>(color.b);
127			((unsigned short*)element)[3] = unorm<16>(color.a);
128			break;
129		case FORMAT_V8U8:
130			*(unsigned short*)element = (snorm<8>(color.g) << 8) | (snorm<8>(color.r) << 0);
131			break;
132		case FORMAT_L6V5U5:
133			*(unsigned short*)element = (unorm<6>(color.b) << 10) | (snorm<5>(color.g) << 5) | (snorm<5>(color.r) << 0);
134			break;
135		case FORMAT_Q8W8V8U8:
136			*(unsigned int*)element = (snorm<8>(color.a) << 24) | (snorm<8>(color.b) << 16) | (snorm<8>(color.g) << 8) | (snorm<8>(color.r) << 0);
137			break;
138		case FORMAT_X8L8V8U8:
139			*(unsigned int*)element = 0xFF000000 | (unorm<8>(color.b) << 16) | (snorm<8>(color.g) << 8) | (snorm<8>(color.r) << 0);
140			break;
141		case FORMAT_V16U16:
142			*(unsigned int*)element = (snorm<16>(color.g) << 16) | (snorm<16>(color.r) << 0);
143			break;
144		case FORMAT_A2W10V10U10:
145			*(unsigned int*)element = (unorm<2>(color.a) << 30) | (snorm<10>(color.b) << 20) | (snorm<10>(color.g) << 10) | (snorm<10>(color.r) << 0);
146			break;
147		case FORMAT_A16W16V16U16:
148			((unsigned short*)element)[0] = snorm<16>(color.r);
149			((unsigned short*)element)[1] = snorm<16>(color.g);
150			((unsigned short*)element)[2] = snorm<16>(color.b);
151			((unsigned short*)element)[3] = unorm<16>(color.a);
152			break;
153		case FORMAT_Q16W16V16U16:
154			((unsigned short*)element)[0] = snorm<16>(color.r);
155			((unsigned short*)element)[1] = snorm<16>(color.g);
156			((unsigned short*)element)[2] = snorm<16>(color.b);
157			((unsigned short*)element)[3] = snorm<16>(color.a);
158			break;
159		case FORMAT_R8G8B8:
160			((unsigned char*)element)[0] = unorm<8>(color.b);
161			((unsigned char*)element)[1] = unorm<8>(color.g);
162			((unsigned char*)element)[2] = unorm<8>(color.r);
163			break;
164		case FORMAT_B8G8R8:
165			((unsigned char*)element)[0] = unorm<8>(color.r);
166			((unsigned char*)element)[1] = unorm<8>(color.g);
167			((unsigned char*)element)[2] = unorm<8>(color.b);
168			break;
169		case FORMAT_R16F:
170			*(half*)element = (half)color.r;
171			break;
172		case FORMAT_A16F:
173			*(half*)element = (half)color.a;
174			break;
175		case FORMAT_G16R16F:
176			((half*)element)[0] = (half)color.r;
177			((half*)element)[1] = (half)color.g;
178			break;
179		case FORMAT_B16G16R16F:
180			((half*)element)[0] = (half)color.r;
181			((half*)element)[1] = (half)color.g;
182			((half*)element)[2] = (half)color.b;
183			break;
184		case FORMAT_A16B16G16R16F:
185			((half*)element)[0] = (half)color.r;
186			((half*)element)[1] = (half)color.g;
187			((half*)element)[2] = (half)color.b;
188			((half*)element)[3] = (half)color.a;
189			break;
190		case FORMAT_A32F:
191			*(float*)element = color.a;
192			break;
193		case FORMAT_R32F:
194			*(float*)element = color.r;
195			break;
196		case FORMAT_G32R32F:
197			((float*)element)[0] = color.r;
198			((float*)element)[1] = color.g;
199			break;
200		case FORMAT_B32G32R32F:
201			((float*)element)[0] = color.r;
202			((float*)element)[1] = color.g;
203			((float*)element)[2] = color.b;
204			break;
205		case FORMAT_A32B32G32R32F:
206			((float*)element)[0] = color.r;
207			((float*)element)[1] = color.g;
208			((float*)element)[2] = color.b;
209			((float*)element)[3] = color.a;
210			break;
211		case FORMAT_D32F:
212		case FORMAT_D32F_LOCKABLE:
213		case FORMAT_D32FS8_TEXTURE:
214		case FORMAT_D32FS8_SHADOW:
215			*((float*)element) = color.r;
216			break;
217		case FORMAT_D32F_COMPLEMENTARY:
218			*((float*)element) = 1 - color.r;
219			break;
220		case FORMAT_S8:
221			*((unsigned char*)element) = unorm<8>(color.r);
222			break;
223		case FORMAT_L8:
224			*(unsigned char*)element = unorm<8>(color.r);
225			break;
226		case FORMAT_A4L4:
227			*(unsigned char*)element = (unorm<4>(color.a) << 4) | (unorm<4>(color.r) << 0);
228			break;
229		case FORMAT_L16:
230			*(unsigned short*)element = unorm<16>(color.r);
231			break;
232		case FORMAT_A8L8:
233			*(unsigned short*)element = (unorm<8>(color.a) << 8) | (unorm<8>(color.r) << 0);
234			break;
235		case FORMAT_L16F:
236			*(half*)element = (half)color.r;
237			break;
238		case FORMAT_A16L16F:
239			((half*)element)[0] = (half)color.r;
240			((half*)element)[1] = (half)color.a;
241			break;
242		case FORMAT_L32F:
243			*(float*)element = color.r;
244			break;
245		case FORMAT_A32L32F:
246			((float*)element)[0] = color.r;
247			((float*)element)[1] = color.a;
248			break;
249		default:
250			ASSERT(false);
251		}
252	}
253
254	Color<float> Surface::Buffer::read(int x, int y, int z) const
255	{
256		void *element = (unsigned char*)buffer + x * bytes + y * pitchB + z * sliceB;
257
258		return read(element);
259	}
260
261	Color<float> Surface::Buffer::read(int x, int y) const
262	{
263		void *element = (unsigned char*)buffer + x * bytes + y * pitchB;
264
265		return read(element);
266	}
267
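	// Unpacks a single texel of 'format' into a normalized floating-point color.
	// The recurring pattern (v & mask) * (1.0f / mask) rescales a bitfield to
	// [0,1] without shifting it down first. Signed (snorm) channels are shifted
	// into the top bits of an int so the sign extends, then scaled by the
	// positive maximum at that position (e.g. 1.0f / 0x7F000000 for 8-bit fields).
	// Channels absent from the format keep the defaults r = g = b = 0, a = 1.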
268	inline Color<float> Surface::Buffer::read(void *element) const
269	{
270		float r = 0.0f;
271		float g = 0.0f;
272		float b = 0.0f;
273		float a = 1.0f;
274
275		switch(format)
276		{
277		case FORMAT_P8:
278			{
279				ASSERT(palette);
280
281				unsigned int abgr = palette[*(unsigned char*)element];
282
283				r = (abgr & 0x000000FF) * (1.0f / 0x000000FF);
284				g = (abgr & 0x0000FF00) * (1.0f / 0x0000FF00);
285				b = (abgr & 0x00FF0000) * (1.0f / 0x00FF0000);
286				a = (abgr & 0xFF000000) * (1.0f / 0xFF000000);
287			}
288			break;
289		case FORMAT_A8P8:
290			{
291				ASSERT(palette);
292
293				unsigned int bgr = palette[((unsigned char*)element)[0]];
294
295				r = (bgr & 0x000000FF) * (1.0f / 0x000000FF);
296				g = (bgr & 0x0000FF00) * (1.0f / 0x0000FF00);
297				b = (bgr & 0x00FF0000) * (1.0f / 0x00FF0000);
298				a = ((unsigned char*)element)[1] * (1.0f / 0xFF);
299			}
300			break;
301		case FORMAT_A8:
302			r = 0;
303			g = 0;
304			b = 0;
305			a = *(unsigned char*)element * (1.0f / 0xFF);
306			break;
307		case FORMAT_R8:
308			r = *(unsigned char*)element * (1.0f / 0xFF);
309			break;
310		case FORMAT_R3G3B2:
311			{
312				unsigned char rgb = *(unsigned char*)element;
313
314				r = (rgb & 0xE0) * (1.0f / 0xE0);
315				g = (rgb & 0x1C) * (1.0f / 0x1C);
316				b = (rgb & 0x03) * (1.0f / 0x03);
317			}
318			break;
319		case FORMAT_A8R3G3B2:
320			{
321				unsigned short argb = *(unsigned short*)element;
322
323				a = (argb & 0xFF00) * (1.0f / 0xFF00);
324				r = (argb & 0x00E0) * (1.0f / 0x00E0);
325				g = (argb & 0x001C) * (1.0f / 0x001C);
326				b = (argb & 0x0003) * (1.0f / 0x0003);
327			}
328			break;
329		case FORMAT_X4R4G4B4:
330			{
331				unsigned short rgb = *(unsigned short*)element;
332
333				r = (rgb & 0x0F00) * (1.0f / 0x0F00);
334				g = (rgb & 0x00F0) * (1.0f / 0x00F0);
335				b = (rgb & 0x000F) * (1.0f / 0x000F);
336			}
337			break;
338		case FORMAT_A4R4G4B4:
339			{
340				unsigned short argb = *(unsigned short*)element;
341
342				a = (argb & 0xF000) * (1.0f / 0xF000);
343				r = (argb & 0x0F00) * (1.0f / 0x0F00);
344				g = (argb & 0x00F0) * (1.0f / 0x00F0);
345				b = (argb & 0x000F) * (1.0f / 0x000F);
346			}
347			break;
348		case FORMAT_R4G4B4A4:
349			{
350				unsigned short rgba = *(unsigned short*)element;
351
352				r = (rgba & 0xF000) * (1.0f / 0xF000);
353				g = (rgba & 0x0F00) * (1.0f / 0x0F00);
354				b = (rgba & 0x00F0) * (1.0f / 0x00F0);
355				a = (rgba & 0x000F) * (1.0f / 0x000F);
356			}
357			break;
358		case FORMAT_R5G6B5:
359			{
360				unsigned short rgb = *(unsigned short*)element;
361
362				r = (rgb & 0xF800) * (1.0f / 0xF800);
363				g = (rgb & 0x07E0) * (1.0f / 0x07E0);
364				b = (rgb & 0x001F) * (1.0f / 0x001F);
365			}
366			break;
367		case FORMAT_A1R5G5B5:
368			{
369				unsigned short argb = *(unsigned short*)element;
370
371				a = (argb & 0x8000) * (1.0f / 0x8000);
372				r = (argb & 0x7C00) * (1.0f / 0x7C00);
373				g = (argb & 0x03E0) * (1.0f / 0x03E0);
374				b = (argb & 0x001F) * (1.0f / 0x001F);
375			}
376			break;
377		case FORMAT_R5G5B5A1:
378			{
379				unsigned short rgba = *(unsigned short*)element;
380
381				r = (rgba & 0xF800) * (1.0f / 0xF800);
382				g = (rgba & 0x07C0) * (1.0f / 0x07C0);
383				b = (rgba & 0x003E) * (1.0f / 0x003E);
384				a = (rgba & 0x0001) * (1.0f / 0x0001);
385			}
386			break;
387		case FORMAT_X1R5G5B5:
388			{
389				unsigned short xrgb = *(unsigned short*)element;
390
391				r = (xrgb & 0x7C00) * (1.0f / 0x7C00);
392				g = (xrgb & 0x03E0) * (1.0f / 0x03E0);
393				b = (xrgb & 0x001F) * (1.0f / 0x001F);
394			}
395			break;
396		case FORMAT_A8R8G8B8:
397			{
398				unsigned int argb = *(unsigned int*)element;
399
400				a = (argb & 0xFF000000) * (1.0f / 0xFF000000);
401				r = (argb & 0x00FF0000) * (1.0f / 0x00FF0000);
402				g = (argb & 0x0000FF00) * (1.0f / 0x0000FF00);
403				b = (argb & 0x000000FF) * (1.0f / 0x000000FF);
404			}
405			break;
406		case FORMAT_X8R8G8B8:
407			{
408				unsigned int xrgb = *(unsigned int*)element;
409
410				r = (xrgb & 0x00FF0000) * (1.0f / 0x00FF0000);
411				g = (xrgb & 0x0000FF00) * (1.0f / 0x0000FF00);
412				b = (xrgb & 0x000000FF) * (1.0f / 0x000000FF);
413			}
414			break;
415		case FORMAT_A8B8G8R8:
416			{
417				unsigned int abgr = *(unsigned int*)element;
418
419				a = (abgr & 0xFF000000) * (1.0f / 0xFF000000);
420				b = (abgr & 0x00FF0000) * (1.0f / 0x00FF0000);
421				g = (abgr & 0x0000FF00) * (1.0f / 0x0000FF00);
422				r = (abgr & 0x000000FF) * (1.0f / 0x000000FF);
423			}
424			break;
425		case FORMAT_X8B8G8R8:
426			{
427				unsigned int xbgr = *(unsigned int*)element;
428
429				b = (xbgr & 0x00FF0000) * (1.0f / 0x00FF0000);
430				g = (xbgr & 0x0000FF00) * (1.0f / 0x0000FF00);
431				r = (xbgr & 0x000000FF) * (1.0f / 0x000000FF);
432			}
433			break;
434		case FORMAT_G8R8:
435			{
436				unsigned short gr = *(unsigned short*)element;
437
438				g = (gr & 0xFF00) * (1.0f / 0xFF00);
439				r = (gr & 0x00FF) * (1.0f / 0x00FF);
440			}
441			break;
442		case FORMAT_G16R16:
443			{
444				unsigned int gr = *(unsigned int*)element;
445
446				g = (gr & 0xFFFF0000) * (1.0f / 0xFFFF0000);
447				r = (gr & 0x0000FFFF) * (1.0f / 0x0000FFFF);
448			}
449			break;
450		case FORMAT_A2R10G10B10:
451			{
452				unsigned int argb = *(unsigned int*)element;
453
454				a = (argb & 0xC0000000) * (1.0f / 0xC0000000);
455				r = (argb & 0x3FF00000) * (1.0f / 0x3FF00000);
456				g = (argb & 0x000FFC00) * (1.0f / 0x000FFC00);
457				b = (argb & 0x000003FF) * (1.0f / 0x000003FF);
458			}
459			break;
460		case FORMAT_A2B10G10R10:
461			{
462				unsigned int abgr = *(unsigned int*)element;
463
464				a = (abgr & 0xC0000000) * (1.0f / 0xC0000000);
465				b = (abgr & 0x3FF00000) * (1.0f / 0x3FF00000);
466				g = (abgr & 0x000FFC00) * (1.0f / 0x000FFC00);
467				r = (abgr & 0x000003FF) * (1.0f / 0x000003FF);
468			}
469			break;
470		case FORMAT_A16B16G16R16:
471			r = ((unsigned short*)element)[0] * (1.0f / 0xFFFF);
472			g = ((unsigned short*)element)[1] * (1.0f / 0xFFFF);
473			b = ((unsigned short*)element)[2] * (1.0f / 0xFFFF);
474			a = ((unsigned short*)element)[3] * (1.0f / 0xFFFF);
475			break;
476		case FORMAT_V8U8:
477			{
478				unsigned short vu = *(unsigned short*)element;
479
480				r = ((int)(vu & 0x00FF) << 24) * (1.0f / 0x7F000000);
481				g = ((int)(vu & 0xFF00) << 16) * (1.0f / 0x7F000000);
482			}
483			break;
484		case FORMAT_L6V5U5:
485			{
486				unsigned short lvu = *(unsigned short*)element;
487
488				r = ((int)(lvu & 0x001F) << 27) * (1.0f / 0x78000000);
489				g = ((int)(lvu & 0x03E0) << 22) * (1.0f / 0x78000000);
490				b = (lvu & 0xFC00) * (1.0f / 0xFC00);
491			}
492			break;
493		case FORMAT_Q8W8V8U8:
494			{
495				unsigned int qwvu = *(unsigned int*)element;
496
497				r = ((int)(qwvu & 0x000000FF) << 24) * (1.0f / 0x7F000000);
498				g = ((int)(qwvu & 0x0000FF00) << 16) * (1.0f / 0x7F000000);
499				b = ((int)(qwvu & 0x00FF0000) << 8)  * (1.0f / 0x7F000000);
500				a = ((int)(qwvu & 0xFF000000) << 0)  * (1.0f / 0x7F000000);
501			}
502			break;
503		case FORMAT_X8L8V8U8:
504			{
505				unsigned int xlvu = *(unsigned int*)element;
506
507				r = ((int)(xlvu & 0x000000FF) << 24) * (1.0f / 0x7F000000);
508				g = ((int)(xlvu & 0x0000FF00) << 16) * (1.0f / 0x7F000000);
509				b = (xlvu & 0x00FF0000) * (1.0f / 0x00FF0000);
510			}
511			break;
512		case FORMAT_R8G8B8:
513			r = ((unsigned char*)element)[2] * (1.0f / 0xFF);
514			g = ((unsigned char*)element)[1] * (1.0f / 0xFF);
515			b = ((unsigned char*)element)[0] * (1.0f / 0xFF);
516			break;
517		case FORMAT_B8G8R8:
518			r = ((unsigned char*)element)[0] * (1.0f / 0xFF);
519			g = ((unsigned char*)element)[1] * (1.0f / 0xFF);
520			b = ((unsigned char*)element)[2] * (1.0f / 0xFF);
521			break;
522		case FORMAT_V16U16:
523			{
524				unsigned int vu = *(unsigned int*)element;
525
526				r = ((int)(vu & 0x0000FFFF) << 16) * (1.0f / 0x7FFF0000);
527				g = ((int)(vu & 0xFFFF0000) << 0)  * (1.0f / 0x7FFF0000);
528			}
529			break;
530		case FORMAT_A2W10V10U10:
531			{
532				unsigned int awvu = *(unsigned int*)element;
533
534				r = ((int)(awvu & 0x000003FF) << 22) * (1.0f / 0x7FC00000);
535				g = ((int)(awvu & 0x000FFC00) << 12) * (1.0f / 0x7FC00000);
536				b = ((int)(awvu & 0x3FF00000) << 2)  * (1.0f / 0x7FC00000);
537				a = (awvu & 0xC0000000) * (1.0f / 0xC0000000);
538			}
539			break;
540		case FORMAT_A16W16V16U16:
541			r = ((signed short*)element)[0] * (1.0f / 0x7FFF);
542			g = ((signed short*)element)[1] * (1.0f / 0x7FFF);
543			b = ((signed short*)element)[2] * (1.0f / 0x7FFF);
544			a = ((unsigned short*)element)[3] * (1.0f / 0xFFFF);
545			break;
546		case FORMAT_Q16W16V16U16:
547			r = ((signed short*)element)[0] * (1.0f / 0x7FFF);
548			g = ((signed short*)element)[1] * (1.0f / 0x7FFF);
549			b = ((signed short*)element)[2] * (1.0f / 0x7FFF);
550			a = ((signed short*)element)[3] * (1.0f / 0x7FFF);
551			break;
552		case FORMAT_L8:
553			r =
554			g =
555			b = *(unsigned char*)element * (1.0f / 0xFF);
556			break;
557		case FORMAT_A4L4:
558			{
559				unsigned char al = *(unsigned char*)element;
560
561				r =
562				g =
563				b = (al & 0x0F) * (1.0f / 0x0F);
564				a = (al & 0xF0) * (1.0f / 0xF0);
565			}
566			break;
567		case FORMAT_L16:
568			r =
569			g =
570			b = *(unsigned short*)element * (1.0f / 0xFFFF);
571			break;
572		case FORMAT_A8L8:
573			r =
574			g =
575			b = ((unsigned char*)element)[0] * (1.0f / 0xFF);
576			a = ((unsigned char*)element)[1] * (1.0f / 0xFF);
577			break;
578		case FORMAT_L16F:
579			r =
580			g =
581			b = *(half*)element;
582			break;
583		case FORMAT_A16L16F:
584			r =
585			g =
586			b = ((half*)element)[0];
587			a = ((half*)element)[1];
588			break;
589		case FORMAT_L32F:
590			r =
591			g =
592			b = *(float*)element;
593			break;
594		case FORMAT_A32L32F:
595			r =
596			g =
597			b = ((float*)element)[0];
598			a = ((float*)element)[1];
599			break;
600		case FORMAT_A16F:
601			a = *(half*)element;
602			break;
603		case FORMAT_R16F:
604			r = *(half*)element;
605			break;
606		case FORMAT_G16R16F:
607			r = ((half*)element)[0];
608			g = ((half*)element)[1];
609			break;
610		case FORMAT_B16G16R16F:
611			r = ((half*)element)[0];
612			g = ((half*)element)[1];
613			b = ((half*)element)[2];
614			break;
615		case FORMAT_A16B16G16R16F:
616			r = ((half*)element)[0];
617			g = ((half*)element)[1];
618			b = ((half*)element)[2];
619			a = ((half*)element)[3];
620			break;
621		case FORMAT_A32F:
622			a = *(float*)element;
623			break;
624		case FORMAT_R32F:
625			r = *(float*)element;
626			break;
627		case FORMAT_G32R32F:
628			r = ((float*)element)[0];
629			g = ((float*)element)[1];
630			break;
631		case FORMAT_B32G32R32F:
632			r = ((float*)element)[0];
633			g = ((float*)element)[1];
634			b = ((float*)element)[2];
635			break;
636		case FORMAT_A32B32G32R32F:
637			r = ((float*)element)[0];
638			g = ((float*)element)[1];
639			b = ((float*)element)[2];
640			a = ((float*)element)[3];
641			break;
642		case FORMAT_D32F:
643		case FORMAT_D32F_LOCKABLE:
644		case FORMAT_D32FS8_TEXTURE:
645		case FORMAT_D32FS8_SHADOW:
646			r = *(float*)element;
647			g = r;
648			b = r;
649			a = r;
650			break;
651		case FORMAT_D32F_COMPLEMENTARY:
652			r = 1.0f - *(float*)element;
653			g = r;
654			b = r;
655			a = r;
656			break;
657		case FORMAT_S8:
658			r = *(unsigned char*)element * (1.0f / 0xFF);
659			break;
660		default:
661			ASSERT(false);
662		}
663
664	//	if(sRGB)
665	//	{
666	//		r = sRGBtoLinear(r);
667	//		g = sRGBtoLinear(g);
668	//		b = sRGBtoLinear(b);
669	//	}
670
671		return Color<float>(r, g, b, a);
672	}
673
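	// Trilinear sampling with clamp-to-edge addressing. Coordinates are in texel
	// units; the 0.5 offset moves them from texel centers onto the integer
	// lattice before the eight neighbors are read and blended with the weights
	// (1 - fx)(1 - fy)(1 - fz) ... fx * fy * fz. The 2D overload below is the
	// bilinear equivalent.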
674	Color<float> Surface::Buffer::sample(float x, float y, float z) const
675	{
676		x -= 0.5f;
677		y -= 0.5f;
678		z -= 0.5f;
679
680		int x0 = clamp((int)x, 0, width - 1);
681		int x1 = (x0 + 1 >= width) ? x0 : x0 + 1;
682
683		int y0 = clamp((int)y, 0, height - 1);
684		int y1 = (y0 + 1 >= height) ? y0 : y0 + 1;
685
686		int z0 = clamp((int)z, 0, depth - 1);
687		int z1 = (z0 + 1 >= depth) ? z0 : z0 + 1;
688
689		Color<float> c000 = read(x0, y0, z0);
690		Color<float> c100 = read(x1, y0, z0);
691		Color<float> c010 = read(x0, y1, z0);
692		Color<float> c110 = read(x1, y1, z0);
693		Color<float> c001 = read(x0, y0, z1);
694		Color<float> c101 = read(x1, y0, z1);
695		Color<float> c011 = read(x0, y1, z1);
696		Color<float> c111 = read(x1, y1, z1);
697
698		float fx = x - x0;
699		float fy = y - y0;
700		float fz = z - z0;
701
702		c000 *= (1 - fx) * (1 - fy) * (1 - fz);
703		c100 *= fx * (1 - fy) * (1 - fz);
704		c010 *= (1 - fx) * fy * (1 - fz);
705		c110 *= fx * fy * (1 - fz);
706		c001 *= (1 - fx) * (1 - fy) * fz;
707		c101 *= fx * (1 - fy) * fz;
708		c011 *= (1 - fx) * fy * fz;
709		c111 *= fx * fy * fz;
710
711		return c000 + c100 + c010 + c110 + c001 + c101 + c011 + c111;
712	}
713
714	Color<float> Surface::Buffer::sample(float x, float y) const
715	{
716		x -= 0.5f;
717		y -= 0.5f;
718
719		int x0 = clamp((int)x, 0, width - 1);
720		int x1 = (x0 + 1 >= width) ? x0 : x0 + 1;
721
722		int y0 = clamp((int)y, 0, height - 1);
723		int y1 = (y0 + 1 >= height) ? y0 : y0 + 1;
724
725		Color<float> c00 = read(x0, y0);
726		Color<float> c10 = read(x1, y0);
727		Color<float> c01 = read(x0, y1);
728		Color<float> c11 = read(x1, y1);
729
730		float fx = x - x0;
731		float fy = y - y0;
732
733		c00 *= (1 - fx) * (1 - fy);
734		c10 *= fx * (1 - fy);
735		c01 *= (1 - fx) * fy;
736		c11 *= fx * fy;
737
738		return c00 + c10 + c01 + c11;
739	}
740
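	// Returns a pointer to the texel containing (x, y, z) and records the lock
	// mode. For block-compressed formats the pointer addresses the block instead,
	// at offset blockBytes * (x / blockWidth) + (y / blockHeight) * pitchB, since
	// pitchB already spans a full row of blocks.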
741	void *Surface::Buffer::lockRect(int x, int y, int z, Lock lock)
742	{
743		this->lock = lock;
744
745		switch(lock)
746		{
747		case LOCK_UNLOCKED:
748		case LOCK_READONLY:
749			break;
750		case LOCK_WRITEONLY:
751		case LOCK_READWRITE:
752		case LOCK_DISCARD:
753			dirty = true;
754			break;
755		default:
756			ASSERT(false);
757		}
758
759		if(buffer)
760		{
761			switch(format)
762			{
763			#if S3TC_SUPPORT
764			case FORMAT_DXT1:
765			#endif
766			case FORMAT_ATI1:
767			case FORMAT_ETC1:
768			case FORMAT_R11_EAC:
769			case FORMAT_SIGNED_R11_EAC:
770			case FORMAT_RGB8_ETC2:
771			case FORMAT_SRGB8_ETC2:
772			case FORMAT_RGB8_PUNCHTHROUGH_ALPHA1_ETC2:
773			case FORMAT_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2:
774				return (unsigned char*)buffer + 8 * (x / 4) + (y / 4) * pitchB + z * sliceB;
775			case FORMAT_RG11_EAC:
776			case FORMAT_SIGNED_RG11_EAC:
777			case FORMAT_RGBA8_ETC2_EAC:
778			case FORMAT_SRGB8_ALPHA8_ETC2_EAC:
779			case FORMAT_RGBA_ASTC_4x4_KHR:
780			case FORMAT_SRGB8_ALPHA8_ASTC_4x4_KHR:
781				return (unsigned char*)buffer + 16 * (x / 4) + (y / 4) * pitchB + z * sliceB;
782			case FORMAT_RGBA_ASTC_5x4_KHR:
783			case FORMAT_SRGB8_ALPHA8_ASTC_5x4_KHR:
784				return (unsigned char*)buffer + 16 * (x / 5) + (y / 4) * pitchB + z * sliceB;
785			case FORMAT_RGBA_ASTC_5x5_KHR:
786			case FORMAT_SRGB8_ALPHA8_ASTC_5x5_KHR:
787				return (unsigned char*)buffer + 16 * (x / 5) + (y / 5) * pitchB + z * sliceB;
788			case FORMAT_RGBA_ASTC_6x5_KHR:
789			case FORMAT_SRGB8_ALPHA8_ASTC_6x5_KHR:
790				return (unsigned char*)buffer + 16 * (x / 6) + (y / 5) * pitchB + z * sliceB;
791			case FORMAT_RGBA_ASTC_6x6_KHR:
792			case FORMAT_SRGB8_ALPHA8_ASTC_6x6_KHR:
793				return (unsigned char*)buffer + 16 * (x / 6) + (y / 6) * pitchB + z * sliceB;
794			case FORMAT_RGBA_ASTC_8x5_KHR:
795			case FORMAT_SRGB8_ALPHA8_ASTC_8x5_KHR:
796				return (unsigned char*)buffer + 16 * (x / 8) + (y / 5) * pitchB + z * sliceB;
797			case FORMAT_RGBA_ASTC_8x6_KHR:
798			case FORMAT_SRGB8_ALPHA8_ASTC_8x6_KHR:
799				return (unsigned char*)buffer + 16 * (x / 8) + (y / 6) * pitchB + z * sliceB;
800			case FORMAT_RGBA_ASTC_8x8_KHR:
801			case FORMAT_SRGB8_ALPHA8_ASTC_8x8_KHR:
802				return (unsigned char*)buffer + 16 * (x / 8) + (y / 8) * pitchB + z * sliceB;
803			case FORMAT_RGBA_ASTC_10x5_KHR:
804			case FORMAT_SRGB8_ALPHA8_ASTC_10x5_KHR:
805				return (unsigned char*)buffer + 16 * (x / 10) + (y / 5) * pitchB + z * sliceB;
806			case FORMAT_RGBA_ASTC_10x6_KHR:
807			case FORMAT_SRGB8_ALPHA8_ASTC_10x6_KHR:
808				return (unsigned char*)buffer + 16 * (x / 10) + (y / 6) * pitchB + z * sliceB;
809			case FORMAT_RGBA_ASTC_10x8_KHR:
810			case FORMAT_SRGB8_ALPHA8_ASTC_10x8_KHR:
811				return (unsigned char*)buffer + 16 * (x / 10) + (y / 8) * pitchB + z * sliceB;
812			case FORMAT_RGBA_ASTC_10x10_KHR:
813			case FORMAT_SRGB8_ALPHA8_ASTC_10x10_KHR:
814				return (unsigned char*)buffer + 16 * (x / 10) + (y / 10) * pitchB + z * sliceB;
815			case FORMAT_RGBA_ASTC_12x10_KHR:
816			case FORMAT_SRGB8_ALPHA8_ASTC_12x10_KHR:
817				return (unsigned char*)buffer + 16 * (x / 12) + (y / 10) * pitchB + z * sliceB;
818			case FORMAT_RGBA_ASTC_12x12_KHR:
819			case FORMAT_SRGB8_ALPHA8_ASTC_12x12_KHR:
820				return (unsigned char*)buffer + 16 * (x / 12) + (y / 12) * pitchB + z * sliceB;
821			#if S3TC_SUPPORT
822			case FORMAT_DXT3:
823			case FORMAT_DXT5:
824			#endif
825			case FORMAT_ATI2:
826				return (unsigned char*)buffer + 16 * (x / 4) + (y / 4) * pitchB + z * sliceB;
827			default:
828				return (unsigned char*)buffer + x * bytes + y * pitchB + z * sliceB;
829			}
830		}
831
832		return 0;
833	}
834
835	void Surface::Buffer::unlockRect()
836	{
837		lock = LOCK_UNLOCKED;
838	}
839
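	// Constructs a surface around caller-provided pixel data. A Surface keeps up
	// to three buffers: 'external' (the client-visible format, here backed by
	// 'pixels'), 'internal' (the format the renderer works in, chosen by
	// selectInternalFormat), and 'stencil' (a separate S8 plane, allocated
	// lazily in lockStencil). The external buffer starts out dirty so its
	// contents are converted into the internal buffer on first internal lock.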
840	Surface::Surface(int width, int height, int depth, Format format, void *pixels, int pitch, int slice) : lockable(true), renderTarget(false)
841	{
842		resource = new Resource(0);
843		hasParent = false;
844		ownExternal = false;
845		depth = max(1, depth);
846
847		external.buffer = pixels;
848		external.width = width;
849		external.height = height;
850		external.depth = depth;
851		external.format = format;
852		external.bytes = bytes(external.format);
853		external.pitchB = pitch;
854		external.pitchP = pitch / external.bytes;
855		external.sliceB = slice;
856		external.sliceP = slice / external.bytes;
857		external.lock = LOCK_UNLOCKED;
858		external.dirty = true;
859
860		internal.buffer = 0;
861		internal.width = width;
862		internal.height = height;
863		internal.depth = depth;
864		internal.format = selectInternalFormat(format);
865		internal.bytes = bytes(internal.format);
866		internal.pitchB = pitchB(internal.width, internal.format, false);
867		internal.pitchP = pitchP(internal.width, internal.format, false);
868		internal.sliceB = sliceB(internal.width, internal.height, internal.format, false);
869		internal.sliceP = sliceP(internal.width, internal.height, internal.format, false);
870		internal.lock = LOCK_UNLOCKED;
871		internal.dirty = false;
872
873		stencil.buffer = 0;
874		stencil.width = width;
875		stencil.height = height;
876		stencil.depth = depth;
877		stencil.format = FORMAT_S8;
878		stencil.bytes = bytes(stencil.format);
879		stencil.pitchB = pitchB(stencil.width, stencil.format, false);
880		stencil.pitchP = pitchP(stencil.width, stencil.format, false);
881		stencil.sliceB = sliceB(stencil.width, stencil.height, stencil.format, false);
882		stencil.sliceP = sliceP(stencil.width, stencil.height, stencil.format, false);
883		stencil.lock = LOCK_UNLOCKED;
884		stencil.dirty = false;
885
886		dirtyMipmaps = true;
887		paletteUsed = 0;
888	}
889
890	Surface::Surface(Resource *texture, int width, int height, int depth, Format format, bool lockable, bool renderTarget) : lockable(lockable), renderTarget(renderTarget)
891	{
892		resource = texture ? texture : new Resource(0);
893		hasParent = texture != 0;
894		ownExternal = true;
895		depth = max(1, depth);
896
897		external.buffer = 0;
898		external.width = width;
899		external.height = height;
900		external.depth = depth;
901		external.format = format;
902		external.bytes = bytes(external.format);
903		external.pitchB = pitchB(external.width, external.format, renderTarget && !texture);
904		external.pitchP = pitchP(external.width, external.format, renderTarget && !texture);
905		external.sliceB = sliceB(external.width, external.height, external.format, renderTarget && !texture);
906		external.sliceP = sliceP(external.width, external.height, external.format, renderTarget && !texture);
907		external.lock = LOCK_UNLOCKED;
908		external.dirty = false;
909
910		internal.buffer = 0;
911		internal.width = width;
912		internal.height = height;
913		internal.depth = depth;
914		internal.format = selectInternalFormat(format);
915		internal.bytes = bytes(internal.format);
916		internal.pitchB = pitchB(internal.width, internal.format, renderTarget);
917		internal.pitchP = pitchP(internal.width, internal.format, renderTarget);
918		internal.sliceB = sliceB(internal.width, internal.height, internal.format, renderTarget);
919		internal.sliceP = sliceP(internal.width, internal.height, internal.format, renderTarget);
920		internal.lock = LOCK_UNLOCKED;
921		internal.dirty = false;
922
923		stencil.buffer = 0;
924		stencil.width = width;
925		stencil.height = height;
926		stencil.depth = depth;
927		stencil.format = FORMAT_S8;
928		stencil.bytes = bytes(stencil.format);
929		stencil.pitchB = pitchB(stencil.width, stencil.format, renderTarget);
930		stencil.pitchP = pitchP(stencil.width, stencil.format, renderTarget);
931		stencil.sliceB = sliceB(stencil.width, stencil.height, stencil.format, renderTarget);
932		stencil.sliceP = sliceP(stencil.width, stencil.height, stencil.format, renderTarget);
933		stencil.lock = LOCK_UNLOCKED;
934		stencil.dirty = false;
935
936		dirtyMipmaps = true;
937		paletteUsed = 0;
938	}
939
940	Surface::~Surface()
941	{
942		// Synchronize so we can deallocate the buffers below
943		resource->lock(DESTRUCT);
944		resource->unlock();
945
946		if(!hasParent)
947		{
948			resource->destruct();
949		}
950
951		if(ownExternal)
952		{
953			deallocate(external.buffer);
954		}
955
956		if(internal.buffer != external.buffer)
957		{
958			deallocate(internal.buffer);
959		}
960
961		deallocate(stencil.buffer);
962
963		external.buffer = 0;
964		internal.buffer = 0;
965		stencil.buffer = 0;
966	}
967
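	// The lock routines keep the external and internal copies coherent: if the
	// other side is dirty it is converted into the side being locked via
	// update(), unless the caller passed LOCK_DISCARD. Any write lock also sets
	// dirtyMipmaps so dependent mipmap levels are regenerated later.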
968	void *Surface::lockExternal(int x, int y, int z, Lock lock, Accessor client)
969	{
970		resource->lock(client);
971
972		if(!external.buffer)
973		{
974			if(internal.buffer && identicalFormats())
975			{
976				external.buffer = internal.buffer;
977			}
978			else
979			{
980				external.buffer = allocateBuffer(external.width, external.height, external.depth, external.format);
981			}
982		}
983
984		if(internal.dirty)
985		{
986			if(lock != LOCK_DISCARD)
987			{
988				update(external, internal);
989			}
990
991			internal.dirty = false;
992		}
993
994		switch(lock)
995		{
996		case LOCK_READONLY:
997			break;
998		case LOCK_WRITEONLY:
999		case LOCK_READWRITE:
1000		case LOCK_DISCARD:
1001			dirtyMipmaps = true;
1002			break;
1003		default:
1004			ASSERT(false);
1005		}
1006
1007		return external.lockRect(x, y, z, lock);
1008	}
1009
1010	void Surface::unlockExternal()
1011	{
1012		resource->unlock();
1013
1014		external.unlockRect();
1015	}
1016
1017	void *Surface::lockInternal(int x, int y, int z, Lock lock, Accessor client)
1018	{
1019		if(lock != LOCK_UNLOCKED)
1020		{
1021			resource->lock(client);
1022		}
1023
1024		if(!internal.buffer)
1025		{
1026			if(external.buffer && identicalFormats())
1027			{
1028				internal.buffer = external.buffer;
1029			}
1030			else
1031			{
1032				internal.buffer = allocateBuffer(internal.width, internal.height, internal.depth, internal.format);
1033			}
1034		}
1035
1036		// FIXME: WHQL requires conversion to lower external precision and back
1037		if(logPrecision >= WHQL)
1038		{
1039			if(internal.dirty && renderTarget && internal.format != external.format)
1040			{
1041				if(lock != LOCK_DISCARD)
1042				{
1043					switch(external.format)
1044					{
1045					case FORMAT_R3G3B2:
1046					case FORMAT_A8R3G3B2:
1047					case FORMAT_A1R5G5B5:
1048					case FORMAT_A2R10G10B10:
1049					case FORMAT_A2B10G10R10:
1050						lockExternal(0, 0, 0, LOCK_READWRITE, client);
1051						unlockExternal();
1052						break;
1053					default:
1054						// Difference passes WHQL
1055						break;
1056					}
1057				}
1058			}
1059		}
1060
1061		if(external.dirty || (isPalette(external.format) && paletteUsed != Surface::paletteID))
1062		{
1063			if(lock != LOCK_DISCARD)
1064			{
1065				update(internal, external);
1066			}
1067
1068			external.dirty = false;
1069			paletteUsed = Surface::paletteID;
1070		}
1071
1072		switch(lock)
1073		{
1074		case LOCK_UNLOCKED:
1075		case LOCK_READONLY:
1076			break;
1077		case LOCK_WRITEONLY:
1078		case LOCK_READWRITE:
1079		case LOCK_DISCARD:
1080			dirtyMipmaps = true;
1081			break;
1082		default:
1083			ASSERT(false);
1084		}
1085
1086		if(lock == LOCK_READONLY && client == PUBLIC)
1087		{
1088			resolve();
1089		}
1090
1091		return internal.lockRect(x, y, z, lock);
1092	}
1093
1094	void Surface::unlockInternal()
1095	{
1096		resource->unlock();
1097
1098		internal.unlockRect();
1099	}
1100
1101	void *Surface::lockStencil(int front, Accessor client)
1102	{
1103		resource->lock(client);
1104
1105		if(!stencil.buffer)
1106		{
1107			stencil.buffer = allocateBuffer(stencil.width, stencil.height, stencil.depth, stencil.format);
1108		}
1109
1110		return stencil.lockRect(0, 0, front, LOCK_READWRITE);   // FIXME
1111	}
1112
1113	void Surface::unlockStencil()
1114	{
1115		resource->unlock();
1116
1117		stencil.unlockRect();
1118	}
1119
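	// Bytes per texel. For the 4x4 block-compressed formats this is the byte
	// count of one block column (block size / 4), i.e. the "column of four
	// pixels" noted below; the ASTC formats are not yet sized (see the FIXME).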
1120	int Surface::bytes(Format format)
1121	{
1122		switch(format)
1123		{
1124		case FORMAT_NULL:				return 0;
1125		case FORMAT_P8:					return 1;
1126		case FORMAT_A8P8:				return 2;
1127		case FORMAT_A8:					return 1;
1128		case FORMAT_R8:					return 1;
1129		case FORMAT_R3G3B2:				return 1;
1130		case FORMAT_A8R3G3B2:			return 2;
1131		case FORMAT_R5G6B5:				return 2;
1132		case FORMAT_A1R5G5B5:			return 2;
1133		case FORMAT_X1R5G5B5:			return 2;
1134		case FORMAT_R5G5B5A1:           return 2;
1135		case FORMAT_X4R4G4B4:			return 2;
1136		case FORMAT_A4R4G4B4:			return 2;
1137		case FORMAT_R4G4B4A4:           return 2;
1138		case FORMAT_R8G8B8:				return 3;
1139		case FORMAT_B8G8R8:             return 3;
1140		case FORMAT_X8R8G8B8:			return 4;
1141	//	case FORMAT_X8G8R8B8Q:			return 4;
1142		case FORMAT_A8R8G8B8:			return 4;
1143	//	case FORMAT_A8G8R8B8Q:			return 4;
1144		case FORMAT_X8B8G8R8:			return 4;
1145		case FORMAT_A8B8G8R8:			return 4;
1146		case FORMAT_A2R10G10B10:		return 4;
1147		case FORMAT_A2B10G10R10:		return 4;
1148		case FORMAT_G8R8:				return 2;
1149		case FORMAT_G16R16:				return 4;
1150		case FORMAT_A16B16G16R16:		return 8;
1151		// Compressed formats
1152		#if S3TC_SUPPORT
1153		case FORMAT_DXT1:				return 2;   // Column of four pixels
1154		case FORMAT_DXT3:				return 4;   // Column of four pixels
1155		case FORMAT_DXT5:				return 4;   // Column of four pixels
1156		#endif
1157		case FORMAT_ATI1:				return 2;   // Column of four pixels
1158		case FORMAT_ATI2:				return 4;   // Column of four pixels
1159		case FORMAT_ETC1:				return 2;   // Column of four pixels
1160		case FORMAT_R11_EAC:			return 2;
1161		case FORMAT_SIGNED_R11_EAC:		return 2;
1162		case FORMAT_RG11_EAC:			return 4;
1163		case FORMAT_SIGNED_RG11_EAC:	return 4;
1164		case FORMAT_RGB8_ETC2:			return 2;
1165		case FORMAT_SRGB8_ETC2:			return 2;
1166		case FORMAT_RGB8_PUNCHTHROUGH_ALPHA1_ETC2:	return 2;
1167		case FORMAT_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2:	return 2;
1168		case FORMAT_RGBA8_ETC2_EAC:			return 4;
1169		case FORMAT_SRGB8_ALPHA8_ETC2_EAC:	return 4;
1170		case FORMAT_RGBA_ASTC_4x4_KHR:
1171		case FORMAT_RGBA_ASTC_5x4_KHR:
1172		case FORMAT_RGBA_ASTC_5x5_KHR:
1173		case FORMAT_RGBA_ASTC_6x5_KHR:
1174		case FORMAT_RGBA_ASTC_6x6_KHR:
1175		case FORMAT_RGBA_ASTC_8x5_KHR:
1176		case FORMAT_RGBA_ASTC_8x6_KHR:
1177		case FORMAT_RGBA_ASTC_8x8_KHR:
1178		case FORMAT_RGBA_ASTC_10x5_KHR:
1179		case FORMAT_RGBA_ASTC_10x6_KHR:
1180		case FORMAT_RGBA_ASTC_10x8_KHR:
1181		case FORMAT_RGBA_ASTC_10x10_KHR:
1182		case FORMAT_RGBA_ASTC_12x10_KHR:
1183		case FORMAT_RGBA_ASTC_12x12_KHR:
1184		case FORMAT_SRGB8_ALPHA8_ASTC_4x4_KHR:
1185		case FORMAT_SRGB8_ALPHA8_ASTC_5x4_KHR:
1186		case FORMAT_SRGB8_ALPHA8_ASTC_5x5_KHR:
1187		case FORMAT_SRGB8_ALPHA8_ASTC_6x5_KHR:
1188		case FORMAT_SRGB8_ALPHA8_ASTC_6x6_KHR:
1189		case FORMAT_SRGB8_ALPHA8_ASTC_8x5_KHR:
1190		case FORMAT_SRGB8_ALPHA8_ASTC_8x6_KHR:
1191		case FORMAT_SRGB8_ALPHA8_ASTC_8x8_KHR:
1192		case FORMAT_SRGB8_ALPHA8_ASTC_10x5_KHR:
1193		case FORMAT_SRGB8_ALPHA8_ASTC_10x6_KHR:
1194		case FORMAT_SRGB8_ALPHA8_ASTC_10x8_KHR:
1195		case FORMAT_SRGB8_ALPHA8_ASTC_10x10_KHR:
1196		case FORMAT_SRGB8_ALPHA8_ASTC_12x10_KHR:
1197		case FORMAT_SRGB8_ALPHA8_ASTC_12x12_KHR: return 0; // FIXME
1198		// Bumpmap formats
1199		case FORMAT_V8U8:				return 2;
1200		case FORMAT_L6V5U5:				return 2;
1201		case FORMAT_Q8W8V8U8:			return 4;
1202		case FORMAT_X8L8V8U8:			return 4;
1203		case FORMAT_A2W10V10U10:		return 4;
1204		case FORMAT_V16U16:				return 4;
1205		case FORMAT_A16W16V16U16:		return 8;
1206		case FORMAT_Q16W16V16U16:		return 8;
1207		// Luminance formats
1208		case FORMAT_L8:					return 1;
1209		case FORMAT_A4L4:				return 1;
1210		case FORMAT_L16:				return 2;
1211		case FORMAT_A8L8:				return 2;
1212		case FORMAT_L16F:               return 2;
1213		case FORMAT_A16L16F:            return 4;
1214		case FORMAT_L32F:               return 4;
1215		case FORMAT_A32L32F:            return 8;
1216		// Floating-point formats
1217		case FORMAT_A16F:				return 2;
1218		case FORMAT_R16F:				return 2;
1219		case FORMAT_G16R16F:			return 4;
1220		case FORMAT_B16G16R16F:			return 6;
1221		case FORMAT_A16B16G16R16F:		return 8;
1222		case FORMAT_A32F:				return 4;
1223		case FORMAT_R32F:				return 4;
1224		case FORMAT_G32R32F:			return 8;
1225		case FORMAT_B32G32R32F:			return 12;
1226		case FORMAT_A32B32G32R32F:		return 16;
1227		// Depth/stencil formats
1228		case FORMAT_D16:				return 2;
1229		case FORMAT_D32:				return 4;
1230		case FORMAT_D24X8:				return 4;
1231		case FORMAT_D24S8:				return 4;
1232		case FORMAT_D24FS8:				return 4;
1233		case FORMAT_D32F:				return 4;
1234		case FORMAT_D32F_COMPLEMENTARY:	return 4;
1235		case FORMAT_D32F_LOCKABLE:		return 4;
1236		case FORMAT_D32FS8_TEXTURE:		return 4;
1237		case FORMAT_D32FS8_SHADOW:		return 4;
1238		case FORMAT_DF24S8:				return 4;
1239		case FORMAT_DF16S8:				return 2;
1240		case FORMAT_INTZ:				return 4;
1241		case FORMAT_S8:					return 1;
1242		case FORMAT_YV12_BT601:         return 1;   // Y plane only
1243		case FORMAT_YV12_BT709:         return 1;   // Y plane only
1244		case FORMAT_YV12_JFIF:          return 1;   // Y plane only
1245		default:
1246			ASSERT(false);
1247		}
1248
1249		return 0;
1250	}
1251
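	// Byte pitch of one row (one row of blocks for compressed formats). Render
	// targets and depth/stencil surfaces round the width up to an even pixel
	// count, presumably so 2x2 pixel quads never straddle a row; YV12 luma rows
	// are aligned to 16 bytes. sliceB()/sliceP() below apply the analogous
	// rounding to the height.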
1252	int Surface::pitchB(int width, Format format, bool target)
1253	{
1254		if(target || isDepth(format) || isStencil(format))
1255		{
1256			width = align(width, 2);
1257		}
1258
1259		switch(format)
1260		{
1261		#if S3TC_SUPPORT
1262		case FORMAT_DXT1:
1263		#endif
1264		case FORMAT_ETC1:
1265		case FORMAT_R11_EAC:
1266		case FORMAT_SIGNED_R11_EAC:
1267		case FORMAT_RGB8_ETC2:
1268		case FORMAT_SRGB8_ETC2:
1269		case FORMAT_RGB8_PUNCHTHROUGH_ALPHA1_ETC2:
1270		case FORMAT_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2:
1271			return 8 * ((width + 3) / 4);    // 64 bit per 4x4 block, computed per 4 rows
1272		case FORMAT_RG11_EAC:
1273		case FORMAT_SIGNED_RG11_EAC:
1274		case FORMAT_RGBA8_ETC2_EAC:
1275		case FORMAT_SRGB8_ALPHA8_ETC2_EAC:
1276		case FORMAT_RGBA_ASTC_4x4_KHR:
1277		case FORMAT_SRGB8_ALPHA8_ASTC_4x4_KHR:
1278			return 16 * ((width + 3) / 4);    // 128 bit per 4x4 block, computed per 4 rows
1279		case FORMAT_RGBA_ASTC_5x4_KHR:
1280		case FORMAT_SRGB8_ALPHA8_ASTC_5x4_KHR:
1281		case FORMAT_RGBA_ASTC_5x5_KHR:
1282		case FORMAT_SRGB8_ALPHA8_ASTC_5x5_KHR:
1283			return 16 * ((width + 4) / 5);
1284		case FORMAT_RGBA_ASTC_6x5_KHR:
1285		case FORMAT_SRGB8_ALPHA8_ASTC_6x5_KHR:
1286		case FORMAT_RGBA_ASTC_6x6_KHR:
1287		case FORMAT_SRGB8_ALPHA8_ASTC_6x6_KHR:
1288			return 16 * ((width + 5) / 6);
1289		case FORMAT_RGBA_ASTC_8x5_KHR:
1290		case FORMAT_SRGB8_ALPHA8_ASTC_8x5_KHR:
1291		case FORMAT_RGBA_ASTC_8x6_KHR:
1292		case FORMAT_SRGB8_ALPHA8_ASTC_8x6_KHR:
1293		case FORMAT_RGBA_ASTC_8x8_KHR:
1294		case FORMAT_SRGB8_ALPHA8_ASTC_8x8_KHR:
1295			return 16 * ((width + 7) / 8);
1296		case FORMAT_RGBA_ASTC_10x5_KHR:
1297		case FORMAT_SRGB8_ALPHA8_ASTC_10x5_KHR:
1298		case FORMAT_RGBA_ASTC_10x6_KHR:
1299		case FORMAT_SRGB8_ALPHA8_ASTC_10x6_KHR:
1300		case FORMAT_RGBA_ASTC_10x8_KHR:
1301		case FORMAT_SRGB8_ALPHA8_ASTC_10x8_KHR:
1302		case FORMAT_RGBA_ASTC_10x10_KHR:
1303		case FORMAT_SRGB8_ALPHA8_ASTC_10x10_KHR:
1304			return 16 * ((width + 9) / 10);
1305		case FORMAT_RGBA_ASTC_12x10_KHR:
1306		case FORMAT_SRGB8_ALPHA8_ASTC_12x10_KHR:
1307		case FORMAT_RGBA_ASTC_12x12_KHR:
1308		case FORMAT_SRGB8_ALPHA8_ASTC_12x12_KHR:
1309			return 16 * ((width + 11) / 12);
1310		#if S3TC_SUPPORT
1311		case FORMAT_DXT3:
1312		case FORMAT_DXT5:
1313			return 16 * ((width + 3) / 4);   // 128 bit per 4x4 block, computed per 4 rows
1314		#endif
1315		case FORMAT_ATI1:
1316			return 2 * ((width + 3) / 4);    // 64 bit per 4x4 block, computed per row
1317		case FORMAT_ATI2:
1318			return 4 * ((width + 3) / 4);    // 128 bit per 4x4 block, computed per row
1319		case FORMAT_YV12_BT601:
1320		case FORMAT_YV12_BT709:
1321		case FORMAT_YV12_JFIF:
1322			return align(width, 16);
1323		default:
1324			return bytes(format) * width;
1325		}
1326	}
1327
1328	int Surface::pitchP(int width, Format format, bool target)
1329	{
1330		int B = bytes(format);
1331
1332		return B > 0 ? pitchB(width, format, target) / B : 0;
1333	}
1334
1335	int Surface::sliceB(int width, int height, Format format, bool target)
1336	{
1337		if(target || isDepth(format) || isStencil(format))
1338		{
1339			height = ((height + 1) & ~1);
1340		}
1341
1342		switch(format)
1343		{
1344		#if S3TC_SUPPORT
1345		case FORMAT_DXT1:
1346		case FORMAT_DXT3:
1347		case FORMAT_DXT5:
1348		#endif
1349		case FORMAT_ETC1:
1350		case FORMAT_R11_EAC:
1351		case FORMAT_SIGNED_R11_EAC:
1352		case FORMAT_RG11_EAC:
1353		case FORMAT_SIGNED_RG11_EAC:
1354		case FORMAT_RGB8_ETC2:
1355		case FORMAT_SRGB8_ETC2:
1356		case FORMAT_RGB8_PUNCHTHROUGH_ALPHA1_ETC2:
1357		case FORMAT_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2:
1358		case FORMAT_RGBA8_ETC2_EAC:
1359		case FORMAT_SRGB8_ALPHA8_ETC2_EAC:
1360		case FORMAT_RGBA_ASTC_4x4_KHR:
1361		case FORMAT_SRGB8_ALPHA8_ASTC_4x4_KHR:
1362		case FORMAT_RGBA_ASTC_5x4_KHR:
1363		case FORMAT_SRGB8_ALPHA8_ASTC_5x4_KHR:
1364			return pitchB(width, format, target) * ((height + 3) / 4);   // Pitch computed per 4 rows
1365		case FORMAT_RGBA_ASTC_5x5_KHR:
1366		case FORMAT_SRGB8_ALPHA8_ASTC_5x5_KHR:
1367		case FORMAT_RGBA_ASTC_6x5_KHR:
1368		case FORMAT_SRGB8_ALPHA8_ASTC_6x5_KHR:
1369		case FORMAT_RGBA_ASTC_8x5_KHR:
1370		case FORMAT_SRGB8_ALPHA8_ASTC_8x5_KHR:
1371		case FORMAT_RGBA_ASTC_10x5_KHR:
1372		case FORMAT_SRGB8_ALPHA8_ASTC_10x5_KHR:
1373			return pitchB(width, format, target) * ((height + 4) / 5);   // Pitch computed per 5 rows
1374		case FORMAT_RGBA_ASTC_6x6_KHR:
1375		case FORMAT_SRGB8_ALPHA8_ASTC_6x6_KHR:
1376		case FORMAT_RGBA_ASTC_8x6_KHR:
1377		case FORMAT_SRGB8_ALPHA8_ASTC_8x6_KHR:
1378		case FORMAT_RGBA_ASTC_10x6_KHR:
1379		case FORMAT_SRGB8_ALPHA8_ASTC_10x6_KHR:
1380			return pitchB(width, format, target) * ((height + 5) / 6);   // Pitch computed per 6 rows
1381		case FORMAT_RGBA_ASTC_8x8_KHR:
1382		case FORMAT_SRGB8_ALPHA8_ASTC_8x8_KHR:
1383		case FORMAT_RGBA_ASTC_10x8_KHR:
1384		case FORMAT_SRGB8_ALPHA8_ASTC_10x8_KHR:
1385			return pitchB(width, format, target) * ((height + 7) / 8);   // Pitch computed per 8 rows
1386		case FORMAT_RGBA_ASTC_10x10_KHR:
1387		case FORMAT_SRGB8_ALPHA8_ASTC_10x10_KHR:
1388		case FORMAT_RGBA_ASTC_12x10_KHR:
1389		case FORMAT_SRGB8_ALPHA8_ASTC_12x10_KHR:
1390			return pitchB(width, format, target) * ((height + 9) / 10);   // Pitch computed per 10 rows
1391		case FORMAT_RGBA_ASTC_12x12_KHR:
1392		case FORMAT_SRGB8_ALPHA8_ASTC_12x12_KHR:
1393			return pitchB(width, format, target) * ((height + 11) / 12);   // Pitch computed per 12 rows
1394		case FORMAT_ATI1:
1395		case FORMAT_ATI2:
1396		default:
1397			return pitchB(width, format, target) * height;   // Pitch computed per row
1398		}
1399	}
1400
1401	int Surface::sliceP(int width, int height, Format format, bool target)
1402	{
1403		int B = bytes(format);
1404
1405		return B > 0 ? sliceB(width, height, format, target) / B : 0;
1406	}
1407
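	// Converts the source buffer into the destination buffer's format. Compressed
	// and paletted (P8) sources go through the dedicated decode* routines;
	// everything else falls back to genericUpdate(), which converts texel by texel.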
1408	void Surface::update(Buffer &destination, Buffer &source)
1409	{
1410	//	ASSERT(source.lock != LOCK_UNLOCKED);
1411	//	ASSERT(destination.lock != LOCK_UNLOCKED);
1412
1413		if(destination.buffer != source.buffer)
1414		{
1415			ASSERT(source.dirty && !destination.dirty);
1416
1417			switch(source.format)
1418			{
1419			case FORMAT_R8G8B8:		decodeR8G8B8(destination, source);		break;   // FIXME: Check destination format
1420			case FORMAT_X1R5G5B5:	decodeX1R5G5B5(destination, source);	break;   // FIXME: Check destination format
1421			case FORMAT_A1R5G5B5:	decodeA1R5G5B5(destination, source);	break;   // FIXME: Check destination format
1422			case FORMAT_X4R4G4B4:	decodeX4R4G4B4(destination, source);	break;   // FIXME: Check destination format
1423			case FORMAT_A4R4G4B4:	decodeA4R4G4B4(destination, source);	break;   // FIXME: Check destination format
1424			case FORMAT_P8:			decodeP8(destination, source);			break;   // FIXME: Check destination format
1425			#if S3TC_SUPPORT
1426			case FORMAT_DXT1:		decodeDXT1(destination, source);		break;   // FIXME: Check destination format
1427			case FORMAT_DXT3:		decodeDXT3(destination, source);		break;   // FIXME: Check destination format
1428			case FORMAT_DXT5:		decodeDXT5(destination, source);		break;   // FIXME: Check destination format
1429			#endif
1430			case FORMAT_ATI1:		decodeATI1(destination, source);		break;   // FIXME: Check destination format
1431			case FORMAT_ATI2:		decodeATI2(destination, source);		break;   // FIXME: Check destination format
1432			case FORMAT_R11_EAC:         decodeEAC(destination, source, 1, false); break; // FIXME: Check destination format
1433			case FORMAT_SIGNED_R11_EAC:  decodeEAC(destination, source, 1, true);  break; // FIXME: Check destination format
1434			case FORMAT_RG11_EAC:        decodeEAC(destination, source, 2, false); break; // FIXME: Check destination format
1435			case FORMAT_SIGNED_RG11_EAC: decodeEAC(destination, source, 2, true);  break; // FIXME: Check destination format
1436			case FORMAT_ETC1:
1437			case FORMAT_RGB8_ETC2:                      decodeETC2(destination, source, 0, false); break; // FIXME: Check destination format
1438			case FORMAT_SRGB8_ETC2:                     decodeETC2(destination, source, 0, true);  break; // FIXME: Check destination format
1439			case FORMAT_RGB8_PUNCHTHROUGH_ALPHA1_ETC2:  decodeETC2(destination, source, 1, false); break; // FIXME: Check destination format
1440			case FORMAT_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2: decodeETC2(destination, source, 1, true);  break; // FIXME: Check destination format
1441			case FORMAT_RGBA8_ETC2_EAC:                 decodeETC2(destination, source, 8, false); break; // FIXME: Check destination format
1442			case FORMAT_SRGB8_ALPHA8_ETC2_EAC:          decodeETC2(destination, source, 8, true);  break; // FIXME: Check destination format
1443			case FORMAT_RGBA_ASTC_4x4_KHR:           decodeASTC(destination, source, 4,  4,  1, false); break; // FIXME: Check destination format
1444			case FORMAT_RGBA_ASTC_5x4_KHR:           decodeASTC(destination, source, 5,  4,  1, false); break; // FIXME: Check destination format
1445			case FORMAT_RGBA_ASTC_5x5_KHR:           decodeASTC(destination, source, 5,  5,  1, false); break; // FIXME: Check destination format
1446			case FORMAT_RGBA_ASTC_6x5_KHR:           decodeASTC(destination, source, 6,  5,  1, false); break; // FIXME: Check destination format
1447			case FORMAT_RGBA_ASTC_6x6_KHR:           decodeASTC(destination, source, 6,  6,  1, false); break; // FIXME: Check destination format
1448			case FORMAT_RGBA_ASTC_8x5_KHR:           decodeASTC(destination, source, 8,  5,  1, false); break; // FIXME: Check destination format
1449			case FORMAT_RGBA_ASTC_8x6_KHR:           decodeASTC(destination, source, 8,  6,  1, false); break; // FIXME: Check destination format
1450			case FORMAT_RGBA_ASTC_8x8_KHR:           decodeASTC(destination, source, 8,  8,  1, false); break; // FIXME: Check destination format
1451			case FORMAT_RGBA_ASTC_10x5_KHR:          decodeASTC(destination, source, 10, 5,  1, false); break; // FIXME: Check destination format
1452			case FORMAT_RGBA_ASTC_10x6_KHR:          decodeASTC(destination, source, 10, 6,  1, false); break; // FIXME: Check destination format
1453			case FORMAT_RGBA_ASTC_10x8_KHR:          decodeASTC(destination, source, 10, 8,  1, false); break; // FIXME: Check destination format
1454			case FORMAT_RGBA_ASTC_10x10_KHR:         decodeASTC(destination, source, 10, 10, 1, false); break; // FIXME: Check destination format
1455			case FORMAT_RGBA_ASTC_12x10_KHR:         decodeASTC(destination, source, 12, 10, 1, false); break; // FIXME: Check destination format
1456			case FORMAT_RGBA_ASTC_12x12_KHR:         decodeASTC(destination, source, 12, 12, 1, false); break; // FIXME: Check destination format
1457			case FORMAT_SRGB8_ALPHA8_ASTC_4x4_KHR:   decodeASTC(destination, source, 4,  4,  1, true);  break; // FIXME: Check destination format
1458			case FORMAT_SRGB8_ALPHA8_ASTC_5x4_KHR:   decodeASTC(destination, source, 5,  4,  1, true);  break; // FIXME: Check destination format
1459			case FORMAT_SRGB8_ALPHA8_ASTC_5x5_KHR:   decodeASTC(destination, source, 5,  5,  1, true);  break; // FIXME: Check destination format
1460			case FORMAT_SRGB8_ALPHA8_ASTC_6x5_KHR:   decodeASTC(destination, source, 6,  5,  1, true);  break; // FIXME: Check destination format
1461			case FORMAT_SRGB8_ALPHA8_ASTC_6x6_KHR:   decodeASTC(destination, source, 6,  6,  1, true);  break; // FIXME: Check destination format
1462			case FORMAT_SRGB8_ALPHA8_ASTC_8x5_KHR:   decodeASTC(destination, source, 8,  5,  1, true);  break; // FIXME: Check destination format
1463			case FORMAT_SRGB8_ALPHA8_ASTC_8x6_KHR:   decodeASTC(destination, source, 8,  6,  1, true);  break; // FIXME: Check destination format
1464			case FORMAT_SRGB8_ALPHA8_ASTC_8x8_KHR:   decodeASTC(destination, source, 8,  8,  1, true);  break; // FIXME: Check destination format
1465			case FORMAT_SRGB8_ALPHA8_ASTC_10x5_KHR:  decodeASTC(destination, source, 10, 5,  1, true);  break; // FIXME: Check destination format
1466			case FORMAT_SRGB8_ALPHA8_ASTC_10x6_KHR:  decodeASTC(destination, source, 10, 6,  1, true);  break; // FIXME: Check destination format
1467			case FORMAT_SRGB8_ALPHA8_ASTC_10x8_KHR:  decodeASTC(destination, source, 10, 8,  1, true);  break; // FIXME: Check destination format
1468			case FORMAT_SRGB8_ALPHA8_ASTC_10x10_KHR: decodeASTC(destination, source, 10, 10, 1, true);  break; // FIXME: Check destination format
1469			case FORMAT_SRGB8_ALPHA8_ASTC_12x10_KHR: decodeASTC(destination, source, 12, 10, 1, true);  break; // FIXME: Check destination format
1470			case FORMAT_SRGB8_ALPHA8_ASTC_12x12_KHR: decodeASTC(destination, source, 12, 12, 1, true);  break; // FIXME: Check destination format
1471			default:				genericUpdate(destination, source);		break;
1472			}
1473		}
1474	}
1475
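	// Fallback conversion path: copies the overlapping region of the two buffers.
	// Rows are memcpy'd when the formats match; otherwise every texel is routed
	// through read()/write(), which handles any format pair those helpers
	// understand, at the cost of a per-texel conversion.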
1476	void Surface::genericUpdate(Buffer &destination, Buffer &source)
1477	{
1478		unsigned char *sourceSlice = (unsigned char*)source.buffer;
1479		unsigned char *destinationSlice = (unsigned char*)destination.buffer;
1480
1481		int depth = min(destination.depth, source.depth);
1482		int height = min(destination.height, source.height);
1483		int width = min(destination.width, source.width);
1484		int rowBytes = width * source.bytes;
1485
1486		for(int z = 0; z < depth; z++)
1487		{
1488			unsigned char *sourceRow = sourceSlice;
1489			unsigned char *destinationRow = destinationSlice;
1490
1491			for(int y = 0; y < height; y++)
1492			{
1493				if(source.format == destination.format)
1494				{
1495					memcpy(destinationRow, sourceRow, rowBytes);
1496				}
1497				else
1498				{
1499					unsigned char *sourceElement = sourceRow;
1500					unsigned char *destinationElement = destinationRow;
1501
1502					for(int x = 0; x < width; x++)
1503					{
1504						Color<float> color = source.read(sourceElement);
1505						destination.write(destinationElement, color);
1506
1507						sourceElement += source.bytes;
1508						destinationElement += destination.bytes;
1509					}
1510				}
1511
1512				sourceRow += source.pitchB;
1513				destinationRow += destination.pitchB;
1514			}
1515
1516			sourceSlice += source.sliceB;
1517			destinationSlice += destination.sliceB;
1518		}
1519	}
1520
1521	void Surface::decodeR8G8B8(Buffer &destination, const Buffer &source)
1522	{
1523		unsigned char *sourceSlice = (unsigned char*)source.buffer;
1524		unsigned char *destinationSlice = (unsigned char*)destination.buffer;
1525
1526		for(int z = 0; z < destination.depth && z < source.depth; z++)
1527		{
1528			unsigned char *sourceRow = sourceSlice;
1529			unsigned char *destinationRow = destinationSlice;
1530
1531			for(int y = 0; y < destination.height && y < source.height; y++)
1532			{
1533				unsigned char *sourceElement = sourceRow;
1534				unsigned char *destinationElement = destinationRow;
1535
1536				for(int x = 0; x < destination.width && x < source.width; x++)
1537				{
1538					unsigned int b = sourceElement[0];
1539					unsigned int g = sourceElement[1];
1540					unsigned int r = sourceElement[2];
1541
1542					*(unsigned int*)destinationElement = 0xFF000000 | (r << 16) | (g << 8) | (b << 0);
1543
1544					sourceElement += source.bytes;
1545					destinationElement += destination.bytes;
1546				}
1547
1548				sourceRow += source.pitchB;
1549				destinationRow += destination.pitchB;
1550			}
1551
1552			sourceSlice += source.sliceB;
1553			destinationSlice += destination.sliceB;
1554		}
1555	}
1556
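	// Fixed-point expansion of the 5-bit channels to 8 bits with rounding:
	// 134771 ~= (255/31) * 2^14, 16846 ~= (255/31) * 2^11 and 2106 ~= (255/31) * 2^8,
	// so each multiply-add-shift leaves the widened channel in the byte selected
	// by the final mask. decodeA1R5G5B5() below additionally maps the single
	// alpha bit to 0xFF000000 via the factor 130560 (0x8000 * 130560 == 0xFF000000).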
1557	void Surface::decodeX1R5G5B5(Buffer &destination, const Buffer &source)
1558	{
1559		unsigned char *sourceSlice = (unsigned char*)source.buffer;
1560		unsigned char *destinationSlice = (unsigned char*)destination.buffer;
1561
1562		for(int z = 0; z < destination.depth && z < source.depth; z++)
1563		{
1564			unsigned char *sourceRow = sourceSlice;
1565			unsigned char *destinationRow = destinationSlice;
1566
1567			for(int y = 0; y < destination.height && y < source.height; y++)
1568			{
1569				unsigned char *sourceElement = sourceRow;
1570				unsigned char *destinationElement = destinationRow;
1571
1572				for(int x = 0; x < destination.width && x < source.width; x++)
1573				{
1574					unsigned int xrgb = *(unsigned short*)sourceElement;
1575
1576					unsigned int r = (((xrgb & 0x7C00) * 134771 + 0x800000) >> 8) & 0x00FF0000;
1577					unsigned int g = (((xrgb & 0x03E0) * 16846 + 0x8000) >> 8) & 0x0000FF00;
1578					unsigned int b = (((xrgb & 0x001F) * 2106  + 0x80) >> 8);
1579
1580					*(unsigned int*)destinationElement = 0xFF000000 | r | g | b;
1581
1582					sourceElement += source.bytes;
1583					destinationElement += destination.bytes;
1584				}
1585
1586				sourceRow += source.pitchB;
1587				destinationRow += destination.pitchB;
1588			}
1589
1590			sourceSlice += source.sliceB;
1591			destinationSlice += destination.sliceB;
1592		}
1593	}
1594
1595	void Surface::decodeA1R5G5B5(Buffer &destination, const Buffer &source)
1596	{
1597		unsigned char *sourceSlice = (unsigned char*)source.buffer;
1598		unsigned char *destinationSlice = (unsigned char*)destination.buffer;
1599
1600		for(int z = 0; z < destination.depth && z < source.depth; z++)
1601		{
1602			unsigned char *sourceRow = sourceSlice;
1603			unsigned char *destinationRow = destinationSlice;
1604
1605			for(int y = 0; y < destination.height && y < source.height; y++)
1606			{
1607				unsigned char *sourceElement = sourceRow;
1608				unsigned char *destinationElement = destinationRow;
1609
1610				for(int x = 0; x < destination.width && x < source.width; x++)
1611				{
1612					unsigned int argb = *(unsigned short*)sourceElement;
1613
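					// The 1-bit alpha maps to 0x00 or 0xFF in the top byte (0x8000 * 130560 == 0xFF000000);
					// the color channels expand as in decodeX1R5G5B5.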
1614					unsigned int a =   (argb & 0x8000) * 130560;
1615					unsigned int r = (((argb & 0x7C00) * 134771 + 0x800000) >> 8) & 0x00FF0000;
1616					unsigned int g = (((argb & 0x03E0) * 16846  + 0x8000) >> 8) & 0x0000FF00;
1617					unsigned int b = (((argb & 0x001F) * 2106   + 0x80) >> 8);
1618
1619					*(unsigned int*)destinationElement = a | r | g | b;
1620
1621					sourceElement += source.bytes;
1622					destinationElement += destination.bytes;
1623				}
1624
1625				sourceRow += source.pitchB;
1626				destinationRow += destination.pitchB;
1627			}
1628
1629			sourceSlice += source.sliceB;
1630			destinationSlice += destination.sliceB;
1631		}
1632	}
1633
1634	void Surface::decodeX4R4G4B4(Buffer &destination, const Buffer &source)
1635	{
1636		unsigned char *sourceSlice = (unsigned char*)source.buffer;
1637		unsigned char *destinationSlice = (unsigned char*)destination.buffer;
1638
1639		for(int z = 0; z < destination.depth && z < source.depth; z++)
1640		{
1641			unsigned char *sourceRow = sourceSlice;
1642			unsigned char *destinationRow = destinationSlice;
1643
1644			for(int y = 0; y < destination.height && y < source.height; y++)
1645			{
1646				unsigned char *sourceElement = sourceRow;
1647				unsigned char *destinationElement = destinationRow;
1648
1649				for(int x = 0; x < destination.width && x < source.width; x++)
1650				{
1651					unsigned int xrgb = *(unsigned short*)sourceElement;
1652
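					// Multiplying a 4-bit nibble by 0x11 replicates it into a full byte (0xF -> 0xFF);
					// the constants also shift each channel into its byte lane.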
1653					unsigned int r = ((xrgb & 0x0F00) * 0x00001100) & 0x00FF0000;
1654					unsigned int g = ((xrgb & 0x00F0) * 0x00000110) & 0x0000FF00;
1655					unsigned int b =  (xrgb & 0x000F) * 0x00000011;
1656
1657					*(unsigned int*)destinationElement = 0xFF000000 | r | g | b;
1658
1659					sourceElement += source.bytes;
1660					destinationElement += destination.bytes;
1661				}
1662
1663				sourceRow += source.pitchB;
1664				destinationRow += destination.pitchB;
1665			}
1666
1667			sourceSlice += source.sliceB;
1668			destinationSlice += destination.sliceB;
1669		}
1670	}
1671
1672	void Surface::decodeA4R4G4B4(Buffer &destination, const Buffer &source)
1673	{
1674		unsigned char *sourceSlice = (unsigned char*)source.buffer;
1675		unsigned char *destinationSlice = (unsigned char*)destination.buffer;
1676
1677		for(int z = 0; z < destination.depth && z < source.depth; z++)
1678		{
1679			unsigned char *sourceRow = sourceSlice;
1680			unsigned char *destinationRow = destinationSlice;
1681
1682			for(int y = 0; y < destination.height && y < source.height; y++)
1683			{
1684				unsigned char *sourceElement = sourceRow;
1685				unsigned char *destinationElement = destinationRow;
1686
1687				for(int x = 0; x < destination.width && x < source.width; x++)
1688				{
1689					unsigned int argb = *(unsigned short*)sourceElement;
1690
1691					unsigned int a = ((argb & 0xF000) * 0x00011000) & 0xFF000000;
1692					unsigned int r = ((argb & 0x0F00) * 0x00001100) & 0x00FF0000;
1693					unsigned int g = ((argb & 0x00F0) * 0x00000110) & 0x0000FF00;
1694					unsigned int b =  (argb & 0x000F) * 0x00000011;
1695
1696					*(unsigned int*)destinationElement = a | r | g | b;
1697
1698					sourceElement += source.bytes;
1699					destinationElement += destination.bytes;
1700				}
1701
1702				sourceRow += source.pitchB;
1703				destinationRow += destination.pitchB;
1704			}
1705
1706			sourceSlice += source.sliceB;
1707			destinationSlice += destination.sliceB;
1708		}
1709	}
1710
1711	void Surface::decodeP8(Buffer &destination, const Buffer &source)
1712	{
1713		unsigned char *sourceSlice = (unsigned char*)source.buffer;
1714		unsigned char *destinationSlice = (unsigned char*)destination.buffer;
1715
1716		for(int z = 0; z < destination.depth && z < source.depth; z++)
1717		{
1718			unsigned char *sourceRow = sourceSlice;
1719			unsigned char *destinationRow = destinationSlice;
1720
1721			for(int y = 0; y < destination.height && y < source.height; y++)
1722			{
1723				unsigned char *sourceElement = sourceRow;
1724				unsigned char *destinationElement = destinationRow;
1725
1726				for(int x = 0; x < destination.width && x < source.width; x++)
1727				{
1728					unsigned int abgr = palette[*(unsigned char*)sourceElement];
1729
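					// Palette entries are stored as ABGR; swap the red and blue channels to produce A8R8G8B8 output.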
1730					unsigned int r = (abgr & 0x000000FF) << 16;
1731					unsigned int g = (abgr & 0x0000FF00) << 0;
1732					unsigned int b = (abgr & 0x00FF0000) >> 16;
1733					unsigned int a = (abgr & 0xFF000000) >> 0;
1734
1735					*(unsigned int*)destinationElement = a | r | g | b;
1736
1737					sourceElement += source.bytes;
1738					destinationElement += destination.bytes;
1739				}
1740
1741				sourceRow += source.pitchB;
1742				destinationRow += destination.pitchB;
1743			}
1744
1745			sourceSlice += source.sliceB;
1746			destinationSlice += destination.sliceB;
1747		}
1748	}
1749
1750#if S3TC_SUPPORT
1751	void Surface::decodeDXT1(Buffer &internal, const Buffer &external)
1752	{
1753		unsigned int *destSlice = (unsigned int*)internal.buffer;
1754		const DXT1 *source = (const DXT1*)external.buffer;
1755
1756		for(int z = 0; z < external.depth; z++)
1757		{
1758			unsigned int *dest = destSlice;
1759
1760			for(int y = 0; y < external.height; y += 4)
1761			{
1762				for(int x = 0; x < external.width; x += 4)
1763				{
1764					Color<byte> c[4];
1765
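					// Each 64-bit DXT1 block holds two RGB565 endpoint colors (c0, c1) and a 32-bit table of
					// 2-bit indices; the endpoint order selects between the opaque and 1-bit-transparency modes below.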
1766					c[0] = source->c0;
1767					c[1] = source->c1;
1768
1769					if(source->c0 > source->c1)   // No transparency
1770					{
1771						// c2 = 2 / 3 * c0 + 1 / 3 * c1
1772						c[2].r = (byte)((2 * (word)c[0].r + (word)c[1].r + 1) / 3);
1773						c[2].g = (byte)((2 * (word)c[0].g + (word)c[1].g + 1) / 3);
1774						c[2].b = (byte)((2 * (word)c[0].b + (word)c[1].b + 1) / 3);
1775						c[2].a = 0xFF;
1776
1777						// c3 = 1 / 3 * c0 + 2 / 3 * c1
1778						c[3].r = (byte)(((word)c[0].r + 2 * (word)c[1].r + 1) / 3);
1779						c[3].g = (byte)(((word)c[0].g + 2 * (word)c[1].g + 1) / 3);
1780						c[3].b = (byte)(((word)c[0].b + 2 * (word)c[1].b + 1) / 3);
1781						c[3].a = 0xFF;
1782					}
1783					else   // c3 transparent
1784					{
1785						// c2 = 1 / 2 * c0 + 1 / 2 * c1
1786						c[2].r = (byte)(((word)c[0].r + (word)c[1].r) / 2);
1787						c[2].g = (byte)(((word)c[0].g + (word)c[1].g) / 2);
1788						c[2].b = (byte)(((word)c[0].b + (word)c[1].b) / 2);
1789						c[2].a = 0xFF;
1790
1791						c[3].r = 0;
1792						c[3].g = 0;
1793						c[3].b = 0;
1794						c[3].a = 0;
1795					}
1796
1797					for(int j = 0; j < 4 && (y + j) < internal.height; j++)
1798					{
1799						for(int i = 0; i < 4 && (x + i) < internal.width; i++)
1800						{
1801							dest[(x + i) + (y + j) * internal.width] = c[(unsigned int)(source->lut >> 2 * (i + j * 4)) % 4];
1802						}
1803					}
1804
1805					source++;
1806				}
1807			}
1808
1809			(byte*&)destSlice += internal.sliceB;
1810		}
1811	}
1812
1813	void Surface::decodeDXT3(Buffer &internal, const Buffer &external)
1814	{
1815		unsigned int *destSlice = (unsigned int*)internal.buffer;
1816		const DXT3 *source = (const DXT3*)external.buffer;
1817
1818		for(int z = 0; z < external.depth; z++)
1819		{
1820			unsigned int *dest = destSlice;
1821
1822			for(int y = 0; y < external.height; y += 4)
1823			{
1824				for(int x = 0; x < external.width; x += 4)
1825				{
1826					Color<byte> c[4];
1827
1828					c[0] = source->c0;
1829					c[1] = source->c1;
1830
1831					// c2 = 2 / 3 * c0 + 1 / 3 * c1
1832					c[2].r = (byte)((2 * (word)c[0].r + (word)c[1].r + 1) / 3);
1833					c[2].g = (byte)((2 * (word)c[0].g + (word)c[1].g + 1) / 3);
1834					c[2].b = (byte)((2 * (word)c[0].b + (word)c[1].b + 1) / 3);
1835
1836					// c3 = 1 / 3 * c0 + 2 / 3 * c1
1837					c[3].r = (byte)(((word)c[0].r + 2 * (word)c[1].r + 1) / 3);
1838					c[3].g = (byte)(((word)c[0].g + 2 * (word)c[1].g + 1) / 3);
1839					c[3].b = (byte)(((word)c[0].b + 2 * (word)c[1].b + 1) / 3);
1840
1841					for(int j = 0; j < 4 && (y + j) < internal.height; j++)
1842					{
1843						for(int i = 0; i < 4 && (x + i) < internal.width; i++)
1844						{
1845							unsigned int a = (unsigned int)(source->a >> 4 * (i + j * 4)) & 0x0F;
1846							unsigned int color = (c[(unsigned int)(source->lut >> 2 * (i + j * 4)) % 4] & 0x00FFFFFF) | ((a << 28) + (a << 24));
1847
1848							dest[(x + i) + (y + j) * internal.width] = color;
1849						}
1850					}
1851
1852					source++;
1853				}
1854			}
1855
1856			(byte*&)destSlice += internal.sliceB;
1857		}
1858	}
1859
1860	void Surface::decodeDXT5(Buffer &internal, const Buffer &external)
1861	{
1862		unsigned int *destSlice = (unsigned int*)internal.buffer;
1863		const DXT5 *source = (const DXT5*)external.buffer;
1864
1865		for(int z = 0; z < external.depth; z++)
1866		{
1867			unsigned int *dest = destSlice;
1868
1869			for(int y = 0; y < external.height; y += 4)
1870			{
1871				for(int x = 0; x < external.width; x += 4)
1872				{
1873					Color<byte> c[4];
1874
1875					c[0] = source->c0;
1876					c[1] = source->c1;
1877
1878					// c2 = 2 / 3 * c0 + 1 / 3 * c1
1879					c[2].r = (byte)((2 * (word)c[0].r + (word)c[1].r + 1) / 3);
1880					c[2].g = (byte)((2 * (word)c[0].g + (word)c[1].g + 1) / 3);
1881					c[2].b = (byte)((2 * (word)c[0].b + (word)c[1].b + 1) / 3);
1882
1883					// c3 = 1 / 3 * c0 + 2 / 3 * c1
1884					c[3].r = (byte)(((word)c[0].r + 2 * (word)c[1].r + 1) / 3);
1885					c[3].g = (byte)(((word)c[0].g + 2 * (word)c[1].g + 1) / 3);
1886					c[3].b = (byte)(((word)c[0].b + 2 * (word)c[1].b + 1) / 3);
1887
1888					byte a[8];
1889
1890					a[0] = source->a0;
1891					a[1] = source->a1;
1892
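					// Build the 8-entry alpha palette: with a0 > a1 the six remaining entries interpolate
					// between a0 and a1; otherwise four entries interpolate and the last two are 0 and 255.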
1893					if(a[0] > a[1])
1894					{
1895						a[2] = (byte)((6 * (word)a[0] + 1 * (word)a[1] + 3) / 7);
1896						a[3] = (byte)((5 * (word)a[0] + 2 * (word)a[1] + 3) / 7);
1897						a[4] = (byte)((4 * (word)a[0] + 3 * (word)a[1] + 3) / 7);
1898						a[5] = (byte)((3 * (word)a[0] + 4 * (word)a[1] + 3) / 7);
1899						a[6] = (byte)((2 * (word)a[0] + 5 * (word)a[1] + 3) / 7);
1900						a[7] = (byte)((1 * (word)a[0] + 6 * (word)a[1] + 3) / 7);
1901					}
1902					else
1903					{
1904						a[2] = (byte)((4 * (word)a[0] + 1 * (word)a[1] + 2) / 5);
1905						a[3] = (byte)((3 * (word)a[0] + 2 * (word)a[1] + 2) / 5);
1906						a[4] = (byte)((2 * (word)a[0] + 3 * (word)a[1] + 2) / 5);
1907						a[5] = (byte)((1 * (word)a[0] + 4 * (word)a[1] + 2) / 5);
1908						a[6] = 0;
1909						a[7] = 0xFF;
1910					}
1911
1912					for(int j = 0; j < 4 && (y + j) < internal.height; j++)
1913					{
1914						for(int i = 0; i < 4 && (x + i) < internal.width; i++)
1915						{
1916							unsigned int alpha = (unsigned int)a[(unsigned int)(source->alut >> (16 + 3 * (i + j * 4))) % 8] << 24;
1917							unsigned int color = (c[(source->clut >> 2 * (i + j * 4)) % 4] & 0x00FFFFFF) | alpha;
1918
1919							dest[(x + i) + (y + j) * internal.width] = color;
1920						}
1921					}
1922
1923					source++;
1924				}
1925			}
1926
1927			(byte*&)destSlice += internal.sliceB;
1928		}
1929	}
1930#endif
1931
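	// ATI1 (BC4) encodes a single channel per 4x4 block, using the same two-endpoint,
	// 3-bit-index ramp as DXT5 alpha.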
1932	void Surface::decodeATI1(Buffer &internal, const Buffer &external)
1933	{
1934		byte *destSlice = (byte*)internal.buffer;
1935		const ATI1 *source = (const ATI1*)external.buffer;
1936
1937		for(int z = 0; z < external.depth; z++)
1938		{
1939			byte *dest = destSlice;
1940
1941			for(int y = 0; y < external.height; y += 4)
1942			{
1943				for(int x = 0; x < external.width; x += 4)
1944				{
1945					byte r[8];
1946
1947					r[0] = source->r0;
1948					r[1] = source->r1;
1949
1950					if(r[0] > r[1])
1951					{
1952						r[2] = (byte)((6 * (word)r[0] + 1 * (word)r[1] + 3) / 7);
1953						r[3] = (byte)((5 * (word)r[0] + 2 * (word)r[1] + 3) / 7);
1954						r[4] = (byte)((4 * (word)r[0] + 3 * (word)r[1] + 3) / 7);
1955						r[5] = (byte)((3 * (word)r[0] + 4 * (word)r[1] + 3) / 7);
1956						r[6] = (byte)((2 * (word)r[0] + 5 * (word)r[1] + 3) / 7);
1957						r[7] = (byte)((1 * (word)r[0] + 6 * (word)r[1] + 3) / 7);
1958					}
1959					else
1960					{
1961						r[2] = (byte)((4 * (word)r[0] + 1 * (word)r[1] + 2) / 5);
1962						r[3] = (byte)((3 * (word)r[0] + 2 * (word)r[1] + 2) / 5);
1963						r[4] = (byte)((2 * (word)r[0] + 3 * (word)r[1] + 2) / 5);
1964						r[5] = (byte)((1 * (word)r[0] + 4 * (word)r[1] + 2) / 5);
1965						r[6] = 0;
1966						r[7] = 0xFF;
1967					}
1968
1969					for(int j = 0; j < 4 && (y + j) < internal.height; j++)
1970					{
1971						for(int i = 0; i < 4 && (x + i) < internal.width; i++)
1972						{
1973							dest[(x + i) + (y + j) * internal.width] = r[(unsigned int)(source->rlut >> (16 + 3 * (i + j * 4))) % 8];
1974						}
1975					}
1976
1977					source++;
1978				}
1979			}
1980
1981			destSlice += internal.sliceB;
1982		}
1983	}
1984
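	// ATI2 (BC5/3Dc) encodes two independent channels per 4x4 block; each is decoded like
	// an ATI1 block and the pair is packed into a two-channel (X, Y) texel.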
1985	void Surface::decodeATI2(Buffer &internal, const Buffer &external)
1986	{
1987		word *destSlice = (word*)internal.buffer;
1988		const ATI2 *source = (const ATI2*)external.buffer;
1989
1990		for(int z = 0; z < external.depth; z++)
1991		{
1992			word *dest = destSlice;
1993
1994			for(int y = 0; y < external.height; y += 4)
1995			{
1996				for(int x = 0; x < external.width; x += 4)
1997				{
1998					byte X[8];
1999
2000					X[0] = source->x0;
2001					X[1] = source->x1;
2002
2003					if(X[0] > X[1])
2004					{
2005						X[2] = (byte)((6 * (word)X[0] + 1 * (word)X[1] + 3) / 7);
2006						X[3] = (byte)((5 * (word)X[0] + 2 * (word)X[1] + 3) / 7);
2007						X[4] = (byte)((4 * (word)X[0] + 3 * (word)X[1] + 3) / 7);
2008						X[5] = (byte)((3 * (word)X[0] + 4 * (word)X[1] + 3) / 7);
2009						X[6] = (byte)((2 * (word)X[0] + 5 * (word)X[1] + 3) / 7);
2010						X[7] = (byte)((1 * (word)X[0] + 6 * (word)X[1] + 3) / 7);
2011					}
2012					else
2013					{
2014						X[2] = (byte)((4 * (word)X[0] + 1 * (word)X[1] + 2) / 5);
2015						X[3] = (byte)((3 * (word)X[0] + 2 * (word)X[1] + 2) / 5);
2016						X[4] = (byte)((2 * (word)X[0] + 3 * (word)X[1] + 2) / 5);
2017						X[5] = (byte)((1 * (word)X[0] + 4 * (word)X[1] + 2) / 5);
2018						X[6] = 0;
2019						X[7] = 0xFF;
2020					}
2021
2022					byte Y[8];
2023
2024					Y[0] = source->y0;
2025					Y[1] = source->y1;
2026
2027					if(Y[0] > Y[1])
2028					{
2029						Y[2] = (byte)((6 * (word)Y[0] + 1 * (word)Y[1] + 3) / 7);
2030						Y[3] = (byte)((5 * (word)Y[0] + 2 * (word)Y[1] + 3) / 7);
2031						Y[4] = (byte)((4 * (word)Y[0] + 3 * (word)Y[1] + 3) / 7);
2032						Y[5] = (byte)((3 * (word)Y[0] + 4 * (word)Y[1] + 3) / 7);
2033						Y[6] = (byte)((2 * (word)Y[0] + 5 * (word)Y[1] + 3) / 7);
2034						Y[7] = (byte)((1 * (word)Y[0] + 6 * (word)Y[1] + 3) / 7);
2035					}
2036					else
2037					{
2038						Y[2] = (byte)((4 * (word)Y[0] + 1 * (word)Y[1] + 2) / 5);
2039						Y[3] = (byte)((3 * (word)Y[0] + 2 * (word)Y[1] + 2) / 5);
2040						Y[4] = (byte)((2 * (word)Y[0] + 3 * (word)Y[1] + 2) / 5);
2041						Y[5] = (byte)((1 * (word)Y[0] + 4 * (word)Y[1] + 2) / 5);
2042						Y[6] = 0;
2043						Y[7] = 0xFF;
2044					}
2045
2046					for(int j = 0; j < 4 && (y + j) < internal.height; j++)
2047					{
2048						for(int i = 0; i < 4 && (x + i) < internal.width; i++)
2049						{
2050							word r = X[(unsigned int)(source->xlut >> (16 + 3 * (i + j * 4))) % 8];
2051							word g = Y[(unsigned int)(source->ylut >> (16 + 3 * (i + j * 4))) % 8];
2052
2053							dest[(x + i) + (y + j) * internal.width] = (g << 8) + r;
2054						}
2055					}
2056
2057					source++;
2058				}
2059			}
2060
2061			(byte*&)destSlice += internal.sliceB;
2062		}
2063	}
2064
2065	void Surface::decodeETC2(Buffer &internal, const Buffer &external, int nbAlphaBits, bool isSRGB)
2066	{
2067		ETC_Decoder::Decode((const byte*)external.buffer, (byte*)internal.buffer, external.width, external.height, internal.width, internal.height, internal.pitchB, internal.bytes,
2068		                    (nbAlphaBits == 8) ? ETC_Decoder::ETC_RGBA : ((nbAlphaBits == 1) ? ETC_Decoder::ETC_RGB_PUNCHTHROUGH_ALPHA : ETC_Decoder::ETC_RGB));
2069
2070		if(isSRGB)
2071		{
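			// Build a 256-entry sRGB-to-linear lookup table on first use, then convert the decoded RGB channels in place.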
2072			static byte sRGBtoLinearTable[256];
2073			static bool sRGBtoLinearTableDirty = true;
2074			if(sRGBtoLinearTableDirty)
2075			{
2076				for(int i = 0; i < 256; i++)
2077				{
2078					sRGBtoLinearTable[i] = static_cast<byte>(sRGBtoLinear(static_cast<float>(i) / 255.0f) * 255.0f + 0.5f);
2079				}
2080				sRGBtoLinearTableDirty = false;
2081			}
2082
2083			// Perform sRGB conversion in place after decoding
2084			byte* src = (byte*)internal.buffer;
2085			for(int y = 0; y < internal.height; y++)
2086			{
2087				byte* srcRow = src + y * internal.pitchB;
2088				for(int x = 0; x < internal.width; x++)
2089				{
2090					byte* srcPix = srcRow + x * internal.bytes;
2091					for(int i = 0; i < 3; i++)
2092					{
2093						srcPix[i] = sRGBtoLinearTable[srcPix[i]];
2094					}
2095				}
2096			}
2097		}
2098	}
2099
2100	void Surface::decodeEAC(Buffer &internal, const Buffer &external, int nbChannels, bool isSigned)
2101	{
2102		ASSERT(nbChannels == 1 || nbChannels == 2);
2103
2104		ETC_Decoder::Decode((const byte*)external.buffer, (byte*)internal.buffer, external.width, external.height, internal.width, internal.height, internal.pitchB, internal.bytes,
2105		                    (nbChannels == 1) ? (isSigned ? ETC_Decoder::ETC_R_SIGNED : ETC_Decoder::ETC_R_UNSIGNED) : (isSigned ? ETC_Decoder::ETC_RG_SIGNED : ETC_Decoder::ETC_RG_UNSIGNED));
2106
2107		// FIXME: We convert signed data to float, until signed integer internal formats are supported
2108		//        This code can be removed if signed ETC2 images are decoded to internal 8 bit signed R/RG formats
2109		if(isSigned)
2110		{
2111			sbyte* src = (sbyte*)internal.buffer;
2112
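			// Convert in place from back to front so the wider float output never overwrites
			// source bytes that have not been read yet; results are normalized and clamped to [-1, 1].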
2113			for(int y = 0; y < internal.height; y++)
2114			{
2115				sbyte* srcRow = src + y * internal.pitchB;
2116				for(int x = internal.width - 1; x >= 0; x--)
2117				{
2118					int dx = x & 0xFFFFFFFC;
2119					int mx = x - dx;
2120					sbyte* srcPix = srcRow + dx * internal.bytes + mx * nbChannels;
2121					float* dstPix = (float*)(srcRow + x * internal.bytes);
2122					for(int c = nbChannels - 1; c >= 0; c--)
2123					{
2124						static const float normalization = 1.0f / 127.875f;
2125						dstPix[c] = clamp(static_cast<float>(srcPix[c]) * normalization, -1.0f, 1.0f);
2126					}
2127				}
2128			}
2129		}
2130	}
2131
2132	void Surface::decodeASTC(Buffer &internal, const Buffer &external, int xBlockSize, int yBlockSize, int zBlockSize, bool isSRGB)
2133	{
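		// ASTC decoding is not implemented yet; the destination keeps the zero-initialized contents from allocateBuffer.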
2134	}
2135
2136	unsigned int Surface::size(int width, int height, int depth, Format format)
2137	{
2138		// Dimensions rounded up to multiples of 4, used for compressed formats
2139		int width4 = align(width, 4);
2140		int height4 = align(height, 4);
2141
2142		switch(format)
2143		{
2144		#if S3TC_SUPPORT
2145		case FORMAT_DXT1:
2146		#endif
2147		case FORMAT_ATI1:
2148		case FORMAT_ETC1:
2149		case FORMAT_R11_EAC:
2150		case FORMAT_SIGNED_R11_EAC:
2151		case FORMAT_RGB8_ETC2:
2152		case FORMAT_SRGB8_ETC2:
2153		case FORMAT_RGB8_PUNCHTHROUGH_ALPHA1_ETC2:
2154		case FORMAT_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2:
2155			return width4 * height4 * depth / 2;
2156		#if S3TC_SUPPORT
2157		case FORMAT_DXT3:
2158		case FORMAT_DXT5:
2159		#endif
2160		case FORMAT_ATI2:
2161		case FORMAT_RG11_EAC:
2162		case FORMAT_SIGNED_RG11_EAC:
2163		case FORMAT_RGBA8_ETC2_EAC:
2164		case FORMAT_SRGB8_ALPHA8_ETC2_EAC:
2165		case FORMAT_RGBA_ASTC_4x4_KHR:
2166		case FORMAT_SRGB8_ALPHA8_ASTC_4x4_KHR:
2167			return width4 * height4 * depth;
2168		case FORMAT_RGBA_ASTC_5x4_KHR:
2169		case FORMAT_SRGB8_ALPHA8_ASTC_5x4_KHR:
2170			return align(width, 5) * height4 * depth;
2171		case FORMAT_RGBA_ASTC_5x5_KHR:
2172		case FORMAT_SRGB8_ALPHA8_ASTC_5x5_KHR:
2173			return align(width, 5) * align(height, 5) * depth;
2174		case FORMAT_RGBA_ASTC_6x5_KHR:
2175		case FORMAT_SRGB8_ALPHA8_ASTC_6x5_KHR:
2176			return align(width, 6) * align(height, 5) * depth;
2177		case FORMAT_RGBA_ASTC_6x6_KHR:
2178		case FORMAT_SRGB8_ALPHA8_ASTC_6x6_KHR:
2179			return align(width, 6) * align(height, 6) * depth;
2180		case FORMAT_RGBA_ASTC_8x5_KHR:
2181		case FORMAT_SRGB8_ALPHA8_ASTC_8x5_KHR:
2182			return align(width, 8) * align(height, 5) * depth;
2183		case FORMAT_RGBA_ASTC_8x6_KHR:
2184		case FORMAT_SRGB8_ALPHA8_ASTC_8x6_KHR:
2185			return align(width, 8) * align(height, 6) * depth;
2186		case FORMAT_RGBA_ASTC_8x8_KHR:
2187		case FORMAT_SRGB8_ALPHA8_ASTC_8x8_KHR:
2188			return align(width, 8) * align(height, 8) * depth;
2189		case FORMAT_RGBA_ASTC_10x5_KHR:
2190		case FORMAT_SRGB8_ALPHA8_ASTC_10x5_KHR:
2191			return align(width, 10) * align(height, 5) * depth;
2192		case FORMAT_RGBA_ASTC_10x6_KHR:
2193		case FORMAT_SRGB8_ALPHA8_ASTC_10x6_KHR:
2194			return align(width, 10) * align(height, 6) * depth;
2195		case FORMAT_RGBA_ASTC_10x8_KHR:
2196		case FORMAT_SRGB8_ALPHA8_ASTC_10x8_KHR:
2197			return align(width, 10) * align(height, 8) * depth;
2198		case FORMAT_RGBA_ASTC_10x10_KHR:
2199		case FORMAT_SRGB8_ALPHA8_ASTC_10x10_KHR:
2200			return align(width, 10) * align(height, 10) * depth;
2201		case FORMAT_RGBA_ASTC_12x10_KHR:
2202		case FORMAT_SRGB8_ALPHA8_ASTC_12x10_KHR:
2203			return align(width, 12) * align(height, 10) * depth;
2204		case FORMAT_RGBA_ASTC_12x12_KHR:
2205		case FORMAT_SRGB8_ALPHA8_ASTC_12x12_KHR:
2206			return align(width, 12) * align(height, 12) * depth;
2207		case FORMAT_YV12_BT601:
2208		case FORMAT_YV12_BT709:
2209		case FORMAT_YV12_JFIF:
2210			{
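				// Planar YV12: a full-resolution Y plane followed by two chroma planes at half the
				// horizontal and vertical resolution, with 16-byte-aligned strides.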
2211				unsigned int YStride = align(width, 16);
2212				unsigned int YSize = YStride * height;
2213				unsigned int CStride = align(YStride / 2, 16);
2214				unsigned int CSize = CStride * height / 2;
2215
2216				return YSize + 2 * CSize;
2217			}
2218		default:
2219			return bytes(format) * width * height * depth;
2220		}
2221
2222		return 0;
2223	}
2224
2225	bool Surface::isStencil(Format format)
2226	{
2227		switch(format)
2228		{
2229		case FORMAT_D32:
2230		case FORMAT_D16:
2231		case FORMAT_D24X8:
2232		case FORMAT_D32F:
2233		case FORMAT_D32F_COMPLEMENTARY:
2234		case FORMAT_D32F_LOCKABLE:
2235			return false;
2236		case FORMAT_D24S8:
2237		case FORMAT_D24FS8:
2238		case FORMAT_S8:
2239		case FORMAT_DF24S8:
2240		case FORMAT_DF16S8:
2241		case FORMAT_D32FS8_TEXTURE:
2242		case FORMAT_D32FS8_SHADOW:
2243		case FORMAT_INTZ:
2244			return true;
2245		default:
2246			return false;
2247		}
2248	}
2249
2250	bool Surface::isDepth(Format format)
2251	{
2252		switch(format)
2253		{
2254		case FORMAT_D32:
2255		case FORMAT_D16:
2256		case FORMAT_D24X8:
2257		case FORMAT_D24S8:
2258		case FORMAT_D24FS8:
2259		case FORMAT_D32F:
2260		case FORMAT_D32F_COMPLEMENTARY:
2261		case FORMAT_D32F_LOCKABLE:
2262		case FORMAT_DF24S8:
2263		case FORMAT_DF16S8:
2264		case FORMAT_D32FS8_TEXTURE:
2265		case FORMAT_D32FS8_SHADOW:
2266		case FORMAT_INTZ:
2267			return true;
2268		case FORMAT_S8:
2269			return false;
2270		default:
2271			return false;
2272		}
2273	}
2274
2275	bool Surface::isPalette(Format format)
2276	{
2277		switch(format)
2278		{
2279		case FORMAT_P8:
2280		case FORMAT_A8P8:
2281			return true;
2282		default:
2283			return false;
2284		}
2285	}
2286
2287	bool Surface::isFloatFormat(Format format)
2288	{
2289		switch(format)
2290		{
2291		case FORMAT_R5G6B5:
2292		case FORMAT_X8R8G8B8:
2293		case FORMAT_X8B8G8R8I:
2294		case FORMAT_X8B8G8R8:
2295		case FORMAT_A8R8G8B8:
2296		case FORMAT_A8B8G8R8I:
2297		case FORMAT_R8UI:
2298		case FORMAT_G8R8UI:
2299		case FORMAT_X8B8G8R8UI:
2300		case FORMAT_A8B8G8R8UI:
2301		case FORMAT_A8B8G8R8:
2302		case FORMAT_G8R8I:
2303		case FORMAT_G8R8:
2304		case FORMAT_R8I_SNORM:
2305		case FORMAT_G8R8I_SNORM:
2306		case FORMAT_X8B8G8R8I_SNORM:
2307		case FORMAT_A8B8G8R8I_SNORM:
2308		case FORMAT_R16I:
2309		case FORMAT_R16UI:
2310		case FORMAT_G16R16I:
2311		case FORMAT_G16R16UI:
2312		case FORMAT_G16R16:
2313		case FORMAT_X16B16G16R16I:
2314		case FORMAT_X16B16G16R16UI:
2315		case FORMAT_A16B16G16R16I:
2316		case FORMAT_A16B16G16R16UI:
2317		case FORMAT_A16B16G16R16:
2318		case FORMAT_V8U8:
2319		case FORMAT_Q8W8V8U8:
2320		case FORMAT_X8L8V8U8:
2321		case FORMAT_V16U16:
2322		case FORMAT_A16W16V16U16:
2323		case FORMAT_Q16W16V16U16:
2324		case FORMAT_A8:
2325		case FORMAT_R8I:
2326		case FORMAT_R8:
2327		case FORMAT_L8:
2328		case FORMAT_L16:
2329		case FORMAT_A8L8:
2330		case FORMAT_YV12_BT601:
2331		case FORMAT_YV12_BT709:
2332		case FORMAT_YV12_JFIF:
2333		case FORMAT_R32I:
2334		case FORMAT_R32UI:
2335		case FORMAT_G32R32I:
2336		case FORMAT_G32R32UI:
2337		case FORMAT_X32B32G32R32I:
2338		case FORMAT_X32B32G32R32UI:
2339		case FORMAT_A32B32G32R32I:
2340		case FORMAT_A32B32G32R32UI:
2341			return false;
2342		case FORMAT_R32F:
2343		case FORMAT_G32R32F:
2344		case FORMAT_A32B32G32R32F:
2345		case FORMAT_D32F:
2346		case FORMAT_D32F_COMPLEMENTARY:
2347		case FORMAT_D32F_LOCKABLE:
2348		case FORMAT_D32FS8_TEXTURE:
2349		case FORMAT_D32FS8_SHADOW:
2350		case FORMAT_L16F:
2351		case FORMAT_A16L16F:
2352		case FORMAT_L32F:
2353		case FORMAT_A32L32F:
2354			return true;
2355		default:
2356			ASSERT(false);
2357		}
2358
2359		return false;
2360	}
2361
2362	bool Surface::isUnsignedComponent(Format format, int component)
2363	{
2364		switch(format)
2365		{
2366		case FORMAT_NULL:
2367		case FORMAT_R5G6B5:
2368		case FORMAT_X8R8G8B8:
2369		case FORMAT_X8B8G8R8:
2370		case FORMAT_A8R8G8B8:
2371		case FORMAT_A8B8G8R8:
2372		case FORMAT_G8R8:
2373		case FORMAT_G16R16:
2374		case FORMAT_A16B16G16R16:
2375		case FORMAT_D32F:
2376		case FORMAT_D32F_COMPLEMENTARY:
2377		case FORMAT_D32F_LOCKABLE:
2378		case FORMAT_D32FS8_TEXTURE:
2379		case FORMAT_D32FS8_SHADOW:
2380		case FORMAT_A8:
2381		case FORMAT_R8:
2382		case FORMAT_L8:
2383		case FORMAT_L16:
2384		case FORMAT_A8L8:
2385		case FORMAT_YV12_BT601:
2386		case FORMAT_YV12_BT709:
2387		case FORMAT_YV12_JFIF:
2388			return true;
2389		case FORMAT_V8U8:
2390		case FORMAT_X8L8V8U8:
2391		case FORMAT_V16U16:
2392			if(component < 2)
2393			{
2394				return false;
2395			}
2396			else
2397			{
2398				return true;
2399			}
2400		case FORMAT_A16W16V16U16:
2401			if(component < 3)
2402			{
2403				return false;
2404			}
2405			else
2406			{
2407				return true;
2408			}
2409		case FORMAT_Q8W8V8U8:
2410		case FORMAT_Q16W16V16U16:
2411			return false;
2412		case FORMAT_R32F:
2413			if(component < 1)
2414			{
2415				return false;
2416			}
2417			else
2418			{
2419				return true;
2420			}
2421		case FORMAT_G32R32F:
2422			if(component < 2)
2423			{
2424				return false;
2425			}
2426			else
2427			{
2428				return true;
2429			}
2430		case FORMAT_A32B32G32R32F:
2431			return false;
2432		default:
2433			ASSERT(false);
2434		}
2435
2436		return false;
2437	}
2438
2439	bool Surface::isSRGBreadable(Format format)
2440	{
2441		// Keep in sync with Capabilities::isSRGBreadable
2442		switch(format)
2443		{
2444		case FORMAT_L8:
2445		case FORMAT_A8L8:
2446		case FORMAT_R8G8B8:
2447		case FORMAT_A8R8G8B8:
2448		case FORMAT_X8R8G8B8:
2449		case FORMAT_A8B8G8R8:
2450		case FORMAT_X8B8G8R8:
2451		case FORMAT_R5G6B5:
2452		case FORMAT_X1R5G5B5:
2453		case FORMAT_A1R5G5B5:
2454		case FORMAT_A4R4G4B4:
2455		#if S3TC_SUPPORT
2456		case FORMAT_DXT1:
2457		case FORMAT_DXT3:
2458		case FORMAT_DXT5:
2459		#endif
2460		case FORMAT_ATI1:
2461		case FORMAT_ATI2:
2462			return true;
2463		default:
2464			return false;
2465		}
2466
2467		return false;
2468	}
2469
2470	bool Surface::isSRGBwritable(Format format)
2471	{
2472		// Keep in sync with Capabilities::isSRGBwritable
2473		switch(format)
2474		{
2475		case FORMAT_NULL:
2476		case FORMAT_A8R8G8B8:
2477		case FORMAT_X8R8G8B8:
2478		case FORMAT_A8B8G8R8:
2479		case FORMAT_X8B8G8R8:
2480		case FORMAT_R5G6B5:
2481			return true;
2482		default:
2483			return false;
2484		}
2485	}
2486
2487	bool Surface::isCompressed(Format format)
2488	{
2489		switch(format)
2490		{
2491		#if S3TC_SUPPORT
2492		case FORMAT_DXT1:
2493		case FORMAT_DXT3:
2494		case FORMAT_DXT5:
2495		#endif
2496		case FORMAT_ATI1:
2497		case FORMAT_ATI2:
2498		case FORMAT_ETC1:
2499		case FORMAT_R11_EAC:
2500		case FORMAT_SIGNED_R11_EAC:
2501		case FORMAT_RG11_EAC:
2502		case FORMAT_SIGNED_RG11_EAC:
2503		case FORMAT_RGB8_ETC2:
2504		case FORMAT_SRGB8_ETC2:
2505		case FORMAT_RGB8_PUNCHTHROUGH_ALPHA1_ETC2:
2506		case FORMAT_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2:
2507		case FORMAT_RGBA8_ETC2_EAC:
2508		case FORMAT_SRGB8_ALPHA8_ETC2_EAC:
2509		case FORMAT_RGBA_ASTC_4x4_KHR:
2510		case FORMAT_RGBA_ASTC_5x4_KHR:
2511		case FORMAT_RGBA_ASTC_5x5_KHR:
2512		case FORMAT_RGBA_ASTC_6x5_KHR:
2513		case FORMAT_RGBA_ASTC_6x6_KHR:
2514		case FORMAT_RGBA_ASTC_8x5_KHR:
2515		case FORMAT_RGBA_ASTC_8x6_KHR:
2516		case FORMAT_RGBA_ASTC_8x8_KHR:
2517		case FORMAT_RGBA_ASTC_10x5_KHR:
2518		case FORMAT_RGBA_ASTC_10x6_KHR:
2519		case FORMAT_RGBA_ASTC_10x8_KHR:
2520		case FORMAT_RGBA_ASTC_10x10_KHR:
2521		case FORMAT_RGBA_ASTC_12x10_KHR:
2522		case FORMAT_RGBA_ASTC_12x12_KHR:
2523		case FORMAT_SRGB8_ALPHA8_ASTC_4x4_KHR:
2524		case FORMAT_SRGB8_ALPHA8_ASTC_5x4_KHR:
2525		case FORMAT_SRGB8_ALPHA8_ASTC_5x5_KHR:
2526		case FORMAT_SRGB8_ALPHA8_ASTC_6x5_KHR:
2527		case FORMAT_SRGB8_ALPHA8_ASTC_6x6_KHR:
2528		case FORMAT_SRGB8_ALPHA8_ASTC_8x5_KHR:
2529		case FORMAT_SRGB8_ALPHA8_ASTC_8x6_KHR:
2530		case FORMAT_SRGB8_ALPHA8_ASTC_8x8_KHR:
2531		case FORMAT_SRGB8_ALPHA8_ASTC_10x5_KHR:
2532		case FORMAT_SRGB8_ALPHA8_ASTC_10x6_KHR:
2533		case FORMAT_SRGB8_ALPHA8_ASTC_10x8_KHR:
2534		case FORMAT_SRGB8_ALPHA8_ASTC_10x10_KHR:
2535		case FORMAT_SRGB8_ALPHA8_ASTC_12x10_KHR:
2536		case FORMAT_SRGB8_ALPHA8_ASTC_12x12_KHR:
2537			return true;
2538		default:
2539			return false;
2540		}
2541	}
2542
2543	bool Surface::isNonNormalizedInteger(Format format)
2544	{
2545		switch(format)
2546		{
2547		case FORMAT_A8B8G8R8I:
2548		case FORMAT_X8B8G8R8I:
2549		case FORMAT_G8R8I:
2550		case FORMAT_R8I:
2551		case FORMAT_A8B8G8R8UI:
2552		case FORMAT_X8B8G8R8UI:
2553		case FORMAT_G8R8UI:
2554		case FORMAT_R8UI:
2555		case FORMAT_A16B16G16R16I:
2556		case FORMAT_X16B16G16R16I:
2557		case FORMAT_G16R16I:
2558		case FORMAT_R16I:
2559		case FORMAT_A16B16G16R16UI:
2560		case FORMAT_X16B16G16R16UI:
2561		case FORMAT_G16R16UI:
2562		case FORMAT_R16UI:
2563		case FORMAT_A32B32G32R32I:
2564		case FORMAT_X32B32G32R32I:
2565		case FORMAT_G32R32I:
2566		case FORMAT_R32I:
2567		case FORMAT_A32B32G32R32UI:
2568		case FORMAT_X32B32G32R32UI:
2569		case FORMAT_G32R32UI:
2570		case FORMAT_R32UI:
2571			return true;
2572		default:
2573			return false;
2574		}
2575	}
2576
2577	int Surface::componentCount(Format format)
2578	{
2579		switch(format)
2580		{
2581		case FORMAT_R5G6B5:         return 3;
2582		case FORMAT_X8R8G8B8:       return 3;
2583		case FORMAT_X8B8G8R8:       return 3;
2584		case FORMAT_A8R8G8B8:       return 4;
2585		case FORMAT_A8B8G8R8:       return 4;
2586		case FORMAT_G8R8:           return 2;
2587		case FORMAT_G16R16:         return 2;
2588		case FORMAT_A16B16G16R16:   return 4;
2589		case FORMAT_V8U8:           return 2;
2590		case FORMAT_Q8W8V8U8:       return 4;
2591		case FORMAT_X8L8V8U8:       return 3;
2592		case FORMAT_V16U16:         return 2;
2593		case FORMAT_A16W16V16U16:   return 4;
2594		case FORMAT_Q16W16V16U16:   return 4;
2595		case FORMAT_R32F:           return 1;
2596		case FORMAT_G32R32F:        return 2;
2597		case FORMAT_A32B32G32R32F:  return 4;
2598		case FORMAT_D32F_LOCKABLE:  return 1;
2599		case FORMAT_D32FS8_TEXTURE: return 1;
2600		case FORMAT_D32FS8_SHADOW:  return 1;
2601		case FORMAT_A8:             return 1;
2602		case FORMAT_R8:             return 1;
2603		case FORMAT_L8:             return 1;
2604		case FORMAT_L16:            return 1;
2605		case FORMAT_A8L8:           return 2;
2606		case FORMAT_YV12_BT601:     return 3;
2607		case FORMAT_YV12_BT709:     return 3;
2608		case FORMAT_YV12_JFIF:      return 3;
2609		default:
2610			ASSERT(false);
2611		}
2612
2613		return 1;
2614	}
2615
2616	void *Surface::allocateBuffer(int width, int height, int depth, Format format)
2617	{
2618		// Render targets require 2x2 quads
2619		int width2 = (width + 1) & ~1;
2620		int height2 = (height + 1) & ~1;
2621
2622		// FIXME: Unpacking byte4 to short4 in the sampler currently involves reading 8 bytes,
2623		// so we have to allocate 4 extra bytes to avoid buffer overruns.
2624		return allocateZero(size(width2, height2, depth, format) + 4);
2625	}
2626
2627	void Surface::memfill4(void *buffer, int pattern, int bytes)
2628	{
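		// Fill memory with a 32-bit pattern: align the pointer first, use non-temporal SSE stores
		// for 64-byte runs when SSE is available, and finish the remainder with scalar writes.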
2629		while((size_t)buffer & 0x1 && bytes >= 1)
2630		{
2631			*(char*)buffer = (char)pattern;
2632			(char*&)buffer += 1;
2633			bytes -= 1;
2634		}
2635
2636		while((size_t)buffer & 0x3 && bytes >= 2)
2637		{
2638			*(short*)buffer = (short)pattern;
2639			(short*&)buffer += 1;
2640			bytes -= 2;
2641		}
2642
2643		if(CPUID::supportsSSE())
2644		{
2645			while((size_t)buffer & 0xF && bytes >= 4)
2646			{
2647				*(int*)buffer = pattern;
2648				(int*&)buffer += 1;
2649				bytes -= 4;
2650			}
2651
2652			__m128 quad = _mm_set_ps1((float&)pattern);
2653
2654			float *pointer = (float*)buffer;
2655			int qxwords = bytes / 64;
2656			bytes -= qxwords * 64;
2657
2658			while(qxwords--)
2659			{
2660				_mm_stream_ps(pointer + 0, quad);
2661				_mm_stream_ps(pointer + 4, quad);
2662				_mm_stream_ps(pointer + 8, quad);
2663				_mm_stream_ps(pointer + 12, quad);
2664
2665				pointer += 16;
2666			}
2667
2668			buffer = pointer;
2669		}
2670
2671		while(bytes >= 4)
2672		{
2673			*(int*)buffer = (int)pattern;
2674			(int*&)buffer += 1;
2675			bytes -= 4;
2676		}
2677
2678		while(bytes >= 2)
2679		{
2680			*(short*)buffer = (short)pattern;
2681			(short*&)buffer += 1;
2682			bytes -= 2;
2683		}
2684
2685		while(bytes >= 1)
2686		{
2687			*(char*)buffer = (char)pattern;
2688			(char*&)buffer += 1;
2689			bytes -= 1;
2690		}
2691	}
2692
2693	void Surface::clearColorBuffer(float red, float green, float blue, float alpha, unsigned int rgbaMask, int x0, int y0, int width, int height)
2694	{
2695		// FIXME: Also clear buffers in other formats?
2696
2697		// Not overlapping
2698		if(x0 > internal.width) return;
2699		if(y0 > internal.height) return;
2700		if(x0 + width < 0) return;
2701		if(y0 + height < 0) return;
2702
2703		// Clip against dimensions
2704		if(x0 < 0) {width += x0; x0 = 0;}
2705		if(x0 + width > internal.width) width = internal.width - x0;
2706		if(y0 < 0) {height += y0; y0 = 0;}
2707		if(y0 + height > internal.height) height = internal.height - y0;
2708
2709		const bool entire = x0 == 0 && y0 == 0 && width == internal.width && height == internal.height;
2710		const Lock lock = entire ? LOCK_DISCARD : LOCK_WRITEONLY;
2711
2712		int x1 = x0 + width;
2713		int y1 = y0 + height;
2714
2715	//	if(lockable || !quadLayoutEnabled)
2716		{
2717			unsigned char *buffer = (unsigned char*)lockInternal(x0, y0, 0, lock, PUBLIC);
2718
2719			for(int z = 0; z < internal.depth; z++)
2720			{
2721				unsigned char *target = buffer;
2722
2723				for(int y = y0; y < y1; y++)
2724				{
2725					switch(internal.format)
2726					{
2727					case FORMAT_NULL:
2728						break;
2729					case FORMAT_X8R8G8B8:
2730					case FORMAT_A8R8G8B8:
2731				//	case FORMAT_X8G8R8B8Q:   // FIXME
2732				//	case FORMAT_A8G8R8B8Q:   // FIXME
2733						{
2734							unsigned char r8 = iround(red * 0xFF);
2735							unsigned char g8 = iround(green * 0xFF);
2736							unsigned char b8 = iround(blue * 0xFF);
2737							unsigned char a8 = iround(alpha * 0xFF);
2738							unsigned char a8r8g8b8[4] = {b8, g8, r8, a8};
2739							unsigned int colorARGB = (unsigned int&)a8r8g8b8;
2740
2741							if(rgbaMask == 0xF || (internal.format == FORMAT_X8R8G8B8 && rgbaMask == 0x7))
2742							{
2743								memfill4(target, colorARGB, 4 * (x1 - x0));
2744							}
2745							else
2746							{
2747								unsigned int bgraMask = (rgbaMask & 0x1 ? 0x00FF0000 : 0) | (rgbaMask & 0x2 ? 0x0000FF00 : 0) | (rgbaMask & 0x4 ? 0x000000FF : 0) | (rgbaMask & 0x8 ? 0xFF000000 : 0);
2748								unsigned int invMask = ~bgraMask;
2749								unsigned int maskedColor = colorARGB & bgraMask;
2750								unsigned int *target32 = (unsigned int*)target;
2751
2752								for(int x = 0; x < width; x++)
2753								{
2754									target32[x] = maskedColor | (target32[x] & invMask);
2755								}
2756							}
2757						}
2758						break;
2759					case FORMAT_X8B8G8R8:
2760					case FORMAT_A8B8G8R8:
2761						{
2762							unsigned char r8 = iround(red * 0xFF);
2763							unsigned char g8 = iround(green * 0xFF);
2764							unsigned char b8 = iround(blue * 0xFF);
2765							unsigned char a8 = iround(alpha * 0xFF);
2766							unsigned char a8b8g8r8[4] = {r8, g8, b8, a8};
2767							unsigned int colorABGR = (unsigned int&)a8b8g8r8;
2768
2769							if(rgbaMask == 0xF || (internal.format == FORMAT_X8B8G8R8 && rgbaMask == 0x7))
2770							{
2771								memfill4(target, colorABGR, 4 * (x1 - x0));
2772							}
2773							else
2774							{
2775								unsigned int rgbaMask32 = (rgbaMask & 0x1 ? 0x000000FF : 0) | (rgbaMask & 0x2 ? 0x0000FF00 : 0) | (rgbaMask & 0x4 ? 0x00FF0000 : 0) | (rgbaMask & 0x8 ? 0xFF000000 : 0);
2776								unsigned int invMask = ~rgbaMask32;
2777								unsigned int maskedColor = colorABGR & rgbaMask32;
2778								unsigned int *target32 = (unsigned int*)target;
2779
2780								for(int x = 0; x < width; x++)
2781								{
2782									target32[x] = maskedColor | (target32[x] & invMask);
2783								}
2784							}
2785						}
2786						break;
2787					case FORMAT_G8R8:
2788						{
2789							unsigned char r8 = iround(red * 0xFF);
2790							unsigned char g8 = iround(green * 0xFF);
2791							unsigned char g8r8[4] = {r8, g8, r8, g8};
2792
2793							if((rgbaMask & 0x3) == 0x3)
2794							{
2795								memfill4(target, (int&)g8r8, 2 * (x1 - x0));
2796							}
2797							else
2798							{
2799								unsigned short rgMask = (rgbaMask & 0x1 ? 0x000000FF : 0) | (rgbaMask & 0x2 ? 0x0000FF00 : 0);
2800								unsigned short invMask = ~rgMask;
2801								unsigned short maskedColor = (unsigned short&)g8r8 & rgMask;
2802								unsigned short *target16 = (unsigned short*)target;
2803
2804								for(int x = 0; x < width; x++)
2805								{
2806									target16[x] = maskedColor | (target16[x] & invMask);
2807								}
2808							}
2809						}
2810						break;
2811					case FORMAT_G16R16:
2812						{
2813							unsigned short r16 = iround(red * 0xFFFF);
2814							unsigned short g16 = iround(green * 0xFFFF);
2815							unsigned short g16r16[2] = {r16, g16};
2816
2817							if((rgbaMask & 0x3) == 0x3)
2818							{
2819								memfill4(target, (int&)g16r16, 4 * (x1 - x0));
2820							}
2821							else
2822							{
2823								unsigned int rgMask = (rgbaMask & 0x1 ? 0x0000FFFF : 0) | (rgbaMask & 0x2 ? 0xFFFF0000 : 0);
2824								unsigned int invMask = ~rgMask;
2825								unsigned int maskedColor = (unsigned int&)g16r16 & rgMask;
2826								unsigned int *target32 = (unsigned int*)target;
2827
2828								for(int x = 0; x < width; x++)
2829								{
2830									target32[x] = maskedColor | (target32[x] & invMask);
2831								}
2832							}
2833						}
2834						break;
2835					case FORMAT_A16B16G16R16:
2836						{
2837							unsigned short r16 = iround(red * 0xFFFF);
2838							unsigned short g16 = iround(green * 0xFFFF);
2839							unsigned short b16 = iround(blue * 0xFFFF);
2840							unsigned short a16 = iround(alpha * 0xFFFF);
2841
2842							if(rgbaMask == 0xF)
2843							{
2844								for(int x = 0; x < width; x++)
2845								{
2846									((unsigned short*)target)[4 * x + 0] = r16;
2847									((unsigned short*)target)[4 * x + 1] = g16;
2848									((unsigned short*)target)[4 * x + 2] = b16;
2849									((unsigned short*)target)[4 * x + 3] = a16;
2850								}
2851							}
2852							else
2853							{
2854								if(rgbaMask & 0x1) for(int x = 0; x < width; x++) ((unsigned short*)target)[4 * x + 0] = r16;
2855								if(rgbaMask & 0x2) for(int x = 0; x < width; x++) ((unsigned short*)target)[4 * x + 1] = g16;
2856								if(rgbaMask & 0x4) for(int x = 0; x < width; x++) ((unsigned short*)target)[4 * x + 2] = b16;
2857								if(rgbaMask & 0x8) for(int x = 0; x < width; x++) ((unsigned short*)target)[4 * x + 3] = a16;
2858							}
2859						}
2860						break;
2861					case FORMAT_R32F:
2862						if(rgbaMask & 0x1)
2863						{
2864							for(int x = 0; x < width; x++)
2865							{
2866								((float*)target)[x] = red;
2867							}
2868						}
2869						break;
2870					case FORMAT_G32R32F:
2871						if((rgbaMask & 0x3) == 0x3)
2872						{
2873							for(int x = 0; x < width; x++)
2874							{
2875								((float*)target)[2 * x + 0] = red;
2876								((float*)target)[2 * x + 1] = green;
2877							}
2878						}
2879						else
2880						{
2881							if(rgbaMask & 0x1) for(int x = 0; x < width; x++) ((float*)target)[2 * x + 0] = red;
2882							if(rgbaMask & 0x2) for(int x = 0; x < width; x++) ((float*)target)[2 * x + 1] = green;
2883						}
2884						break;
2885					case FORMAT_A32B32G32R32F:
2886						if(rgbaMask == 0xF)
2887						{
2888							for(int x = 0; x < width; x++)
2889							{
2890								((float*)target)[4 * x + 0] = red;
2891								((float*)target)[4 * x + 1] = green;
2892								((float*)target)[4 * x + 2] = blue;
2893								((float*)target)[4 * x + 3] = alpha;
2894							}
2895						}
2896						else
2897						{
2898							if(rgbaMask & 0x1) for(int x = 0; x < width; x++) ((float*)target)[4 * x + 0] = red;
2899							if(rgbaMask & 0x2) for(int x = 0; x < width; x++) ((float*)target)[4 * x + 1] = green;
2900							if(rgbaMask & 0x4) for(int x = 0; x < width; x++) ((float*)target)[4 * x + 2] = blue;
2901							if(rgbaMask & 0x8) for(int x = 0; x < width; x++) ((float*)target)[4 * x + 3] = alpha;
2902						}
2903						break;
2904					case FORMAT_R5G6B5:
2905						{
2906							unsigned int r5 = iround(red * 0x1F);
2907							unsigned int g6 = iround(green * 0x3F);
2908							unsigned int b5 = iround(blue * 0x1F);
2909							unsigned int r5g6b5 = (r5 << 11) | (g6 << 5) | b5;
2910
2911							if((rgbaMask & 0x7) == 0x7)
2912							{
2913								unsigned int r5g6b5r5g6b5 = r5g6b5 | (r5g6b5 << 16);
2914								memfill4(target, r5g6b5r5g6b5, 2 * (x1 - x0));
2915							}
2916							else
2917							{
2918								unsigned short rgbMask = (rgbaMask & 0x1 ? 0xF800 : 0) | (rgbaMask & 0x2 ? 0x07E0 : 0) | (rgbaMask & 0x4 ? 0x001F : 0);
2919								unsigned short invMask = ~rgbMask;
2920								unsigned short maskedColor = r5g6b5 & rgbMask;
2921								unsigned short *target16 = (unsigned short*)target;
2922
2923								for(int x = 0; x < width; x++)
2924								{
2925									target16[x] = maskedColor | (target16[x] & invMask);
2926								}
2927							}
2928						}
2929						break;
2930					default:
2931						ASSERT(false);
2932					}
2933
2934					target += internal.pitchB;
2935				}
2936
2937				buffer += internal.sliceB;
2938			}
2939
2940			unlockInternal();
2941		}
2942	/*	else
2943		{
2944			int width2 = (internal.width + 1) & ~1;
2945
2946		//	unsigned char *target = (unsigned char*&)buffer;
2947		//
2948		//	for(int y = y0; y < y1; y++)
2949		//	{
2950		//		for(int x = x0; x < x1; x++)
2951		//		{
2952		//			target[width2 * 4 * (y & ~1) + 2 * (y & 1) + 8 * (x & ~1) + (x & 1) + 0] =  (color & 0x000000FF) >> 0;
2953		//			target[width2 * 4 * (y & ~1) + 2 * (y & 1) + 8 * (x & ~1) + (x & 1) + 4] =  (color & 0x00FF0000) >> 16;
2954		//			target[width2 * 4 * (y & ~1) + 2 * (y & 1) + 8 * (x & ~1) + (x & 1) + 8] =  (color & 0x0000FF00) >> 8;
2955		//			target[width2 * 4 * (y & ~1) + 2 * (y & 1) + 8 * (x & ~1) + (x & 1) + 12] = (color & 0xFF000000) >> 24;
2956		//		}
2957		//	}
2958
2959			unsigned char colorQ[16];
2960
2961			colorQ[0] =  (color & 0x000000FF) >> 0;
2962			colorQ[1] =  (color & 0x000000FF) >> 0;
2963			colorQ[2] =  (color & 0x000000FF) >> 0;
2964			colorQ[3] =  (color & 0x000000FF) >> 0;
2965			colorQ[4] =  (color & 0x00FF0000) >> 16;
2966			colorQ[5] =  (color & 0x00FF0000) >> 16;
2967			colorQ[6] =  (color & 0x00FF0000) >> 16;
2968			colorQ[7] =  (color & 0x00FF0000) >> 16;
2969			colorQ[8] =  (color & 0x0000FF00) >> 8;
2970			colorQ[9] =  (color & 0x0000FF00) >> 8;
2971			colorQ[10] = (color & 0x0000FF00) >> 8;
2972			colorQ[11] = (color & 0x0000FF00) >> 8;
2973			colorQ[12] = (color & 0xFF000000) >> 24;
2974			colorQ[13] = (color & 0xFF000000) >> 24;
2975			colorQ[14] = (color & 0xFF000000) >> 24;
2976			colorQ[15] = (color & 0xFF000000) >> 24;
2977
2978			for(int y = y0; y < y1; y++)
2979			{
2980				unsigned char *target = (unsigned char*)lockInternal(0, 0, 0, lock) + width2 * 4 * (y & ~1) + 2 * (y & 1);   // FIXME: Unlock
2981
2982				if((y & 1) == 0 && y + 1 < y1)   // Fill quad line at once
2983				{
2984					if((x0 & 1) != 0)
2985					{
2986						target[8 * (x0 & ~1) + 1 + 0] =  (color & 0x000000FF) >> 0;
2987						target[8 * (x0 & ~1) + 1 + 4] =  (color & 0x00FF0000) >> 16;
2988						target[8 * (x0 & ~1) + 1 + 8] =  (color & 0x0000FF00) >> 8;
2989						target[8 * (x0 & ~1) + 1 + 12] = (color & 0xFF000000) >> 24;
2990
2991						target[8 * (x0 & ~1) + 3 + 0] =  (color & 0x000000FF) >> 0;
2992						target[8 * (x0 & ~1) + 3 + 4] =  (color & 0x00FF0000) >> 16;
2993						target[8 * (x0 & ~1) + 3 + 8] =  (color & 0x0000FF00) >> 8;
2994						target[8 * (x0 & ~1) + 3 + 12] = (color & 0xFF000000) >> 24;
2995					}
2996
2997					__asm
2998					{
2999						movq mm0, colorQ+0
3000						movq mm1, colorQ+8
3001
3002						mov eax, x0
3003						add eax, 1
3004						and eax, 0xFFFFFFFE
3005						cmp eax, x1
3006						jge qEnd
3007
3008						mov edi, target
3009
3010					qLoop:
3011						movntq [edi+8*eax+0], mm0
3012						movntq [edi+8*eax+8], mm1
3013
3014						add eax, 2
3015						cmp eax, x1
3016						jl qLoop
3017					qEnd:
3018						emms
3019					}
3020
3021					if((x1 & 1) != 0)
3022					{
3023						target[8 * (x1 & ~1) + 0 + 0] =  (color & 0x000000FF) >> 0;
3024						target[8 * (x1 & ~1) + 0 + 4] =  (color & 0x00FF0000) >> 16;
3025						target[8 * (x1 & ~1) + 0 + 8] =  (color & 0x0000FF00) >> 8;
3026						target[8 * (x1 & ~1) + 0 + 12] = (color & 0xFF000000) >> 24;
3027
3028						target[8 * (x1 & ~1) + 2 + 0] =  (color & 0x000000FF) >> 0;
3029						target[8 * (x1 & ~1) + 2 + 4] =  (color & 0x00FF0000) >> 16;
3030						target[8 * (x1 & ~1) + 2 + 8] =  (color & 0x0000FF00) >> 8;
3031						target[8 * (x1 & ~1) + 2 + 12] = (color & 0xFF000000) >> 24;
3032					}
3033
3034					y++;
3035				}
3036				else
3037				{
3038					for(int x = x0; x < x1; x++)
3039					{
3040						target[8 * (x & ~1) + (x & 1) + 0] =  (color & 0x000000FF) >> 0;
3041						target[8 * (x & ~1) + (x & 1) + 4] =  (color & 0x00FF0000) >> 16;
3042						target[8 * (x & ~1) + (x & 1) + 8] =  (color & 0x0000FF00) >> 8;
3043						target[8 * (x & ~1) + (x & 1) + 12] = (color & 0xFF000000) >> 24;
3044					}
3045				}
3046			}
3047		}*/
3048	}
3049
3050	void Surface::clearDepthBuffer(float depth, int x0, int y0, int width, int height)
3051	{
3052		// Not overlapping
3053		if(x0 > internal.width) return;
3054		if(y0 > internal.height) return;
3055		if(x0 + width < 0) return;
3056		if(y0 + height < 0) return;
3057
3058		// Clip against dimensions
3059		if(x0 < 0) {width += x0; x0 = 0;}
3060		if(x0 + width > internal.width) width = internal.width - x0;
3061		if(y0 < 0) {height += y0; y0 = 0;}
3062		if(y0 + height > internal.height) height = internal.height - y0;
3063
3064		const bool entire = x0 == 0 && y0 == 0 && width == internal.width && height == internal.height;
3065		const Lock lock = entire ? LOCK_DISCARD : LOCK_WRITEONLY;
3066
3067		int width2 = (internal.width + 1) & ~1;
3068
3069		int x1 = x0 + width;
3070		int y1 = y0 + height;
3071
3072		if(internal.format == FORMAT_D32F_LOCKABLE ||
3073		   internal.format == FORMAT_D32FS8_TEXTURE ||
3074		   internal.format == FORMAT_D32FS8_SHADOW)
3075		{
3076			float *target = (float*)lockInternal(0, 0, 0, lock, PUBLIC) + x0 + width2 * y0;
3077
3078			for(int z = 0; z < internal.depth; z++)
3079			{
3080				for(int y = y0; y < y1; y++)
3081				{
3082					memfill4(target, (int&)depth, 4 * width);
3083					target += width2;
3084				}
3085			}
3086
3087			unlockInternal();
3088		}
3089		else   // Quad layout
3090		{
3091			if(complementaryDepthBuffer)
3092			{
3093				depth = 1 - depth;
3094			}
3095
3096			float *buffer = (float*)lockInternal(0, 0, 0, lock, PUBLIC);
3097
3098			for(int z = 0; z < internal.depth; z++)
3099			{
3100				for(int y = y0; y < y1; y++)
3101				{
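					// Quad layout interleaves 2x2 pixel blocks: each quad occupies four consecutive elements,
					// (y & 1) * 2 selects the top or bottom pair and (x & 1) the left or right pixel.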
3102					float *target = buffer + (y & ~1) * width2 + (y & 1) * 2;
3103
3104					if((y & 1) == 0 && y + 1 < y1)   // Fill quad line at once
3105					{
3106						if((x0 & 1) != 0)
3107						{
3108							target[(x0 & ~1) * 2 + 1] = depth;
3109							target[(x0 & ~1) * 2 + 3] = depth;
3110						}
3111
3112					//	for(int x2 = ((x0 + 1) & ~1) * 2; x2 < x1 * 2; x2 += 4)
3113					//	{
3114					//		target[x2 + 0] = depth;
3115					//		target[x2 + 1] = depth;
3116					//		target[x2 + 2] = depth;
3117					//		target[x2 + 3] = depth;
3118					//	}
3119
3120					//	__asm
3121					//	{
3122					//		movss xmm0, depth
3123					//		shufps xmm0, xmm0, 0x00
3124					//
3125					//		mov eax, x0
3126					//		add eax, 1
3127					//		and eax, 0xFFFFFFFE
3128					//		cmp eax, x1
3129					//		jge qEnd
3130					//
3131					//		mov edi, target
3132					//
3133					//	qLoop:
3134					//		movntps [edi+8*eax], xmm0
3135					//
3136					//		add eax, 2
3137					//		cmp eax, x1
3138					//		jl qLoop
3139					//	qEnd:
3140					//	}
3141
3142						memfill4(&target[((x0 + 1) & ~1) * 2], (int&)depth, 8 * ((x1 & ~1) - ((x0 + 1) & ~1)));
3143
3144						if((x1 & 1) != 0)
3145						{
3146							target[(x1 & ~1) * 2 + 0] = depth;
3147							target[(x1 & ~1) * 2 + 2] = depth;
3148						}
3149
3150						y++;
3151					}
3152					else
3153					{
3154						for(int x = x0; x < x1; x++)
3155						{
3156							target[(x & ~1) * 2 + (x & 1)] = depth;
3157						}
3158					}
3159				}
3160
3161				buffer += internal.sliceP;
3162			}
3163
3164			unlockInternal();
3165		}
3166	}
3167
3168	void Surface::clearStencilBuffer(unsigned char s, unsigned char mask, int x0, int y0, int width, int height)
3169	{
3170		// Not overlapping
3171		if(x0 > internal.width) return;
3172		if(y0 > internal.height) return;
3173		if(x0 + width < 0) return;
3174		if(y0 + height < 0) return;
3175
3176		// Clip against dimensions
3177		if(x0 < 0) {width += x0; x0 = 0;}
3178		if(x0 + width > internal.width) width = internal.width - x0;
3179		if(y0 < 0) {height += y0; y0 = 0;}
3180		if(y0 + height > internal.height) height = internal.height - y0;
3181
3182		int width2 = (internal.width + 1) & ~1;
3183
3184		int x1 = x0 + width;
3185		int y1 = y0 + height;
3186
3187		unsigned char maskedS = s & mask;
3188		unsigned char invMask = ~mask;
3189		unsigned int fill = maskedS;
3190		fill = fill | (fill << 8) | (fill << 16) | (fill << 24);
3191
3192		if(false)
3193		{
3194			char *target = (char*)lockStencil(0, PUBLIC) + x0 + width2 * y0;
3195
3196			for(int z = 0; z < stencil.depth; z++)
3197			{
3198				for(int y = y0; y < y0 + height; y++)
3199				{
3200					if(mask == 0xFF)
3201					{
3202						memfill4(target, fill, width);
3203					}
3204					else
3205					{
3206						for(int x = 0; x < width; x++)
3207						{
3208							target[x] = maskedS | (target[x] & invMask);
3209						}
3210					}
3211
3212					target += width2;
3213				}
3214			}
3215
3216			unlockStencil();
3217		}
3218		else   // Quad layout
3219		{
3220			char *buffer = (char*)lockStencil(0, PUBLIC);
3221
3222			if(mask == 0xFF)
3223			{
3224				for(int z = 0; z < stencil.depth; z++)
3225				{
3226					for(int y = y0; y < y1; y++)
3227					{
3228						char *target = buffer + (y & ~1) * width2 + (y & 1) * 2;
3229
3230						if((y & 1) == 0 && y + 1 < y1 && mask == 0xFF)   // Fill quad line at once
3231						{
3232							if((x0 & 1) != 0)
3233							{
3234								target[(x0 & ~1) * 2 + 1] = fill;
3235								target[(x0 & ~1) * 2 + 3] = fill;
3236							}
3237
3238							memfill4(&target[((x0 + 1) & ~1) * 2], fill, ((x1 + 1) & ~1) * 2 - ((x0 + 1) & ~1) * 2);
3239
3240							if((x1 & 1) != 0)
3241							{
3242								target[(x1 & ~1) * 2 + 0] = fill;
3243								target[(x1 & ~1) * 2 + 2] = fill;
3244							}
3245
3246							y++;
3247						}
3248						else
3249						{
3250							for(int x = x0; x < x1; x++)
3251							{
3252								target[(x & ~1) * 2 + (x & 1)] = maskedS | (target[(x & ~1) * 2 + (x & 1)] & invMask);
3253							}
3254						}
3255					}
3256
3257					buffer += stencil.sliceP;
3258				}
3259			}
3260
3261			unlockStencil();
3262		}
3263	}
3264
3265	void Surface::fill(const Color<float> &color, int x0, int y0, int width, int height)
3266	{
3267		unsigned char *row;
3268		Buffer *buffer;
3269
3270		if(internal.dirty)
3271		{
3272			row = (unsigned char*)lockInternal(x0, y0, 0, LOCK_WRITEONLY, PUBLIC);
3273			buffer = &internal;
3274		}
3275		else
3276		{
3277			row = (unsigned char*)lockExternal(x0, y0, 0, LOCK_WRITEONLY, PUBLIC);
3278			buffer = &external;
3279		}
3280
3281		if(buffer->bytes <= 4)
3282		{
3283			int c;
3284			buffer->write(&c, color);
3285
3286			if(buffer->bytes <= 1) c = (c << 8)  | c;
3287			if(buffer->bytes <= 2) c = (c << 16) | c;
3288
3289			for(int y = 0; y < height; y++)
3290			{
3291				memfill4(row, c, width * buffer->bytes);
3292
3293				row += buffer->pitchB;
3294			}
3295		}
3296		else   // Generic
3297		{
3298			for(int y = 0; y < height; y++)
3299			{
3300				unsigned char *element = row;
3301
3302				for(int x = 0; x < width; x++)
3303				{
3304					buffer->write(element, color);
3305
3306					element += buffer->bytes;
3307				}
3308
3309				row += buffer->pitchB;
3310			}
3311		}
3312
3313		if(buffer == &internal)
3314		{
3315			unlockInternal();
3316		}
3317		else
3318		{
3319			unlockExternal();
3320		}
3321	}
3322
3323	Color<float> Surface::readExternal(int x, int y, int z) const
3324	{
3325		ASSERT(external.lock != LOCK_UNLOCKED);
3326
3327		return external.read(x, y, z);
3328	}
3329
3330	Color<float> Surface::readExternal(int x, int y) const
3331	{
3332		ASSERT(external.lock != LOCK_UNLOCKED);
3333
3334		return external.read(x, y);
3335	}
3336
3337	Color<float> Surface::sampleExternal(float x, float y, float z) const
3338	{
3339		ASSERT(external.lock != LOCK_UNLOCKED);
3340
3341		return external.sample(x, y, z);
3342	}
3343
3344	Color<float> Surface::sampleExternal(float x, float y) const
3345	{
3346		ASSERT(external.lock != LOCK_UNLOCKED);
3347
3348		return external.sample(x, y);
3349	}
3350
3351	void Surface::writeExternal(int x, int y, int z, const Color<float> &color)
3352	{
3353		ASSERT(external.lock != LOCK_UNLOCKED);
3354
3355		external.write(x, y, z, color);
3356	}
3357
3358	void Surface::writeExternal(int x, int y, const Color<float> &color)
3359	{
3360		ASSERT(external.lock != LOCK_UNLOCKED);
3361
3362		external.write(x, y, color);
3363	}
3364
3365	void Surface::copyInternal(const Surface* source, int x, int y, float srcX, float srcY, bool filter)
3366	{
3367		ASSERT(internal.lock != LOCK_UNLOCKED && source && source->internal.lock != LOCK_UNLOCKED);
3368
3369		sw::Color<float> color;
3370
3371		if(!filter)
3372		{
3373			color = source->internal.read((int)srcX, (int)srcY);
3374		}
3375		else   // Bilinear filtering
3376		{
3377			color = source->internal.sample(srcX, srcY);
3378		}
3379
3380		internal.write(x, y, color);
3381	}
3382
3383	void Surface::copyInternal(const Surface* source, int x, int y, int z, float srcX, float srcY, float srcZ, bool filter)
3384	{
3385		ASSERT(internal.lock != LOCK_UNLOCKED && source && source->internal.lock != LOCK_UNLOCKED);
3386
3387		sw::Color<float> color;
3388
3389		if(!filter)
3390		{
3391			color = source->internal.read((int)srcX, (int)srcY, int(srcZ));
3392		}
3393		else   // Bilinear filtering
3394		{
3395			color = source->internal.sample(srcX, srcY, srcZ);
3396		}
3397
3398		internal.write(x, y, z, color);
3399	}
3400
3401	bool Surface::hasStencil() const
3402	{
3403		return isStencil(external.format);
3404	}
3405
3406	bool Surface::hasDepth() const
3407	{
3408		return isDepth(external.format);
3409	}
3410
3411	bool Surface::hasPalette() const
3412	{
3413		return isPalette(external.format);
3414	}
3415
3416	bool Surface::isRenderTarget() const
3417	{
3418		return renderTarget;
3419	}
3420
3421	bool Surface::hasDirtyMipmaps() const
3422	{
3423		return dirtyMipmaps;
3424	}
3425
3426	void Surface::cleanMipmaps()
3427	{
3428		dirtyMipmaps = false;
3429	}
3430
3431	Resource *Surface::getResource()
3432	{
3433		return resource;
3434	}
3435
3436	bool Surface::identicalFormats() const
3437	{
3438		return external.format == internal.format &&
3439		       external.width  == internal.width &&
3440		       external.height == internal.height &&
3441		       external.depth  == internal.depth &&
3442		       external.pitchB == internal.pitchB &&
3443		       external.sliceB == internal.sliceB;
3444	}
3445
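	// Maps an external (API-visible) format to the format used for internal storage and
	// rendering, widening low-precision, compressed and packed formats to ones the
	// rasterizer can read and write directly.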
3446	Format Surface::selectInternalFormat(Format format) const
3447	{
3448		switch(format)
3449		{
3450		case FORMAT_NULL:
3451			return FORMAT_NULL;
3452		case FORMAT_P8:
3453		case FORMAT_A8P8:
3454		case FORMAT_A4R4G4B4:
3455		case FORMAT_A1R5G5B5:
3456		case FORMAT_A8R3G3B2:
3457			return FORMAT_A8R8G8B8;
3458		case FORMAT_A8:
3459			return FORMAT_A8;
3460		case FORMAT_R8:
3461			return FORMAT_R8;
3462		case FORMAT_A2R10G10B10:
3463		case FORMAT_A2B10G10R10:
3464		case FORMAT_A16B16G16R16:
3465			return FORMAT_A16B16G16R16;
3466		case FORMAT_G8R8:
3467			return FORMAT_G8R8;
3468		case FORMAT_G16R16:
3469			return FORMAT_G16R16;
3470		case FORMAT_A8R8G8B8:
3471			if(lockable || !quadLayoutEnabled)
3472			{
3473				return FORMAT_A8R8G8B8;
3474			}
3475			else
3476			{
3477				return FORMAT_A8G8R8B8Q;
3478			}
3479		case FORMAT_R5G5B5A1:
3480		case FORMAT_R4G4B4A4:
3481		case FORMAT_A8B8G8R8:
3482			return FORMAT_A8B8G8R8;
3483		case FORMAT_R5G6B5:
3484			return FORMAT_R5G6B5;
3485		case FORMAT_R3G3B2:
3486		case FORMAT_R8G8B8:
3487		case FORMAT_X4R4G4B4:
3488		case FORMAT_X1R5G5B5:
3489		case FORMAT_X8R8G8B8:
3490			if(lockable || !quadLayoutEnabled)
3491			{
3492				return FORMAT_X8R8G8B8;
3493			}
3494			else
3495			{
3496				return FORMAT_X8G8R8B8Q;
3497			}
3498		case FORMAT_B8G8R8:
3499		case FORMAT_X8B8G8R8:
3500			return FORMAT_X8B8G8R8;
3501		// Compressed formats
3502		#if S3TC_SUPPORT
3503		case FORMAT_DXT1:
3504		case FORMAT_DXT3:
3505		case FORMAT_DXT5:
3506		#endif
3507		case FORMAT_RGB8_PUNCHTHROUGH_ALPHA1_ETC2:
3508		case FORMAT_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2:
3509		case FORMAT_RGBA8_ETC2_EAC:
3510		case FORMAT_SRGB8_ALPHA8_ETC2_EAC:
3511		case FORMAT_SRGB8_ALPHA8_ASTC_4x4_KHR:
3512		case FORMAT_SRGB8_ALPHA8_ASTC_5x4_KHR:
3513		case FORMAT_SRGB8_ALPHA8_ASTC_5x5_KHR:
3514		case FORMAT_SRGB8_ALPHA8_ASTC_6x5_KHR:
3515		case FORMAT_SRGB8_ALPHA8_ASTC_6x6_KHR:
3516		case FORMAT_SRGB8_ALPHA8_ASTC_8x5_KHR:
3517		case FORMAT_SRGB8_ALPHA8_ASTC_8x6_KHR:
3518		case FORMAT_SRGB8_ALPHA8_ASTC_8x8_KHR:
3519		case FORMAT_SRGB8_ALPHA8_ASTC_10x5_KHR:
3520		case FORMAT_SRGB8_ALPHA8_ASTC_10x6_KHR:
3521		case FORMAT_SRGB8_ALPHA8_ASTC_10x8_KHR:
3522		case FORMAT_SRGB8_ALPHA8_ASTC_10x10_KHR:
3523		case FORMAT_SRGB8_ALPHA8_ASTC_12x10_KHR:
3524		case FORMAT_SRGB8_ALPHA8_ASTC_12x12_KHR:
3525			return FORMAT_A8R8G8B8;
3526		case FORMAT_RGBA_ASTC_4x4_KHR:
3527		case FORMAT_RGBA_ASTC_5x4_KHR:
3528		case FORMAT_RGBA_ASTC_5x5_KHR:
3529		case FORMAT_RGBA_ASTC_6x5_KHR:
3530		case FORMAT_RGBA_ASTC_6x6_KHR:
3531		case FORMAT_RGBA_ASTC_8x5_KHR:
3532		case FORMAT_RGBA_ASTC_8x6_KHR:
3533		case FORMAT_RGBA_ASTC_8x8_KHR:
3534		case FORMAT_RGBA_ASTC_10x5_KHR:
3535		case FORMAT_RGBA_ASTC_10x6_KHR:
3536		case FORMAT_RGBA_ASTC_10x8_KHR:
3537		case FORMAT_RGBA_ASTC_10x10_KHR:
3538		case FORMAT_RGBA_ASTC_12x10_KHR:
3539		case FORMAT_RGBA_ASTC_12x12_KHR:
3540			// ASTC supports HDR, so a floating point format is required to represent it properly
3541			return FORMAT_A32B32G32R32F; // FIXME: FP16 is probably sufficient, but it's currently unsupported
3542		case FORMAT_ATI1:
3543		case FORMAT_R11_EAC:
3544			return FORMAT_R8;
3545		case FORMAT_SIGNED_R11_EAC:
3546			return FORMAT_R32F; // FIXME: Signed 8-bit format would be sufficient
3547		case FORMAT_ATI2:
3548		case FORMAT_RG11_EAC:
3549			return FORMAT_G8R8;
3550		case FORMAT_SIGNED_RG11_EAC:
3551			return FORMAT_G32R32F; // FIXME: Signed 8-bit format would be sufficient
3552		case FORMAT_ETC1:
3553		case FORMAT_RGB8_ETC2:
3554		case FORMAT_SRGB8_ETC2:
3555			return FORMAT_X8R8G8B8;
3556		// Bumpmap formats
3557		case FORMAT_V8U8:			return FORMAT_V8U8;
3558		case FORMAT_L6V5U5:			return FORMAT_X8L8V8U8;
3559		case FORMAT_Q8W8V8U8:		return FORMAT_Q8W8V8U8;
3560		case FORMAT_X8L8V8U8:		return FORMAT_X8L8V8U8;
3561		case FORMAT_V16U16:			return FORMAT_V16U16;
3562		case FORMAT_A2W10V10U10:	return FORMAT_A16W16V16U16;
3563		case FORMAT_Q16W16V16U16:	return FORMAT_Q16W16V16U16;
3564		// Floating-point formats
3565		case FORMAT_A16F:			return FORMAT_A32B32G32R32F;
3566		case FORMAT_R16F:			return FORMAT_R32F;
3567		case FORMAT_G16R16F:		return FORMAT_G32R32F;
3568		case FORMAT_B16G16R16F:     return FORMAT_A32B32G32R32F;
3569		case FORMAT_A16B16G16R16F:	return FORMAT_A32B32G32R32F;
3570		case FORMAT_A32F:			return FORMAT_A32B32G32R32F;
3571		case FORMAT_R32F:			return FORMAT_R32F;
3572		case FORMAT_G32R32F:		return FORMAT_G32R32F;
3573		case FORMAT_B32G32R32F:     return FORMAT_A32B32G32R32F;
3574		case FORMAT_A32B32G32R32F:	return FORMAT_A32B32G32R32F;
3575		// Luminance formats
3576		case FORMAT_L8:				return FORMAT_L8;
3577		case FORMAT_A4L4:			return FORMAT_A8L8;
3578		case FORMAT_L16:			return FORMAT_L16;
3579		case FORMAT_A8L8:			return FORMAT_A8L8;
3580		case FORMAT_L16F:           return FORMAT_A32B32G32R32F;
3581		case FORMAT_A16L16F:        return FORMAT_A32B32G32R32F;
3582		case FORMAT_L32F:           return FORMAT_A32B32G32R32F;
3583		case FORMAT_A32L32F:        return FORMAT_A32B32G32R32F;
3584		// Depth/stencil formats
3585		case FORMAT_D16:
3586		case FORMAT_D32:
3587		case FORMAT_D24X8:
3588		case FORMAT_D24S8:
3589		case FORMAT_D24FS8:
3590			if(hasParent)   // Texture
3591			{
3592				return FORMAT_D32FS8_SHADOW;
3593			}
3594			else if(complementaryDepthBuffer)
3595			{
3596				return FORMAT_D32F_COMPLEMENTARY;
3597			}
3598			else
3599			{
3600				return FORMAT_D32F;
3601			}
3602		case FORMAT_D32F_LOCKABLE:  return FORMAT_D32F_LOCKABLE;
3603		case FORMAT_D32FS8_TEXTURE: return FORMAT_D32FS8_TEXTURE;
3604		case FORMAT_INTZ:           return FORMAT_D32FS8_TEXTURE;
3605		case FORMAT_DF24S8:         return FORMAT_D32FS8_SHADOW;
3606		case FORMAT_DF16S8:         return FORMAT_D32FS8_SHADOW;
3607		case FORMAT_YV12_BT601:     return FORMAT_YV12_BT601;
3608		case FORMAT_YV12_BT709:     return FORMAT_YV12_BT709;
3609		case FORMAT_YV12_JFIF:      return FORMAT_YV12_JFIF;
3610		default:
3611			ASSERT(false);
3612		}
3613
3614		return FORMAT_NULL;
3615	}
3616
3617	void Surface::setTexturePalette(unsigned int *palette)
3618	{
3619		Surface::palette = palette;
3620		Surface::paletteID++;
3621	}
3622
3623	void Surface::resolve()
3624	{
3625		if(internal.depth <= 1 || !internal.dirty || !renderTarget || internal.format == FORMAT_NULL)
3626		{
3627			return;
3628		}
3629
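		// Average all multisample slices into slice 0 in place. Up to 16 samples are
		// supported; each format below reduces the samples with a balanced tree of
		// pairwise averages (or an exact sum and scale for floating-point formats).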
3630		void *source = internal.lockRect(0, 0, 0, LOCK_READWRITE);
3631
3632		int quality = internal.depth;
3633		int width = internal.width;
3634		int height = internal.height;
3635		int pitch = internal.pitchB;
3636		int slice = internal.sliceB;
3637
3638		unsigned char *source0 = (unsigned char*)source;
3639		unsigned char *source1 = source0 + slice;
3640		unsigned char *source2 = source1 + slice;
3641		unsigned char *source3 = source2 + slice;
3642		unsigned char *source4 = source3 + slice;
3643		unsigned char *source5 = source4 + slice;
3644		unsigned char *source6 = source5 + slice;
3645		unsigned char *source7 = source6 + slice;
3646		unsigned char *source8 = source7 + slice;
3647		unsigned char *source9 = source8 + slice;
3648		unsigned char *sourceA = source9 + slice;
3649		unsigned char *sourceB = sourceA + slice;
3650		unsigned char *sourceC = sourceB + slice;
3651		unsigned char *sourceD = sourceC + slice;
3652		unsigned char *sourceE = sourceD + slice;
3653		unsigned char *sourceF = sourceE + slice;
3654
3655		if(internal.format == FORMAT_X8R8G8B8 || internal.format == FORMAT_A8R8G8B8 || internal.format == FORMAT_X8B8G8R8 || internal.format == FORMAT_A8B8G8R8)
3656		{
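			// 8-bit channels: _mm_avg_epu8 averages four pixels (16 bytes) per iteration
			// with round-to-nearest-up, so intermediate results stay within 8 bits.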
3657			if(CPUID::supportsSSE2() && (width % 4) == 0)
3658			{
3659				if(internal.depth == 2)
3660				{
3661					for(int y = 0; y < height; y++)
3662					{
3663						for(int x = 0; x < width; x += 4)
3664						{
3665							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
3666							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
3667
3668							c0 = _mm_avg_epu8(c0, c1);
3669
3670							_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
3671						}
3672
3673						source0 += pitch;
3674						source1 += pitch;
3675					}
3676				}
3677				else if(internal.depth == 4)
3678				{
3679					for(int y = 0; y < height; y++)
3680					{
3681						for(int x = 0; x < width; x += 4)
3682						{
3683							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
3684							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
3685							__m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x));
3686							__m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x));
3687
3688							c0 = _mm_avg_epu8(c0, c1);
3689							c2 = _mm_avg_epu8(c2, c3);
3690							c0 = _mm_avg_epu8(c0, c2);
3691
3692							_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
3693						}
3694
3695						source0 += pitch;
3696						source1 += pitch;
3697						source2 += pitch;
3698						source3 += pitch;
3699					}
3700				}
3701				else if(internal.depth == 8)
3702				{
3703					for(int y = 0; y < height; y++)
3704					{
3705						for(int x = 0; x < width; x += 4)
3706						{
3707							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
3708							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
3709							__m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x));
3710							__m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x));
3711							__m128i c4 = _mm_load_si128((__m128i*)(source4 + 4 * x));
3712							__m128i c5 = _mm_load_si128((__m128i*)(source5 + 4 * x));
3713							__m128i c6 = _mm_load_si128((__m128i*)(source6 + 4 * x));
3714							__m128i c7 = _mm_load_si128((__m128i*)(source7 + 4 * x));
3715
3716							c0 = _mm_avg_epu8(c0, c1);
3717							c2 = _mm_avg_epu8(c2, c3);
3718							c4 = _mm_avg_epu8(c4, c5);
3719							c6 = _mm_avg_epu8(c6, c7);
3720							c0 = _mm_avg_epu8(c0, c2);
3721							c4 = _mm_avg_epu8(c4, c6);
3722							c0 = _mm_avg_epu8(c0, c4);
3723
3724							_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
3725						}
3726
3727						source0 += pitch;
3728						source1 += pitch;
3729						source2 += pitch;
3730						source3 += pitch;
3731						source4 += pitch;
3732						source5 += pitch;
3733						source6 += pitch;
3734						source7 += pitch;
3735					}
3736				}
3737				else if(internal.depth == 16)
3738				{
3739					for(int y = 0; y < height; y++)
3740					{
3741						for(int x = 0; x < width; x += 4)
3742						{
3743							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
3744							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
3745							__m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x));
3746							__m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x));
3747							__m128i c4 = _mm_load_si128((__m128i*)(source4 + 4 * x));
3748							__m128i c5 = _mm_load_si128((__m128i*)(source5 + 4 * x));
3749							__m128i c6 = _mm_load_si128((__m128i*)(source6 + 4 * x));
3750							__m128i c7 = _mm_load_si128((__m128i*)(source7 + 4 * x));
3751							__m128i c8 = _mm_load_si128((__m128i*)(source8 + 4 * x));
3752							__m128i c9 = _mm_load_si128((__m128i*)(source9 + 4 * x));
3753							__m128i cA = _mm_load_si128((__m128i*)(sourceA + 4 * x));
3754							__m128i cB = _mm_load_si128((__m128i*)(sourceB + 4 * x));
3755							__m128i cC = _mm_load_si128((__m128i*)(sourceC + 4 * x));
3756							__m128i cD = _mm_load_si128((__m128i*)(sourceD + 4 * x));
3757							__m128i cE = _mm_load_si128((__m128i*)(sourceE + 4 * x));
3758							__m128i cF = _mm_load_si128((__m128i*)(sourceF + 4 * x));
3759
3760							c0 = _mm_avg_epu8(c0, c1);
3761							c2 = _mm_avg_epu8(c2, c3);
3762							c4 = _mm_avg_epu8(c4, c5);
3763							c6 = _mm_avg_epu8(c6, c7);
3764							c8 = _mm_avg_epu8(c8, c9);
3765							cA = _mm_avg_epu8(cA, cB);
3766							cC = _mm_avg_epu8(cC, cD);
3767							cE = _mm_avg_epu8(cE, cF);
3768							c0 = _mm_avg_epu8(c0, c2);
3769							c4 = _mm_avg_epu8(c4, c6);
3770							c8 = _mm_avg_epu8(c8, cA);
3771							cC = _mm_avg_epu8(cC, cE);
3772							c0 = _mm_avg_epu8(c0, c4);
3773							c8 = _mm_avg_epu8(c8, cC);
3774							c0 = _mm_avg_epu8(c0, c8);
3775
3776							_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
3777						}
3778
3779						source0 += pitch;
3780						source1 += pitch;
3781						source2 += pitch;
3782						source3 += pitch;
3783						source4 += pitch;
3784						source5 += pitch;
3785						source6 += pitch;
3786						source7 += pitch;
3787						source8 += pitch;
3788						source9 += pitch;
3789						sourceA += pitch;
3790						sourceB += pitch;
3791						sourceC += pitch;
3792						sourceD += pitch;
3793						sourceE += pitch;
3794						sourceF += pitch;
3795					}
3796				}
3797				else ASSERT(false);
3798			}
3799			else
3800			{
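				// Per-byte rounded average without unpacking: (x & y) + ((x ^ y) >> 1) is the
				// carry-free floor average, the 0x7F7F7F7F mask keeps shifted bits from crossing
				// byte lanes, and the final (x ^ y) & 0x01010101 term rounds up to match _mm_avg_epu8.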
3801				#define AVERAGE(x, y) (((x) & (y)) + ((((x) ^ (y)) >> 1) & 0x7F7F7F7F) + (((x) ^ (y)) & 0x01010101))
3802
3803				if(internal.depth == 2)
3804				{
3805					for(int y = 0; y < height; y++)
3806					{
3807						for(int x = 0; x < width; x++)
3808						{
3809							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
3810							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
3811
3812							c0 = AVERAGE(c0, c1);
3813
3814							*(unsigned int*)(source0 + 4 * x) = c0;
3815						}
3816
3817						source0 += pitch;
3818						source1 += pitch;
3819					}
3820				}
3821				else if(internal.depth == 4)
3822				{
3823					for(int y = 0; y < height; y++)
3824					{
3825						for(int x = 0; x < width; x++)
3826						{
3827							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
3828							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
3829							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
3830							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
3831
3832							c0 = AVERAGE(c0, c1);
3833							c2 = AVERAGE(c2, c3);
3834							c0 = AVERAGE(c0, c2);
3835
3836							*(unsigned int*)(source0 + 4 * x) = c0;
3837						}
3838
3839						source0 += pitch;
3840						source1 += pitch;
3841						source2 += pitch;
3842						source3 += pitch;
3843					}
3844				}
3845				else if(internal.depth == 8)
3846				{
3847					for(int y = 0; y < height; y++)
3848					{
3849						for(int x = 0; x < width; x++)
3850						{
3851							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
3852							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
3853							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
3854							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
3855							unsigned int c4 = *(unsigned int*)(source4 + 4 * x);
3856							unsigned int c5 = *(unsigned int*)(source5 + 4 * x);
3857							unsigned int c6 = *(unsigned int*)(source6 + 4 * x);
3858							unsigned int c7 = *(unsigned int*)(source7 + 4 * x);
3859
3860							c0 = AVERAGE(c0, c1);
3861							c2 = AVERAGE(c2, c3);
3862							c4 = AVERAGE(c4, c5);
3863							c6 = AVERAGE(c6, c7);
3864							c0 = AVERAGE(c0, c2);
3865							c4 = AVERAGE(c4, c6);
3866							c0 = AVERAGE(c0, c4);
3867
3868							*(unsigned int*)(source0 + 4 * x) = c0;
3869						}
3870
3871						source0 += pitch;
3872						source1 += pitch;
3873						source2 += pitch;
3874						source3 += pitch;
3875						source4 += pitch;
3876						source5 += pitch;
3877						source6 += pitch;
3878						source7 += pitch;
3879					}
3880				}
3881				else if(internal.depth == 16)
3882				{
3883					for(int y = 0; y < height; y++)
3884					{
3885						for(int x = 0; x < width; x++)
3886						{
3887							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
3888							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
3889							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
3890							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
3891							unsigned int c4 = *(unsigned int*)(source4 + 4 * x);
3892							unsigned int c5 = *(unsigned int*)(source5 + 4 * x);
3893							unsigned int c6 = *(unsigned int*)(source6 + 4 * x);
3894							unsigned int c7 = *(unsigned int*)(source7 + 4 * x);
3895							unsigned int c8 = *(unsigned int*)(source8 + 4 * x);
3896							unsigned int c9 = *(unsigned int*)(source9 + 4 * x);
3897							unsigned int cA = *(unsigned int*)(sourceA + 4 * x);
3898							unsigned int cB = *(unsigned int*)(sourceB + 4 * x);
3899							unsigned int cC = *(unsigned int*)(sourceC + 4 * x);
3900							unsigned int cD = *(unsigned int*)(sourceD + 4 * x);
3901							unsigned int cE = *(unsigned int*)(sourceE + 4 * x);
3902							unsigned int cF = *(unsigned int*)(sourceF + 4 * x);
3903
3904							c0 = AVERAGE(c0, c1);
3905							c2 = AVERAGE(c2, c3);
3906							c4 = AVERAGE(c4, c5);
3907							c6 = AVERAGE(c6, c7);
3908							c8 = AVERAGE(c8, c9);
3909							cA = AVERAGE(cA, cB);
3910							cC = AVERAGE(cC, cD);
3911							cE = AVERAGE(cE, cF);
3912							c0 = AVERAGE(c0, c2);
3913							c4 = AVERAGE(c4, c6);
3914							c8 = AVERAGE(c8, cA);
3915							cC = AVERAGE(cC, cE);
3916							c0 = AVERAGE(c0, c4);
3917							c8 = AVERAGE(c8, cC);
3918							c0 = AVERAGE(c0, c8);
3919
3920							*(unsigned int*)(source0 + 4 * x) = c0;
3921						}
3922
3923						source0 += pitch;
3924						source1 += pitch;
3925						source2 += pitch;
3926						source3 += pitch;
3927						source4 += pitch;
3928						source5 += pitch;
3929						source6 += pitch;
3930						source7 += pitch;
3931						source8 += pitch;
3932						source9 += pitch;
3933						sourceA += pitch;
3934						sourceB += pitch;
3935						sourceC += pitch;
3936						sourceD += pitch;
3937						sourceE += pitch;
3938						sourceF += pitch;
3939					}
3940				}
3941				else ASSERT(false);
3942
3943				#undef AVERAGE
3944			}
3945		}
3946		else if(internal.format == FORMAT_G16R16)
3947		{
3948			if(CPUID::supportsSSE2() && (width % 4) == 0)
3949			{
3950				if(internal.depth == 2)
3951				{
3952					for(int y = 0; y < height; y++)
3953					{
3954						for(int x = 0; x < width; x += 4)
3955						{
3956							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
3957							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
3958
3959							c0 = _mm_avg_epu16(c0, c1);
3960
3961							_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
3962						}
3963
3964						source0 += pitch;
3965						source1 += pitch;
3966					}
3967				}
3968				else if(internal.depth == 4)
3969				{
3970					for(int y = 0; y < height; y++)
3971					{
3972						for(int x = 0; x < width; x += 4)
3973						{
3974							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
3975							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
3976							__m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x));
3977							__m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x));
3978
3979							c0 = _mm_avg_epu16(c0, c1);
3980							c2 = _mm_avg_epu16(c2, c3);
3981							c0 = _mm_avg_epu16(c0, c2);
3982
3983							_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
3984						}
3985
3986						source0 += pitch;
3987						source1 += pitch;
3988						source2 += pitch;
3989						source3 += pitch;
3990					}
3991				}
3992				else if(internal.depth == 8)
3993				{
3994					for(int y = 0; y < height; y++)
3995					{
3996						for(int x = 0; x < width; x += 4)
3997						{
3998							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
3999							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
4000							__m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x));
4001							__m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x));
4002							__m128i c4 = _mm_load_si128((__m128i*)(source4 + 4 * x));
4003							__m128i c5 = _mm_load_si128((__m128i*)(source5 + 4 * x));
4004							__m128i c6 = _mm_load_si128((__m128i*)(source6 + 4 * x));
4005							__m128i c7 = _mm_load_si128((__m128i*)(source7 + 4 * x));
4006
4007							c0 = _mm_avg_epu16(c0, c1);
4008							c2 = _mm_avg_epu16(c2, c3);
4009							c4 = _mm_avg_epu16(c4, c5);
4010							c6 = _mm_avg_epu16(c6, c7);
4011							c0 = _mm_avg_epu16(c0, c2);
4012							c4 = _mm_avg_epu16(c4, c6);
4013							c0 = _mm_avg_epu16(c0, c4);
4014
4015							_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
4016						}
4017
4018						source0 += pitch;
4019						source1 += pitch;
4020						source2 += pitch;
4021						source3 += pitch;
4022						source4 += pitch;
4023						source5 += pitch;
4024						source6 += pitch;
4025						source7 += pitch;
4026					}
4027				}
4028				else if(internal.depth == 16)
4029				{
4030					for(int y = 0; y < height; y++)
4031					{
4032						for(int x = 0; x < width; x += 4)
4033						{
4034							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
4035							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
4036							__m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x));
4037							__m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x));
4038							__m128i c4 = _mm_load_si128((__m128i*)(source4 + 4 * x));
4039							__m128i c5 = _mm_load_si128((__m128i*)(source5 + 4 * x));
4040							__m128i c6 = _mm_load_si128((__m128i*)(source6 + 4 * x));
4041							__m128i c7 = _mm_load_si128((__m128i*)(source7 + 4 * x));
4042							__m128i c8 = _mm_load_si128((__m128i*)(source8 + 4 * x));
4043							__m128i c9 = _mm_load_si128((__m128i*)(source9 + 4 * x));
4044							__m128i cA = _mm_load_si128((__m128i*)(sourceA + 4 * x));
4045							__m128i cB = _mm_load_si128((__m128i*)(sourceB + 4 * x));
4046							__m128i cC = _mm_load_si128((__m128i*)(sourceC + 4 * x));
4047							__m128i cD = _mm_load_si128((__m128i*)(sourceD + 4 * x));
4048							__m128i cE = _mm_load_si128((__m128i*)(sourceE + 4 * x));
4049							__m128i cF = _mm_load_si128((__m128i*)(sourceF + 4 * x));
4050
4051							c0 = _mm_avg_epu16(c0, c1);
4052							c2 = _mm_avg_epu16(c2, c3);
4053							c4 = _mm_avg_epu16(c4, c5);
4054							c6 = _mm_avg_epu16(c6, c7);
4055							c8 = _mm_avg_epu16(c8, c9);
4056							cA = _mm_avg_epu16(cA, cB);
4057							cC = _mm_avg_epu16(cC, cD);
4058							cE = _mm_avg_epu16(cE, cF);
4059							c0 = _mm_avg_epu16(c0, c2);
4060							c4 = _mm_avg_epu16(c4, c6);
4061							c8 = _mm_avg_epu16(c8, cA);
4062							cC = _mm_avg_epu16(cC, cE);
4063							c0 = _mm_avg_epu16(c0, c4);
4064							c8 = _mm_avg_epu16(c8, cC);
4065							c0 = _mm_avg_epu16(c0, c8);
4066
4067							_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
4068						}
4069
4070						source0 += pitch;
4071						source1 += pitch;
4072						source2 += pitch;
4073						source3 += pitch;
4074						source4 += pitch;
4075						source5 += pitch;
4076						source6 += pitch;
4077						source7 += pitch;
4078						source8 += pitch;
4079						source9 += pitch;
4080						sourceA += pitch;
4081						sourceB += pitch;
4082						sourceC += pitch;
4083						sourceD += pitch;
4084						sourceE += pitch;
4085						sourceF += pitch;
4086					}
4087				}
4088				else ASSERT(false);
4089			}
4090			else
4091			{
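				// Same rounded-average trick as above, with the masks widened to 16-bit lanes.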
4092				#define AVERAGE(x, y) (((x) & (y)) + ((((x) ^ (y)) >> 1) & 0x7FFF7FFF) + (((x) ^ (y)) & 0x00010001))
4093
4094				if(internal.depth == 2)
4095				{
4096					for(int y = 0; y < height; y++)
4097					{
4098						for(int x = 0; x < width; x++)
4099						{
4100							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
4101							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
4102
4103							c0 = AVERAGE(c0, c1);
4104
4105							*(unsigned int*)(source0 + 4 * x) = c0;
4106						}
4107
4108						source0 += pitch;
4109						source1 += pitch;
4110					}
4111				}
4112				else if(internal.depth == 4)
4113				{
4114					for(int y = 0; y < height; y++)
4115					{
4116						for(int x = 0; x < width; x++)
4117						{
4118							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
4119							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
4120							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
4121							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
4122
4123							c0 = AVERAGE(c0, c1);
4124							c2 = AVERAGE(c2, c3);
4125							c0 = AVERAGE(c0, c2);
4126
4127							*(unsigned int*)(source0 + 4 * x) = c0;
4128						}
4129
4130						source0 += pitch;
4131						source1 += pitch;
4132						source2 += pitch;
4133						source3 += pitch;
4134					}
4135				}
4136				else if(internal.depth == 8)
4137				{
4138					for(int y = 0; y < height; y++)
4139					{
4140						for(int x = 0; x < width; x++)
4141						{
4142							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
4143							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
4144							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
4145							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
4146							unsigned int c4 = *(unsigned int*)(source4 + 4 * x);
4147							unsigned int c5 = *(unsigned int*)(source5 + 4 * x);
4148							unsigned int c6 = *(unsigned int*)(source6 + 4 * x);
4149							unsigned int c7 = *(unsigned int*)(source7 + 4 * x);
4150
4151							c0 = AVERAGE(c0, c1);
4152							c2 = AVERAGE(c2, c3);
4153							c4 = AVERAGE(c4, c5);
4154							c6 = AVERAGE(c6, c7);
4155							c0 = AVERAGE(c0, c2);
4156							c4 = AVERAGE(c4, c6);
4157							c0 = AVERAGE(c0, c4);
4158
4159							*(unsigned int*)(source0 + 4 * x) = c0;
4160						}
4161
4162						source0 += pitch;
4163						source1 += pitch;
4164						source2 += pitch;
4165						source3 += pitch;
4166						source4 += pitch;
4167						source5 += pitch;
4168						source6 += pitch;
4169						source7 += pitch;
4170					}
4171				}
4172				else if(internal.depth == 16)
4173				{
4174					for(int y = 0; y < height; y++)
4175					{
4176						for(int x = 0; x < width; x++)
4177						{
4178							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
4179							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
4180							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
4181							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
4182							unsigned int c4 = *(unsigned int*)(source4 + 4 * x);
4183							unsigned int c5 = *(unsigned int*)(source5 + 4 * x);
4184							unsigned int c6 = *(unsigned int*)(source6 + 4 * x);
4185							unsigned int c7 = *(unsigned int*)(source7 + 4 * x);
4186							unsigned int c8 = *(unsigned int*)(source8 + 4 * x);
4187							unsigned int c9 = *(unsigned int*)(source9 + 4 * x);
4188							unsigned int cA = *(unsigned int*)(sourceA + 4 * x);
4189							unsigned int cB = *(unsigned int*)(sourceB + 4 * x);
4190							unsigned int cC = *(unsigned int*)(sourceC + 4 * x);
4191							unsigned int cD = *(unsigned int*)(sourceD + 4 * x);
4192							unsigned int cE = *(unsigned int*)(sourceE + 4 * x);
4193							unsigned int cF = *(unsigned int*)(sourceF + 4 * x);
4194
4195							c0 = AVERAGE(c0, c1);
4196							c2 = AVERAGE(c2, c3);
4197							c4 = AVERAGE(c4, c5);
4198							c6 = AVERAGE(c6, c7);
4199							c8 = AVERAGE(c8, c9);
4200							cA = AVERAGE(cA, cB);
4201							cC = AVERAGE(cC, cD);
4202							cE = AVERAGE(cE, cF);
4203							c0 = AVERAGE(c0, c2);
4204							c4 = AVERAGE(c4, c6);
4205							c8 = AVERAGE(c8, cA);
4206							cC = AVERAGE(cC, cE);
4207							c0 = AVERAGE(c0, c4);
4208							c8 = AVERAGE(c8, cC);
4209							c0 = AVERAGE(c0, c8);
4210
4211							*(unsigned int*)(source0 + 4 * x) = c0;
4212						}
4213
4214						source0 += pitch;
4215						source1 += pitch;
4216						source2 += pitch;
4217						source3 += pitch;
4218						source4 += pitch;
4219						source5 += pitch;
4220						source6 += pitch;
4221						source7 += pitch;
4222						source8 += pitch;
4223						source9 += pitch;
4224						sourceA += pitch;
4225						sourceB += pitch;
4226						sourceC += pitch;
4227						sourceD += pitch;
4228						sourceE += pitch;
4229						sourceF += pitch;
4230					}
4231				}
4232				else ASSERT(false);
4233
4234				#undef AVERAGE
4235			}
4236		}
4237		else if(internal.format == FORMAT_A16B16G16R16)
4238		{
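			// 64-bit pixels: each __m128i holds two A16B16G16R16 pixels, hence the
			// width % 2 requirement and the 8 * x byte addressing.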
4239			if(CPUID::supportsSSE2() && (width % 2) == 0)
4240			{
4241				if(internal.depth == 2)
4242				{
4243					for(int y = 0; y < height; y++)
4244					{
4245						for(int x = 0; x < width; x += 2)
4246						{
4247							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 8 * x));
4248							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 8 * x));
4249
4250							c0 = _mm_avg_epu16(c0, c1);
4251
4252							_mm_store_si128((__m128i*)(source0 + 8 * x), c0);
4253						}
4254
4255						source0 += pitch;
4256						source1 += pitch;
4257					}
4258				}
4259				else if(internal.depth == 4)
4260				{
4261					for(int y = 0; y < height; y++)
4262					{
4263						for(int x = 0; x < width; x += 2)
4264						{
4265							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 8 * x));
4266							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 8 * x));
4267							__m128i c2 = _mm_load_si128((__m128i*)(source2 + 8 * x));
4268							__m128i c3 = _mm_load_si128((__m128i*)(source3 + 8 * x));
4269
4270							c0 = _mm_avg_epu16(c0, c1);
4271							c2 = _mm_avg_epu16(c2, c3);
4272							c0 = _mm_avg_epu16(c0, c2);
4273
4274							_mm_store_si128((__m128i*)(source0 + 8 * x), c0);
4275						}
4276
4277						source0 += pitch;
4278						source1 += pitch;
4279						source2 += pitch;
4280						source3 += pitch;
4281					}
4282				}
4283				else if(internal.depth == 8)
4284				{
4285					for(int y = 0; y < height; y++)
4286					{
4287						for(int x = 0; x < width; x += 2)
4288						{
4289							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 8 * x));
4290							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 8 * x));
4291							__m128i c2 = _mm_load_si128((__m128i*)(source2 + 8 * x));
4292							__m128i c3 = _mm_load_si128((__m128i*)(source3 + 8 * x));
4293							__m128i c4 = _mm_load_si128((__m128i*)(source4 + 8 * x));
4294							__m128i c5 = _mm_load_si128((__m128i*)(source5 + 8 * x));
4295							__m128i c6 = _mm_load_si128((__m128i*)(source6 + 8 * x));
4296							__m128i c7 = _mm_load_si128((__m128i*)(source7 + 8 * x));
4297
4298							c0 = _mm_avg_epu16(c0, c1);
4299							c2 = _mm_avg_epu16(c2, c3);
4300							c4 = _mm_avg_epu16(c4, c5);
4301							c6 = _mm_avg_epu16(c6, c7);
4302							c0 = _mm_avg_epu16(c0, c2);
4303							c4 = _mm_avg_epu16(c4, c6);
4304							c0 = _mm_avg_epu16(c0, c4);
4305
4306							_mm_store_si128((__m128i*)(source0 + 8 * x), c0);
4307						}
4308
4309						source0 += pitch;
4310						source1 += pitch;
4311						source2 += pitch;
4312						source3 += pitch;
4313						source4 += pitch;
4314						source5 += pitch;
4315						source6 += pitch;
4316						source7 += pitch;
4317					}
4318				}
4319				else if(internal.depth == 16)
4320				{
4321					for(int y = 0; y < height; y++)
4322					{
4323						for(int x = 0; x < width; x += 2)
4324						{
4325							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 8 * x));
4326							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 8 * x));
4327							__m128i c2 = _mm_load_si128((__m128i*)(source2 + 8 * x));
4328							__m128i c3 = _mm_load_si128((__m128i*)(source3 + 8 * x));
4329							__m128i c4 = _mm_load_si128((__m128i*)(source4 + 8 * x));
4330							__m128i c5 = _mm_load_si128((__m128i*)(source5 + 8 * x));
4331							__m128i c6 = _mm_load_si128((__m128i*)(source6 + 8 * x));
4332							__m128i c7 = _mm_load_si128((__m128i*)(source7 + 8 * x));
4333							__m128i c8 = _mm_load_si128((__m128i*)(source8 + 8 * x));
4334							__m128i c9 = _mm_load_si128((__m128i*)(source9 + 8 * x));
4335							__m128i cA = _mm_load_si128((__m128i*)(sourceA + 8 * x));
4336							__m128i cB = _mm_load_si128((__m128i*)(sourceB + 8 * x));
4337							__m128i cC = _mm_load_si128((__m128i*)(sourceC + 8 * x));
4338							__m128i cD = _mm_load_si128((__m128i*)(sourceD + 8 * x));
4339							__m128i cE = _mm_load_si128((__m128i*)(sourceE + 8 * x));
4340							__m128i cF = _mm_load_si128((__m128i*)(sourceF + 8 * x));
4341
4342							c0 = _mm_avg_epu16(c0, c1);
4343							c2 = _mm_avg_epu16(c2, c3);
4344							c4 = _mm_avg_epu16(c4, c5);
4345							c6 = _mm_avg_epu16(c6, c7);
4346							c8 = _mm_avg_epu16(c8, c9);
4347							cA = _mm_avg_epu16(cA, cB);
4348							cC = _mm_avg_epu16(cC, cD);
4349							cE = _mm_avg_epu16(cE, cF);
4350							c0 = _mm_avg_epu16(c0, c2);
4351							c4 = _mm_avg_epu16(c4, c6);
4352							c8 = _mm_avg_epu16(c8, cA);
4353							cC = _mm_avg_epu16(cC, cE);
4354							c0 = _mm_avg_epu16(c0, c4);
4355							c8 = _mm_avg_epu16(c8, cC);
4356							c0 = _mm_avg_epu16(c0, c8);
4357
4358							_mm_store_si128((__m128i*)(source0 + 8 * x), c0);
4359						}
4360
4361						source0 += pitch;
4362						source1 += pitch;
4363						source2 += pitch;
4364						source3 += pitch;
4365						source4 += pitch;
4366						source5 += pitch;
4367						source6 += pitch;
4368						source7 += pitch;
4369						source8 += pitch;
4370						source9 += pitch;
4371						sourceA += pitch;
4372						sourceB += pitch;
4373						sourceC += pitch;
4374						sourceD += pitch;
4375						sourceE += pitch;
4376						sourceF += pitch;
4377					}
4378				}
4379				else ASSERT(false);
4380			}
4381			else
4382			{
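				// Scalar fallback processes each 8-byte pixel as two 32-bit words of two
				// 16-bit channels each, hence the 2 * width loop bound.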
4383				#define AVERAGE(x, y) (((x) & (y)) + ((((x) ^ (y)) >> 1) & 0x7FFF7FFF) + (((x) ^ (y)) & 0x00010001))
4384
4385				if(internal.depth == 2)
4386				{
4387					for(int y = 0; y < height; y++)
4388					{
4389						for(int x = 0; x < 2 * width; x++)
4390						{
4391							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
4392							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
4393
4394							c0 = AVERAGE(c0, c1);
4395
4396							*(unsigned int*)(source0 + 4 * x) = c0;
4397						}
4398
4399						source0 += pitch;
4400						source1 += pitch;
4401					}
4402				}
4403				else if(internal.depth == 4)
4404				{
4405					for(int y = 0; y < height; y++)
4406					{
4407						for(int x = 0; x < 2 * width; x++)
4408						{
4409							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
4410							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
4411							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
4412							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
4413
4414							c0 = AVERAGE(c0, c1);
4415							c2 = AVERAGE(c2, c3);
4416							c0 = AVERAGE(c0, c2);
4417
4418							*(unsigned int*)(source0 + 4 * x) = c0;
4419						}
4420
4421						source0 += pitch;
4422						source1 += pitch;
4423						source2 += pitch;
4424						source3 += pitch;
4425					}
4426				}
4427				else if(internal.depth == 8)
4428				{
4429					for(int y = 0; y < height; y++)
4430					{
4431						for(int x = 0; x < 2 * width; x++)
4432						{
4433							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
4434							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
4435							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
4436							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
4437							unsigned int c4 = *(unsigned int*)(source4 + 4 * x);
4438							unsigned int c5 = *(unsigned int*)(source5 + 4 * x);
4439							unsigned int c6 = *(unsigned int*)(source6 + 4 * x);
4440							unsigned int c7 = *(unsigned int*)(source7 + 4 * x);
4441
4442							c0 = AVERAGE(c0, c1);
4443							c2 = AVERAGE(c2, c3);
4444							c4 = AVERAGE(c4, c5);
4445							c6 = AVERAGE(c6, c7);
4446							c0 = AVERAGE(c0, c2);
4447							c4 = AVERAGE(c4, c6);
4448							c0 = AVERAGE(c0, c4);
4449
4450							*(unsigned int*)(source0 + 4 * x) = c0;
4451						}
4452
4453						source0 += pitch;
4454						source1 += pitch;
4455						source2 += pitch;
4456						source3 += pitch;
4457						source4 += pitch;
4458						source5 += pitch;
4459						source6 += pitch;
4460						source7 += pitch;
4461					}
4462				}
4463				else if(internal.depth == 16)
4464				{
4465					for(int y = 0; y < height; y++)
4466					{
4467						for(int x = 0; x < 2 * width; x++)
4468						{
4469							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
4470							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
4471							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
4472							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
4473							unsigned int c4 = *(unsigned int*)(source4 + 4 * x);
4474							unsigned int c5 = *(unsigned int*)(source5 + 4 * x);
4475							unsigned int c6 = *(unsigned int*)(source6 + 4 * x);
4476							unsigned int c7 = *(unsigned int*)(source7 + 4 * x);
4477							unsigned int c8 = *(unsigned int*)(source8 + 4 * x);
4478							unsigned int c9 = *(unsigned int*)(source9 + 4 * x);
4479							unsigned int cA = *(unsigned int*)(sourceA + 4 * x);
4480							unsigned int cB = *(unsigned int*)(sourceB + 4 * x);
4481							unsigned int cC = *(unsigned int*)(sourceC + 4 * x);
4482							unsigned int cD = *(unsigned int*)(sourceD + 4 * x);
4483							unsigned int cE = *(unsigned int*)(sourceE + 4 * x);
4484							unsigned int cF = *(unsigned int*)(sourceF + 4 * x);
4485
4486							c0 = AVERAGE(c0, c1);
4487							c2 = AVERAGE(c2, c3);
4488							c4 = AVERAGE(c4, c5);
4489							c6 = AVERAGE(c6, c7);
4490							c8 = AVERAGE(c8, c9);
4491							cA = AVERAGE(cA, cB);
4492							cC = AVERAGE(cC, cD);
4493							cE = AVERAGE(cE, cF);
4494							c0 = AVERAGE(c0, c2);
4495							c4 = AVERAGE(c4, c6);
4496							c8 = AVERAGE(c8, cA);
4497							cC = AVERAGE(cC, cE);
4498							c0 = AVERAGE(c0, c4);
4499							c8 = AVERAGE(c8, cC);
4500							c0 = AVERAGE(c0, c8);
4501
4502							*(unsigned int*)(source0 + 4 * x) = c0;
4503						}
4504
4505						source0 += pitch;
4506						source1 += pitch;
4507						source2 += pitch;
4508						source3 += pitch;
4509						source4 += pitch;
4510						source5 += pitch;
4511						source6 += pitch;
4512						source7 += pitch;
4513						source8 += pitch;
4514						source9 += pitch;
4515						sourceA += pitch;
4516						sourceB += pitch;
4517						sourceC += pitch;
4518						sourceD += pitch;
4519						sourceE += pitch;
4520						sourceF += pitch;
4521					}
4522				}
4523				else ASSERT(false);
4524
4525				#undef AVERAGE
4526			}
4527		}
4528		else if(internal.format == FORMAT_R32F)
4529		{
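			// Floating-point formats are resolved exactly: sum all samples, then scale by 1/N.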
4530			if(CPUID::supportsSSE() && (width % 4) == 0)
4531			{
4532				if(internal.depth == 2)
4533				{
4534					for(int y = 0; y < height; y++)
4535					{
4536						for(int x = 0; x < width; x += 4)
4537						{
4538							__m128 c0 = _mm_load_ps((float*)(source0 + 4 * x));
4539							__m128 c1 = _mm_load_ps((float*)(source1 + 4 * x));
4540
4541							c0 = _mm_add_ps(c0, c1);
4542							c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 2.0f));
4543
4544							_mm_store_ps((float*)(source0 + 4 * x), c0);
4545						}
4546
4547						source0 += pitch;
4548						source1 += pitch;
4549					}
4550				}
4551				else if(internal.depth == 4)
4552				{
4553					for(int y = 0; y < height; y++)
4554					{
4555						for(int x = 0; x < width; x += 4)
4556						{
4557							__m128 c0 = _mm_load_ps((float*)(source0 + 4 * x));
4558							__m128 c1 = _mm_load_ps((float*)(source1 + 4 * x));
4559							__m128 c2 = _mm_load_ps((float*)(source2 + 4 * x));
4560							__m128 c3 = _mm_load_ps((float*)(source3 + 4 * x));
4561
4562							c0 = _mm_add_ps(c0, c1);
4563							c2 = _mm_add_ps(c2, c3);
4564							c0 = _mm_add_ps(c0, c2);
4565							c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 4.0f));
4566
4567							_mm_store_ps((float*)(source0 + 4 * x), c0);
4568						}
4569
4570						source0 += pitch;
4571						source1 += pitch;
4572						source2 += pitch;
4573						source3 += pitch;
4574					}
4575				}
4576				else if(internal.depth == 8)
4577				{
4578					for(int y = 0; y < height; y++)
4579					{
4580						for(int x = 0; x < width; x += 4)
4581						{
4582							__m128 c0 = _mm_load_ps((float*)(source0 + 4 * x));
4583							__m128 c1 = _mm_load_ps((float*)(source1 + 4 * x));
4584							__m128 c2 = _mm_load_ps((float*)(source2 + 4 * x));
4585							__m128 c3 = _mm_load_ps((float*)(source3 + 4 * x));
4586							__m128 c4 = _mm_load_ps((float*)(source4 + 4 * x));
4587							__m128 c5 = _mm_load_ps((float*)(source5 + 4 * x));
4588							__m128 c6 = _mm_load_ps((float*)(source6 + 4 * x));
4589							__m128 c7 = _mm_load_ps((float*)(source7 + 4 * x));
4590
4591							c0 = _mm_add_ps(c0, c1);
4592							c2 = _mm_add_ps(c2, c3);
4593							c4 = _mm_add_ps(c4, c5);
4594							c6 = _mm_add_ps(c6, c7);
4595							c0 = _mm_add_ps(c0, c2);
4596							c4 = _mm_add_ps(c4, c6);
4597							c0 = _mm_add_ps(c0, c4);
4598							c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 8.0f));
4599
4600							_mm_store_ps((float*)(source0 + 4 * x), c0);
4601						}
4602
4603						source0 += pitch;
4604						source1 += pitch;
4605						source2 += pitch;
4606						source3 += pitch;
4607						source4 += pitch;
4608						source5 += pitch;
4609						source6 += pitch;
4610						source7 += pitch;
4611					}
4612				}
4613				else if(internal.depth == 16)
4614				{
4615					for(int y = 0; y < height; y++)
4616					{
4617						for(int x = 0; x < width; x += 4)
4618						{
4619							__m128 c0 = _mm_load_ps((float*)(source0 + 4 * x));
4620							__m128 c1 = _mm_load_ps((float*)(source1 + 4 * x));
4621							__m128 c2 = _mm_load_ps((float*)(source2 + 4 * x));
4622							__m128 c3 = _mm_load_ps((float*)(source3 + 4 * x));
4623							__m128 c4 = _mm_load_ps((float*)(source4 + 4 * x));
4624							__m128 c5 = _mm_load_ps((float*)(source5 + 4 * x));
4625							__m128 c6 = _mm_load_ps((float*)(source6 + 4 * x));
4626							__m128 c7 = _mm_load_ps((float*)(source7 + 4 * x));
4627							__m128 c8 = _mm_load_ps((float*)(source8 + 4 * x));
4628							__m128 c9 = _mm_load_ps((float*)(source9 + 4 * x));
4629							__m128 cA = _mm_load_ps((float*)(sourceA + 4 * x));
4630							__m128 cB = _mm_load_ps((float*)(sourceB + 4 * x));
4631							__m128 cC = _mm_load_ps((float*)(sourceC + 4 * x));
4632							__m128 cD = _mm_load_ps((float*)(sourceD + 4 * x));
4633							__m128 cE = _mm_load_ps((float*)(sourceE + 4 * x));
4634							__m128 cF = _mm_load_ps((float*)(sourceF + 4 * x));
4635
4636							c0 = _mm_add_ps(c0, c1);
4637							c2 = _mm_add_ps(c2, c3);
4638							c4 = _mm_add_ps(c4, c5);
4639							c6 = _mm_add_ps(c6, c7);
4640							c8 = _mm_add_ps(c8, c9);
4641							cA = _mm_add_ps(cA, cB);
4642							cC = _mm_add_ps(cC, cD);
4643							cE = _mm_add_ps(cE, cF);
4644							c0 = _mm_add_ps(c0, c2);
4645							c4 = _mm_add_ps(c4, c6);
4646							c8 = _mm_add_ps(c8, cA);
4647							cC = _mm_add_ps(cC, cE);
4648							c0 = _mm_add_ps(c0, c4);
4649							c8 = _mm_add_ps(c8, cC);
4650							c0 = _mm_add_ps(c0, c8);
4651							c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 16.0f));
4652
4653							_mm_store_ps((float*)(source0 + 4 * x), c0);
4654						}
4655
4656						source0 += pitch;
4657						source1 += pitch;
4658						source2 += pitch;
4659						source3 += pitch;
4660						source4 += pitch;
4661						source5 += pitch;
4662						source6 += pitch;
4663						source7 += pitch;
4664						source8 += pitch;
4665						source9 += pitch;
4666						sourceA += pitch;
4667						sourceB += pitch;
4668						sourceC += pitch;
4669						sourceD += pitch;
4670						sourceE += pitch;
4671						sourceF += pitch;
4672					}
4673				}
4674				else ASSERT(false);
4675			}
4676			else
4677			{
4678				if(internal.depth == 2)
4679				{
4680					for(int y = 0; y < height; y++)
4681					{
4682						for(int x = 0; x < width; x++)
4683						{
4684							float c0 = *(float*)(source0 + 4 * x);
4685							float c1 = *(float*)(source1 + 4 * x);
4686
4687							c0 = c0 + c1;
4688							c0 *= 1.0f / 2.0f;
4689
4690							*(float*)(source0 + 4 * x) = c0;
4691						}
4692
4693						source0 += pitch;
4694						source1 += pitch;
4695					}
4696				}
4697				else if(internal.depth == 4)
4698				{
4699					for(int y = 0; y < height; y++)
4700					{
4701						for(int x = 0; x < width; x++)
4702						{
4703							float c0 = *(float*)(source0 + 4 * x);
4704							float c1 = *(float*)(source1 + 4 * x);
4705							float c2 = *(float*)(source2 + 4 * x);
4706							float c3 = *(float*)(source3 + 4 * x);
4707
4708							c0 = c0 + c1;
4709							c2 = c2 + c3;
4710							c0 = c0 + c2;
4711							c0 *= 1.0f / 4.0f;
4712
4713							*(float*)(source0 + 4 * x) = c0;
4714						}
4715
4716						source0 += pitch;
4717						source1 += pitch;
4718						source2 += pitch;
4719						source3 += pitch;
4720					}
4721				}
4722				else if(internal.depth == 8)
4723				{
4724					for(int y = 0; y < height; y++)
4725					{
4726						for(int x = 0; x < width; x++)
4727						{
4728							float c0 = *(float*)(source0 + 4 * x);
4729							float c1 = *(float*)(source1 + 4 * x);
4730							float c2 = *(float*)(source2 + 4 * x);
4731							float c3 = *(float*)(source3 + 4 * x);
4732							float c4 = *(float*)(source4 + 4 * x);
4733							float c5 = *(float*)(source5 + 4 * x);
4734							float c6 = *(float*)(source6 + 4 * x);
4735							float c7 = *(float*)(source7 + 4 * x);
4736
4737							c0 = c0 + c1;
4738							c2 = c2 + c3;
4739							c4 = c4 + c5;
4740							c6 = c6 + c7;
4741							c0 = c0 + c2;
4742							c4 = c4 + c6;
4743							c0 = c0 + c4;
4744							c0 *= 1.0f / 8.0f;
4745
4746							*(float*)(source0 + 4 * x) = c0;
4747						}
4748
4749						source0 += pitch;
4750						source1 += pitch;
4751						source2 += pitch;
4752						source3 += pitch;
4753						source4 += pitch;
4754						source5 += pitch;
4755						source6 += pitch;
4756						source7 += pitch;
4757					}
4758				}
4759				else if(internal.depth == 16)
4760				{
4761					for(int y = 0; y < height; y++)
4762					{
4763						for(int x = 0; x < width; x++)
4764						{
4765							float c0 = *(float*)(source0 + 4 * x);
4766							float c1 = *(float*)(source1 + 4 * x);
4767							float c2 = *(float*)(source2 + 4 * x);
4768							float c3 = *(float*)(source3 + 4 * x);
4769							float c4 = *(float*)(source4 + 4 * x);
4770							float c5 = *(float*)(source5 + 4 * x);
4771							float c6 = *(float*)(source6 + 4 * x);
4772							float c7 = *(float*)(source7 + 4 * x);
4773							float c8 = *(float*)(source8 + 4 * x);
4774							float c9 = *(float*)(source9 + 4 * x);
4775							float cA = *(float*)(sourceA + 4 * x);
4776							float cB = *(float*)(sourceB + 4 * x);
4777							float cC = *(float*)(sourceC + 4 * x);
4778							float cD = *(float*)(sourceD + 4 * x);
4779							float cE = *(float*)(sourceE + 4 * x);
4780							float cF = *(float*)(sourceF + 4 * x);
4781
4782							c0 = c0 + c1;
4783							c2 = c2 + c3;
4784							c4 = c4 + c5;
4785							c6 = c6 + c7;
4786							c8 = c8 + c9;
4787							cA = cA + cB;
4788							cC = cC + cD;
4789							cE = cE + cF;
4790							c0 = c0 + c2;
4791							c4 = c4 + c6;
4792							c8 = c8 + cA;
4793							cC = cC + cE;
4794							c0 = c0 + c4;
4795							c8 = c8 + cC;
4796							c0 = c0 + c8;
4797							c0 *= 1.0f / 16.0f;
4798
4799							*(float*)(source0 + 4 * x) = c0;
4800						}
4801
4802						source0 += pitch;
4803						source1 += pitch;
4804						source2 += pitch;
4805						source3 += pitch;
4806						source4 += pitch;
4807						source5 += pitch;
4808						source6 += pitch;
4809						source7 += pitch;
4810						source8 += pitch;
4811						source9 += pitch;
4812						sourceA += pitch;
4813						sourceB += pitch;
4814						sourceC += pitch;
4815						sourceD += pitch;
4816						sourceE += pitch;
4817						sourceF += pitch;
4818					}
4819				}
4820				else ASSERT(false);
4821			}
4822		}
4823		else if(internal.format == FORMAT_G32R32F)
4824		{
4825			if(CPUID::supportsSSE() && (width % 2) == 0)
4826			{
4827				if(internal.depth == 2)
4828				{
4829					for(int y = 0; y < height; y++)
4830					{
4831						for(int x = 0; x < width; x += 2)
4832						{
4833							__m128 c0 = _mm_load_ps((float*)(source0 + 8 * x));
4834							__m128 c1 = _mm_load_ps((float*)(source1 + 8 * x));
4835
4836							c0 = _mm_add_ps(c0, c1);
4837							c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 2.0f));
4838
4839							_mm_store_ps((float*)(source0 + 8 * x), c0);
4840						}
4841
4842						source0 += pitch;
4843						source1 += pitch;
4844					}
4845				}
4846				else if(internal.depth == 4)
4847				{
4848					for(int y = 0; y < height; y++)
4849					{
4850						for(int x = 0; x < width; x += 2)
4851						{
4852							__m128 c0 = _mm_load_ps((float*)(source0 + 8 * x));
4853							__m128 c1 = _mm_load_ps((float*)(source1 + 8 * x));
4854							__m128 c2 = _mm_load_ps((float*)(source2 + 8 * x));
4855							__m128 c3 = _mm_load_ps((float*)(source3 + 8 * x));
4856
4857							c0 = _mm_add_ps(c0, c1);
4858							c2 = _mm_add_ps(c2, c3);
4859							c0 = _mm_add_ps(c0, c2);
4860							c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 4.0f));
4861
4862							_mm_store_ps((float*)(source0 + 8 * x), c0);
4863						}
4864
4865						source0 += pitch;
4866						source1 += pitch;
4867						source2 += pitch;
4868						source3 += pitch;
4869					}
4870				}
4871				else if(internal.depth == 8)
4872				{
4873					for(int y = 0; y < height; y++)
4874					{
4875						for(int x = 0; x < width; x += 2)
4876						{
4877							__m128 c0 = _mm_load_ps((float*)(source0 + 8 * x));
4878							__m128 c1 = _mm_load_ps((float*)(source1 + 8 * x));
4879							__m128 c2 = _mm_load_ps((float*)(source2 + 8 * x));
4880							__m128 c3 = _mm_load_ps((float*)(source3 + 8 * x));
4881							__m128 c4 = _mm_load_ps((float*)(source4 + 8 * x));
4882							__m128 c5 = _mm_load_ps((float*)(source5 + 8 * x));
4883							__m128 c6 = _mm_load_ps((float*)(source6 + 8 * x));
4884							__m128 c7 = _mm_load_ps((float*)(source7 + 8 * x));
4885
4886							c0 = _mm_add_ps(c0, c1);
4887							c2 = _mm_add_ps(c2, c3);
4888							c4 = _mm_add_ps(c4, c5);
4889							c6 = _mm_add_ps(c6, c7);
4890							c0 = _mm_add_ps(c0, c2);
4891							c4 = _mm_add_ps(c4, c6);
4892							c0 = _mm_add_ps(c0, c4);
4893							c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 8.0f));
4894
4895							_mm_store_ps((float*)(source0 + 8 * x), c0);
4896						}
4897
4898						source0 += pitch;
4899						source1 += pitch;
4900						source2 += pitch;
4901						source3 += pitch;
4902						source4 += pitch;
4903						source5 += pitch;
4904						source6 += pitch;
4905						source7 += pitch;
4906					}
4907				}
4908				else if(internal.depth == 16)
4909				{
4910					for(int y = 0; y < height; y++)
4911					{
4912						for(int x = 0; x < width; x += 2)
4913						{
4914							__m128 c0 = _mm_load_ps((float*)(source0 + 8 * x));
4915							__m128 c1 = _mm_load_ps((float*)(source1 + 8 * x));
4916							__m128 c2 = _mm_load_ps((float*)(source2 + 8 * x));
4917							__m128 c3 = _mm_load_ps((float*)(source3 + 8 * x));
4918							__m128 c4 = _mm_load_ps((float*)(source4 + 8 * x));
4919							__m128 c5 = _mm_load_ps((float*)(source5 + 8 * x));
4920							__m128 c6 = _mm_load_ps((float*)(source6 + 8 * x));
4921							__m128 c7 = _mm_load_ps((float*)(source7 + 8 * x));
4922							__m128 c8 = _mm_load_ps((float*)(source8 + 8 * x));
4923							__m128 c9 = _mm_load_ps((float*)(source9 + 8 * x));
4924							__m128 cA = _mm_load_ps((float*)(sourceA + 8 * x));
4925							__m128 cB = _mm_load_ps((float*)(sourceB + 8 * x));
4926							__m128 cC = _mm_load_ps((float*)(sourceC + 8 * x));
4927							__m128 cD = _mm_load_ps((float*)(sourceD + 8 * x));
4928							__m128 cE = _mm_load_ps((float*)(sourceE + 8 * x));
4929							__m128 cF = _mm_load_ps((float*)(sourceF + 8 * x));
4930
4931							c0 = _mm_add_ps(c0, c1);
4932							c2 = _mm_add_ps(c2, c3);
4933							c4 = _mm_add_ps(c4, c5);
4934							c6 = _mm_add_ps(c6, c7);
4935							c8 = _mm_add_ps(c8, c9);
4936							cA = _mm_add_ps(cA, cB);
4937							cC = _mm_add_ps(cC, cD);
4938							cE = _mm_add_ps(cE, cF);
4939							c0 = _mm_add_ps(c0, c2);
4940							c4 = _mm_add_ps(c4, c6);
4941							c8 = _mm_add_ps(c8, cA);
4942							cC = _mm_add_ps(cC, cE);
4943							c0 = _mm_add_ps(c0, c4);
4944							c8 = _mm_add_ps(c8, cC);
4945							c0 = _mm_add_ps(c0, c8);
4946							c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 16.0f));
4947
4948							_mm_store_ps((float*)(source0 + 8 * x), c0);
4949						}
4950
4951						source0 += pitch;
4952						source1 += pitch;
4953						source2 += pitch;
4954						source3 += pitch;
4955						source4 += pitch;
4956						source5 += pitch;
4957						source6 += pitch;
4958						source7 += pitch;
4959						source8 += pitch;
4960						source9 += pitch;
4961						sourceA += pitch;
4962						sourceB += pitch;
4963						sourceC += pitch;
4964						sourceD += pitch;
4965						sourceE += pitch;
4966						sourceF += pitch;
4967					}
4968				}
4969				else ASSERT(false);
4970			}
4971			else
4972			{
4973				if(internal.depth == 2)
4974				{
4975					for(int y = 0; y < height; y++)
4976					{
4977						for(int x = 0; x < 2 * width; x++)
4978						{
4979							float c0 = *(float*)(source0 + 4 * x);
4980							float c1 = *(float*)(source1 + 4 * x);
4981
4982							c0 = c0 + c1;
4983							c0 *= 1.0f / 2.0f;
4984
4985							*(float*)(source0 + 4 * x) = c0;
4986						}
4987
4988						source0 += pitch;
4989						source1 += pitch;
4990					}
4991				}
4992				else if(internal.depth == 4)
4993				{
4994					for(int y = 0; y < height; y++)
4995					{
4996						for(int x = 0; x < 2 * width; x++)
4997						{
4998							float c0 = *(float*)(source0 + 4 * x);
4999							float c1 = *(float*)(source1 + 4 * x);
5000							float c2 = *(float*)(source2 + 4 * x);
5001							float c3 = *(float*)(source3 + 4 * x);
5002
5003							c0 = c0 + c1;
5004							c2 = c2 + c3;
5005							c0 = c0 + c2;
5006							c0 *= 1.0f / 4.0f;
5007
5008							*(float*)(source0 + 4 * x) = c0;
5009						}
5010
5011						source0 += pitch;
5012						source1 += pitch;
5013						source2 += pitch;
5014						source3 += pitch;
5015					}
5016				}
5017				else if(internal.depth == 8)
5018				{
5019					for(int y = 0; y < height; y++)
5020					{
5021						for(int x = 0; x < 2 * width; x++)
5022						{
5023							float c0 = *(float*)(source0 + 4 * x);
5024							float c1 = *(float*)(source1 + 4 * x);
5025							float c2 = *(float*)(source2 + 4 * x);
5026							float c3 = *(float*)(source3 + 4 * x);
5027							float c4 = *(float*)(source4 + 4 * x);
5028							float c5 = *(float*)(source5 + 4 * x);
5029							float c6 = *(float*)(source6 + 4 * x);
5030							float c7 = *(float*)(source7 + 4 * x);
5031
5032							c0 = c0 + c1;
5033							c2 = c2 + c3;
5034							c4 = c4 + c5;
5035							c6 = c6 + c7;
5036							c0 = c0 + c2;
5037							c4 = c4 + c6;
5038							c0 = c0 + c4;
5039							c0 *= 1.0f / 8.0f;
5040
5041							*(float*)(source0 + 4 * x) = c0;
5042						}
5043
5044						source0 += pitch;
5045						source1 += pitch;
5046						source2 += pitch;
5047						source3 += pitch;
5048						source4 += pitch;
5049						source5 += pitch;
5050						source6 += pitch;
5051						source7 += pitch;
5052					}
5053				}
5054				else if(internal.depth == 16)
5055				{
5056					for(int y = 0; y < height; y++)
5057					{
5058						for(int x = 0; x < 2 * width; x++)
5059						{
5060							float c0 = *(float*)(source0 + 4 * x);
5061							float c1 = *(float*)(source1 + 4 * x);
5062							float c2 = *(float*)(source2 + 4 * x);
5063							float c3 = *(float*)(source3 + 4 * x);
5064							float c4 = *(float*)(source4 + 4 * x);
5065							float c5 = *(float*)(source5 + 4 * x);
5066							float c6 = *(float*)(source6 + 4 * x);
5067							float c7 = *(float*)(source7 + 4 * x);
5068							float c8 = *(float*)(source8 + 4 * x);
5069							float c9 = *(float*)(source9 + 4 * x);
5070							float cA = *(float*)(sourceA + 4 * x);
5071							float cB = *(float*)(sourceB + 4 * x);
5072							float cC = *(float*)(sourceC + 4 * x);
5073							float cD = *(float*)(sourceD + 4 * x);
5074							float cE = *(float*)(sourceE + 4 * x);
5075							float cF = *(float*)(sourceF + 4 * x);
5076
5077							c0 = c0 + c1;
5078							c2 = c2 + c3;
5079							c4 = c4 + c5;
5080							c6 = c6 + c7;
5081							c8 = c8 + c9;
5082							cA = cA + cB;
5083							cC = cC + cD;
5084							cE = cE + cF;
5085							c0 = c0 + c2;
5086							c4 = c4 + c6;
5087							c8 = c8 + cA;
5088							cC = cC + cE;
5089							c0 = c0 + c4;
5090							c8 = c8 + cC;
5091							c0 = c0 + c8;
5092							c0 *= 1.0f / 16.0f;
5093
5094							*(float*)(source0 + 4 * x) = c0;
5095						}
5096
5097						source0 += pitch;
5098						source1 += pitch;
5099						source2 += pitch;
5100						source3 += pitch;
5101						source4 += pitch;
5102						source5 += pitch;
5103						source6 += pitch;
5104						source7 += pitch;
5105						source8 += pitch;
5106						source9 += pitch;
5107						sourceA += pitch;
5108						sourceB += pitch;
5109						sourceC += pitch;
5110						sourceD += pitch;
5111						sourceE += pitch;
5112						sourceF += pitch;
5113					}
5114				}
5115				else ASSERT(false);
5116			}
5117		}
5118		else if(internal.format == FORMAT_A32B32G32R32F)
5119		{
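			// Each A32B32G32R32F pixel is exactly one __m128, so the SSE path has no
			// width alignment requirement.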
5120			if(CPUID::supportsSSE())
5121			{
5122				if(internal.depth == 2)
5123				{
5124					for(int y = 0; y < height; y++)
5125					{
5126						for(int x = 0; x < width; x++)
5127						{
5128							__m128 c0 = _mm_load_ps((float*)(source0 + 16 * x));
5129							__m128 c1 = _mm_load_ps((float*)(source1 + 16 * x));
5130
5131							c0 = _mm_add_ps(c0, c1);
5132							c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 2.0f));
5133
5134							_mm_store_ps((float*)(source0 + 16 * x), c0);
5135						}
5136
5137						source0 += pitch;
5138						source1 += pitch;
5139					}
5140				}
5141				else if(internal.depth == 4)
5142				{
5143					for(int y = 0; y < height; y++)
5144					{
5145						for(int x = 0; x < width; x++)
5146						{
5147							__m128 c0 = _mm_load_ps((float*)(source0 + 16 * x));
5148							__m128 c1 = _mm_load_ps((float*)(source1 + 16 * x));
5149							__m128 c2 = _mm_load_ps((float*)(source2 + 16 * x));
5150							__m128 c3 = _mm_load_ps((float*)(source3 + 16 * x));
5151
5152							c0 = _mm_add_ps(c0, c1);
5153							c2 = _mm_add_ps(c2, c3);
5154							c0 = _mm_add_ps(c0, c2);
5155							c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 4.0f));
5156
5157							_mm_store_ps((float*)(source0 + 16 * x), c0);
5158						}
5159
5160						source0 += pitch;
5161						source1 += pitch;
5162						source2 += pitch;
5163						source3 += pitch;
5164					}
5165				}
5166				else if(internal.depth == 8)
5167				{
5168					for(int y = 0; y < height; y++)
5169					{
5170						for(int x = 0; x < width; x++)
5171						{
5172							__m128 c0 = _mm_load_ps((float*)(source0 + 16 * x));
5173							__m128 c1 = _mm_load_ps((float*)(source1 + 16 * x));
5174							__m128 c2 = _mm_load_ps((float*)(source2 + 16 * x));
5175							__m128 c3 = _mm_load_ps((float*)(source3 + 16 * x));
5176							__m128 c4 = _mm_load_ps((float*)(source4 + 16 * x));
5177							__m128 c5 = _mm_load_ps((float*)(source5 + 16 * x));
5178							__m128 c6 = _mm_load_ps((float*)(source6 + 16 * x));
5179							__m128 c7 = _mm_load_ps((float*)(source7 + 16 * x));
5180
5181							c0 = _mm_add_ps(c0, c1);
5182							c2 = _mm_add_ps(c2, c3);
5183							c4 = _mm_add_ps(c4, c5);
5184							c6 = _mm_add_ps(c6, c7);
5185							c0 = _mm_add_ps(c0, c2);
5186							c4 = _mm_add_ps(c4, c6);
5187							c0 = _mm_add_ps(c0, c4);
5188							c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 8.0f));
5189
5190							_mm_store_ps((float*)(source0 + 16 * x), c0);
5191						}
5192
5193						source0 += pitch;
5194						source1 += pitch;
5195						source2 += pitch;
5196						source3 += pitch;
5197						source4 += pitch;
5198						source5 += pitch;
5199						source6 += pitch;
5200						source7 += pitch;
5201					}
5202				}
5203				else if(internal.depth == 16)
5204				{
5205					for(int y = 0; y < height; y++)
5206					{
5207						for(int x = 0; x < width; x++)
5208						{
5209							__m128 c0 = _mm_load_ps((float*)(source0 + 16 * x));
5210							__m128 c1 = _mm_load_ps((float*)(source1 + 16 * x));
5211							__m128 c2 = _mm_load_ps((float*)(source2 + 16 * x));
5212							__m128 c3 = _mm_load_ps((float*)(source3 + 16 * x));
5213							__m128 c4 = _mm_load_ps((float*)(source4 + 16 * x));
5214							__m128 c5 = _mm_load_ps((float*)(source5 + 16 * x));
5215							__m128 c6 = _mm_load_ps((float*)(source6 + 16 * x));
5216							__m128 c7 = _mm_load_ps((float*)(source7 + 16 * x));
5217							__m128 c8 = _mm_load_ps((float*)(source8 + 16 * x));
5218							__m128 c9 = _mm_load_ps((float*)(source9 + 16 * x));
5219							__m128 cA = _mm_load_ps((float*)(sourceA + 16 * x));
5220							__m128 cB = _mm_load_ps((float*)(sourceB + 16 * x));
5221							__m128 cC = _mm_load_ps((float*)(sourceC + 16 * x));
5222							__m128 cD = _mm_load_ps((float*)(sourceD + 16 * x));
5223							__m128 cE = _mm_load_ps((float*)(sourceE + 16 * x));
5224							__m128 cF = _mm_load_ps((float*)(sourceF + 16 * x));
5225
5226							c0 = _mm_add_ps(c0, c1);
5227							c2 = _mm_add_ps(c2, c3);
5228							c4 = _mm_add_ps(c4, c5);
5229							c6 = _mm_add_ps(c6, c7);
5230							c8 = _mm_add_ps(c8, c9);
5231							cA = _mm_add_ps(cA, cB);
5232							cC = _mm_add_ps(cC, cD);
5233							cE = _mm_add_ps(cE, cF);
5234							c0 = _mm_add_ps(c0, c2);
5235							c4 = _mm_add_ps(c4, c6);
5236							c8 = _mm_add_ps(c8, cA);
5237							cC = _mm_add_ps(cC, cE);
5238							c0 = _mm_add_ps(c0, c4);
5239							c8 = _mm_add_ps(c8, cC);
5240							c0 = _mm_add_ps(c0, c8);
5241							c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 16.0f));
5242
5243							_mm_store_ps((float*)(source0 + 16 * x), c0);
5244						}
5245
5246						source0 += pitch;
5247						source1 += pitch;
5248						source2 += pitch;
5249						source3 += pitch;
5250						source4 += pitch;
5251						source5 += pitch;
5252						source6 += pitch;
5253						source7 += pitch;
5254						source8 += pitch;
5255						source9 += pitch;
5256						sourceA += pitch;
5257						sourceB += pitch;
5258						sourceC += pitch;
5259						sourceD += pitch;
5260						sourceE += pitch;
5261						sourceF += pitch;
5262					}
5263				}
5264				else ASSERT(false);
5265			}
5266			else
5267			{
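				// Scalar fallback: the four float channels are averaged independently, so each row
				// is processed as 4 * width consecutive floats.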
5268				if(internal.depth == 2)
5269				{
5270					for(int y = 0; y < height; y++)
5271					{
5272						for(int x = 0; x < 4 * width; x++)
5273						{
5274							float c0 = *(float*)(source0 + 4 * x);
5275							float c1 = *(float*)(source1 + 4 * x);
5276
5277							c0 = c0 + c1;
5278							c0 *= 1.0f / 2.0f;
5279
5280							*(float*)(source0 + 4 * x) = c0;
5281						}
5282
5283						source0 += pitch;
5284						source1 += pitch;
5285					}
5286				}
5287				else if(internal.depth == 4)
5288				{
5289					for(int y = 0; y < height; y++)
5290					{
5291						for(int x = 0; x < 4 * width; x++)
5292						{
5293							float c0 = *(float*)(source0 + 4 * x);
5294							float c1 = *(float*)(source1 + 4 * x);
5295							float c2 = *(float*)(source2 + 4 * x);
5296							float c3 = *(float*)(source3 + 4 * x);
5297
5298							c0 = c0 + c1;
5299							c2 = c2 + c3;
5300							c0 = c0 + c2;
5301							c0 *= 1.0f / 4.0f;
5302
5303							*(float*)(source0 + 4 * x) = c0;
5304						}
5305
5306						source0 += pitch;
5307						source1 += pitch;
5308						source2 += pitch;
5309						source3 += pitch;
5310					}
5311				}
5312				else if(internal.depth == 8)
5313				{
5314					for(int y = 0; y < height; y++)
5315					{
5316						for(int x = 0; x < 4 * width; x++)
5317						{
5318							float c0 = *(float*)(source0 + 4 * x);
5319							float c1 = *(float*)(source1 + 4 * x);
5320							float c2 = *(float*)(source2 + 4 * x);
5321							float c3 = *(float*)(source3 + 4 * x);
5322							float c4 = *(float*)(source4 + 4 * x);
5323							float c5 = *(float*)(source5 + 4 * x);
5324							float c6 = *(float*)(source6 + 4 * x);
5325							float c7 = *(float*)(source7 + 4 * x);
5326
5327							c0 = c0 + c1;
5328							c2 = c2 + c3;
5329							c4 = c4 + c5;
5330							c6 = c6 + c7;
5331							c0 = c0 + c2;
5332							c4 = c4 + c6;
5333							c0 = c0 + c4;
5334							c0 *= 1.0f / 8.0f;
5335
5336							*(float*)(source0 + 4 * x) = c0;
5337						}
5338
5339						source0 += pitch;
5340						source1 += pitch;
5341						source2 += pitch;
5342						source3 += pitch;
5343						source4 += pitch;
5344						source5 += pitch;
5345						source6 += pitch;
5346						source7 += pitch;
5347					}
5348				}
5349				else if(internal.depth == 16)
5350				{
5351					for(int y = 0; y < height; y++)
5352					{
5353						for(int x = 0; x < 4 * width; x++)
5354						{
5355							float c0 = *(float*)(source0 + 4 * x);
5356							float c1 = *(float*)(source1 + 4 * x);
5357							float c2 = *(float*)(source2 + 4 * x);
5358							float c3 = *(float*)(source3 + 4 * x);
5359							float c4 = *(float*)(source4 + 4 * x);
5360							float c5 = *(float*)(source5 + 4 * x);
5361							float c6 = *(float*)(source6 + 4 * x);
5362							float c7 = *(float*)(source7 + 4 * x);
5363							float c8 = *(float*)(source8 + 4 * x);
5364							float c9 = *(float*)(source9 + 4 * x);
5365							float cA = *(float*)(sourceA + 4 * x);
5366							float cB = *(float*)(sourceB + 4 * x);
5367							float cC = *(float*)(sourceC + 4 * x);
5368							float cD = *(float*)(sourceD + 4 * x);
5369							float cE = *(float*)(sourceE + 4 * x);
5370							float cF = *(float*)(sourceF + 4 * x);
5371
5372							c0 = c0 + c1;
5373							c2 = c2 + c3;
5374							c4 = c4 + c5;
5375							c6 = c6 + c7;
5376							c8 = c8 + c9;
5377							cA = cA + cB;
5378							cC = cC + cD;
5379							cE = cE + cF;
5380							c0 = c0 + c2;
5381							c4 = c4 + c6;
5382							c8 = c8 + cA;
5383							cC = cC + cE;
5384							c0 = c0 + c4;
5385							c8 = c8 + cC;
5386							c0 = c0 + c8;
5387							c0 *= 1.0f / 16.0f;
5388
5389							*(float*)(source0 + 4 * x) = c0;
5390						}
5391
5392						source0 += pitch;
5393						source1 += pitch;
5394						source2 += pitch;
5395						source3 += pitch;
5396						source4 += pitch;
5397						source5 += pitch;
5398						source6 += pitch;
5399						source7 += pitch;
5400						source8 += pitch;
5401						source9 += pitch;
5402						sourceA += pitch;
5403						sourceB += pitch;
5404						sourceC += pitch;
5405						sourceD += pitch;
5406						sourceE += pitch;
5407						sourceF += pitch;
5408					}
5409				}
5410				else ASSERT(false);
5411			}
5412		}
5413		else if(internal.format == FORMAT_R5G6B5)
5414		{
5415			if(CPUID::supportsSSE2() && (width % 8) == 0)
5416			{
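				// Each __m128i holds eight R5G6B5 pixels, hence the (width % 8) == 0 requirement.
				// The pixels are split into a red|blue field (0xF81F) and a green field (0x07E0) so
				// both fields can be averaged in place: red and blue each fit within a single byte,
				// so the byte-wise _mm_avg_epu8 is safe for them, while green straddles the byte
				// boundary and has to be averaged with the 16-bit _mm_avg_epu16.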
5417				if(internal.depth == 2)
5418				{
5419					for(int y = 0; y < height; y++)
5420					{
5421						for(int x = 0; x < width; x += 8)
5422						{
5423							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 2 * x));
5424							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 2 * x));
5425
5426							static const ushort8 r_b = {0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F};
5427							static const ushort8 _g_ = {0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0};
5428							__m128i c0_r_b = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
5429							__m128i c0__g_ = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(_g_));
5430							__m128i c1_r_b = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(r_b));
5431							__m128i c1__g_ = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
5432
5433							c0 = _mm_avg_epu8(c0_r_b, c1_r_b);
5434							c0 = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
5435							c1 = _mm_avg_epu16(c0__g_, c1__g_);
5436							c1 = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
5437							c0 = _mm_or_si128(c0, c1);
5438
5439							_mm_store_si128((__m128i*)(source0 + 2 * x), c0);
5440						}
5441
5442						source0 += pitch;
5443						source1 += pitch;
5444					}
5445				}
5446				else if(internal.depth == 4)
5447				{
5448					for(int y = 0; y < height; y++)
5449					{
5450						for(int x = 0; x < width; x += 8)
5451						{
5452							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 2 * x));
5453							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 2 * x));
5454							__m128i c2 = _mm_load_si128((__m128i*)(source2 + 2 * x));
5455							__m128i c3 = _mm_load_si128((__m128i*)(source3 + 2 * x));
5456
5457							static const ushort8 r_b = {0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F};
5458							static const ushort8 _g_ = {0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0};
5459							__m128i c0_r_b = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
5460							__m128i c0__g_ = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(_g_));
5461							__m128i c1_r_b = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(r_b));
5462							__m128i c1__g_ = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
5463							__m128i c2_r_b = _mm_and_si128(c2, reinterpret_cast<const __m128i&>(r_b));
5464							__m128i c2__g_ = _mm_and_si128(c2, reinterpret_cast<const __m128i&>(_g_));
5465							__m128i c3_r_b = _mm_and_si128(c3, reinterpret_cast<const __m128i&>(r_b));
5466							__m128i c3__g_ = _mm_and_si128(c3, reinterpret_cast<const __m128i&>(_g_));
5467
5468							c0 = _mm_avg_epu8(c0_r_b, c1_r_b);
5469							c2 = _mm_avg_epu8(c2_r_b, c3_r_b);
5470							c0 = _mm_avg_epu8(c0, c2);
5471							c0 = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
5472							c1 = _mm_avg_epu16(c0__g_, c1__g_);
5473							c3 = _mm_avg_epu16(c2__g_, c3__g_);
5474							c1 = _mm_avg_epu16(c1, c3);
5475							c1 = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
5476							c0 = _mm_or_si128(c0, c1);
5477
5478							_mm_store_si128((__m128i*)(source0 + 2 * x), c0);
5479						}
5480
5481						source0 += pitch;
5482						source1 += pitch;
5483						source2 += pitch;
5484						source3 += pitch;
5485					}
5486				}
5487				else if(internal.depth == 8)
5488				{
5489					for(int y = 0; y < height; y++)
5490					{
5491						for(int x = 0; x < width; x += 8)
5492						{
5493							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 2 * x));
5494							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 2 * x));
5495							__m128i c2 = _mm_load_si128((__m128i*)(source2 + 2 * x));
5496							__m128i c3 = _mm_load_si128((__m128i*)(source3 + 2 * x));
5497							__m128i c4 = _mm_load_si128((__m128i*)(source4 + 2 * x));
5498							__m128i c5 = _mm_load_si128((__m128i*)(source5 + 2 * x));
5499							__m128i c6 = _mm_load_si128((__m128i*)(source6 + 2 * x));
5500							__m128i c7 = _mm_load_si128((__m128i*)(source7 + 2 * x));
5501
5502							static const ushort8 r_b = {0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F};
5503							static const ushort8 _g_ = {0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0};
5504							__m128i c0_r_b = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
5505							__m128i c0__g_ = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(_g_));
5506							__m128i c1_r_b = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(r_b));
5507							__m128i c1__g_ = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
5508							__m128i c2_r_b = _mm_and_si128(c2, reinterpret_cast<const __m128i&>(r_b));
5509							__m128i c2__g_ = _mm_and_si128(c2, reinterpret_cast<const __m128i&>(_g_));
5510							__m128i c3_r_b = _mm_and_si128(c3, reinterpret_cast<const __m128i&>(r_b));
5511							__m128i c3__g_ = _mm_and_si128(c3, reinterpret_cast<const __m128i&>(_g_));
5512							__m128i c4_r_b = _mm_and_si128(c4, reinterpret_cast<const __m128i&>(r_b));
5513							__m128i c4__g_ = _mm_and_si128(c4, reinterpret_cast<const __m128i&>(_g_));
5514							__m128i c5_r_b = _mm_and_si128(c5, reinterpret_cast<const __m128i&>(r_b));
5515							__m128i c5__g_ = _mm_and_si128(c5, reinterpret_cast<const __m128i&>(_g_));
5516							__m128i c6_r_b = _mm_and_si128(c6, reinterpret_cast<const __m128i&>(r_b));
5517							__m128i c6__g_ = _mm_and_si128(c6, reinterpret_cast<const __m128i&>(_g_));
5518							__m128i c7_r_b = _mm_and_si128(c7, reinterpret_cast<const __m128i&>(r_b));
5519							__m128i c7__g_ = _mm_and_si128(c7, reinterpret_cast<const __m128i&>(_g_));
5520
5521							c0 = _mm_avg_epu8(c0_r_b, c1_r_b);
5522							c2 = _mm_avg_epu8(c2_r_b, c3_r_b);
5523							c4 = _mm_avg_epu8(c4_r_b, c5_r_b);
5524							c6 = _mm_avg_epu8(c6_r_b, c7_r_b);
5525							c0 = _mm_avg_epu8(c0, c2);
5526							c4 = _mm_avg_epu8(c4, c6);
5527							c0 = _mm_avg_epu8(c0, c4);
5528							c0 = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
5529							c1 = _mm_avg_epu16(c0__g_, c1__g_);
5530							c3 = _mm_avg_epu16(c2__g_, c3__g_);
5531							c5 = _mm_avg_epu16(c4__g_, c5__g_);
5532							c7 = _mm_avg_epu16(c6__g_, c7__g_);
5533							c1 = _mm_avg_epu16(c1, c3);
5534							c5 = _mm_avg_epu16(c5, c7);
5535							c1 = _mm_avg_epu16(c1, c5);
5536							c1 = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
5537							c0 = _mm_or_si128(c0, c1);
5538
5539							_mm_store_si128((__m128i*)(source0 + 2 * x), c0);
5540						}
5541
5542						source0 += pitch;
5543						source1 += pitch;
5544						source2 += pitch;
5545						source3 += pitch;
5546						source4 += pitch;
5547						source5 += pitch;
5548						source6 += pitch;
5549						source7 += pitch;
5550					}
5551				}
5552				else if(internal.depth == 16)
5553				{
5554					for(int y = 0; y < height; y++)
5555					{
5556						for(int x = 0; x < width; x += 8)
5557						{
5558							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 2 * x));
5559							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 2 * x));
5560							__m128i c2 = _mm_load_si128((__m128i*)(source2 + 2 * x));
5561							__m128i c3 = _mm_load_si128((__m128i*)(source3 + 2 * x));
5562							__m128i c4 = _mm_load_si128((__m128i*)(source4 + 2 * x));
5563							__m128i c5 = _mm_load_si128((__m128i*)(source5 + 2 * x));
5564							__m128i c6 = _mm_load_si128((__m128i*)(source6 + 2 * x));
5565							__m128i c7 = _mm_load_si128((__m128i*)(source7 + 2 * x));
5566							__m128i c8 = _mm_load_si128((__m128i*)(source8 + 2 * x));
5567							__m128i c9 = _mm_load_si128((__m128i*)(source9 + 2 * x));
5568							__m128i cA = _mm_load_si128((__m128i*)(sourceA + 2 * x));
5569							__m128i cB = _mm_load_si128((__m128i*)(sourceB + 2 * x));
5570							__m128i cC = _mm_load_si128((__m128i*)(sourceC + 2 * x));
5571							__m128i cD = _mm_load_si128((__m128i*)(sourceD + 2 * x));
5572							__m128i cE = _mm_load_si128((__m128i*)(sourceE + 2 * x));
5573							__m128i cF = _mm_load_si128((__m128i*)(sourceF + 2 * x));
5574
5575							static const ushort8 r_b = {0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F};
5576							static const ushort8 _g_ = {0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0};
5577							__m128i c0_r_b = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
5578							__m128i c0__g_ = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(_g_));
5579							__m128i c1_r_b = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(r_b));
5580							__m128i c1__g_ = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
5581							__m128i c2_r_b = _mm_and_si128(c2, reinterpret_cast<const __m128i&>(r_b));
5582							__m128i c2__g_ = _mm_and_si128(c2, reinterpret_cast<const __m128i&>(_g_));
5583							__m128i c3_r_b = _mm_and_si128(c3, reinterpret_cast<const __m128i&>(r_b));
5584							__m128i c3__g_ = _mm_and_si128(c3, reinterpret_cast<const __m128i&>(_g_));
5585							__m128i c4_r_b = _mm_and_si128(c4, reinterpret_cast<const __m128i&>(r_b));
5586							__m128i c4__g_ = _mm_and_si128(c4, reinterpret_cast<const __m128i&>(_g_));
5587							__m128i c5_r_b = _mm_and_si128(c5, reinterpret_cast<const __m128i&>(r_b));
5588							__m128i c5__g_ = _mm_and_si128(c5, reinterpret_cast<const __m128i&>(_g_));
5589							__m128i c6_r_b = _mm_and_si128(c6, reinterpret_cast<const __m128i&>(r_b));
5590							__m128i c6__g_ = _mm_and_si128(c6, reinterpret_cast<const __m128i&>(_g_));
5591							__m128i c7_r_b = _mm_and_si128(c7, reinterpret_cast<const __m128i&>(r_b));
5592							__m128i c7__g_ = _mm_and_si128(c7, reinterpret_cast<const __m128i&>(_g_));
5593							__m128i c8_r_b = _mm_and_si128(c8, reinterpret_cast<const __m128i&>(r_b));
5594							__m128i c8__g_ = _mm_and_si128(c8, reinterpret_cast<const __m128i&>(_g_));
5595							__m128i c9_r_b = _mm_and_si128(c9, reinterpret_cast<const __m128i&>(r_b));
5596							__m128i c9__g_ = _mm_and_si128(c9, reinterpret_cast<const __m128i&>(_g_));
5597							__m128i cA_r_b = _mm_and_si128(cA, reinterpret_cast<const __m128i&>(r_b));
5598							__m128i cA__g_ = _mm_and_si128(cA, reinterpret_cast<const __m128i&>(_g_));
5599							__m128i cB_r_b = _mm_and_si128(cB, reinterpret_cast<const __m128i&>(r_b));
5600							__m128i cB__g_ = _mm_and_si128(cB, reinterpret_cast<const __m128i&>(_g_));
5601							__m128i cC_r_b = _mm_and_si128(cC, reinterpret_cast<const __m128i&>(r_b));
5602							__m128i cC__g_ = _mm_and_si128(cC, reinterpret_cast<const __m128i&>(_g_));
5603							__m128i cD_r_b = _mm_and_si128(cD, reinterpret_cast<const __m128i&>(r_b));
5604							__m128i cD__g_ = _mm_and_si128(cD, reinterpret_cast<const __m128i&>(_g_));
5605							__m128i cE_r_b = _mm_and_si128(cE, reinterpret_cast<const __m128i&>(r_b));
5606							__m128i cE__g_ = _mm_and_si128(cE, reinterpret_cast<const __m128i&>(_g_));
5607							__m128i cF_r_b = _mm_and_si128(cF, reinterpret_cast<const __m128i&>(r_b));
5608							__m128i cF__g_ = _mm_and_si128(cF, reinterpret_cast<const __m128i&>(_g_));
5609
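							// Reduce the sixteen samples pairwise: byte-wise averages for the
							// red|blue fields, 16-bit averages for the green field.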
5610							c0 = _mm_avg_epu8(c0_r_b, c1_r_b);
5611							c2 = _mm_avg_epu8(c2_r_b, c3_r_b);
5612							c4 = _mm_avg_epu8(c4_r_b, c5_r_b);
5613							c6 = _mm_avg_epu8(c6_r_b, c7_r_b);
5614							c8 = _mm_avg_epu8(c8_r_b, c9_r_b);
5615							cA = _mm_avg_epu8(cA_r_b, cB_r_b);
5616							cC = _mm_avg_epu8(cC_r_b, cD_r_b);
5617							cE = _mm_avg_epu8(cE_r_b, cF_r_b);
5618							c0 = _mm_avg_epu8(c0, c2);
5619							c4 = _mm_avg_epu8(c4, c6);
5620							c8 = _mm_avg_epu8(c8, cA);
5621							cC = _mm_avg_epu8(cC, cE);
5622							c0 = _mm_avg_epu8(c0, c4);
5623							c8 = _mm_avg_epu8(c8, cC);
5624							c0 = _mm_avg_epu8(c0, c8);
5625							c0 = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
5626							c1 = _mm_avg_epu16(c0__g_, c1__g_);
5627							c3 = _mm_avg_epu16(c2__g_, c3__g_);
5628							c5 = _mm_avg_epu16(c4__g_, c5__g_);
5629							c7 = _mm_avg_epu16(c6__g_, c7__g_);
5630							c9 = _mm_avg_epu16(c8__g_, c9__g_);
5631							cB = _mm_avg_epu16(cA__g_, cB__g_);
5632							cD = _mm_avg_epu16(cC__g_, cD__g_);
5633							cF = _mm_avg_epu16(cE__g_, cF__g_);
5634							c1 = _mm_avg_epu16(c1, c3);
5635							c5 = _mm_avg_epu16(c5, c7);
5636							c9 = _mm_avg_epu16(c9, cB);
5637							cD = _mm_avg_epu16(cD, cF);
5638							c1 = _mm_avg_epu16(c1, c5);
5639							c9 = _mm_avg_epu16(c9, cD);
5640							c1 = _mm_avg_epu16(c1, c9);
5641							c1 = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
5642							c0 = _mm_or_si128(c0, c1);
5643
5644							_mm_store_si128((__m128i*)(source0 + 2 * x), c0);
5645						}
5646
5647						source0 += pitch;
5648						source1 += pitch;
5649						source2 += pitch;
5650						source3 += pitch;
5651						source4 += pitch;
5652						source5 += pitch;
5653						source6 += pitch;
5654						source7 += pitch;
5655						source8 += pitch;
5656						source9 += pitch;
5657						sourceA += pitch;
5658						sourceB += pitch;
5659						sourceC += pitch;
5660						sourceD += pitch;
5661						sourceE += pitch;
5662						sourceF += pitch;
5663					}
5664				}
5665				else ASSERT(false);
5666			}
5667			else
5668			{
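				// Rounded per-channel average of two R5G6B5 pixels without unpacking:
				// (x & y) + ((x ^ y) >> 1) is the per-channel floor average, the 0x7BEF mask keeps
				// the shifted bits from leaking across the 5:6:5 field boundaries, and adding
				// ((x ^ y) & 0x0821), the least significant bit of each field, rounds upwards.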
5669				#define AVERAGE(x, y) (((x) & (y)) + ((((x) ^ (y)) >> 1) & 0x7BEF) + (((x) ^ (y)) & 0x0821))
5670
5671				if(internal.depth == 2)
5672				{
5673					for(int y = 0; y < height; y++)
5674					{
5675						for(int x = 0; x < width; x++)
5676						{
5677							unsigned short c0 = *(unsigned short*)(source0 + 2 * x);
5678							unsigned short c1 = *(unsigned short*)(source1 + 2 * x);
5679
5680							c0 = AVERAGE(c0, c1);
5681
5682							*(unsigned short*)(source0 + 2 * x) = c0;
5683						}
5684
5685						source0 += pitch;
5686						source1 += pitch;
5687					}
5688				}
5689				else if(internal.depth == 4)
5690				{
5691					for(int y = 0; y < height; y++)
5692					{
5693						for(int x = 0; x < width; x++)
5694						{
5695							unsigned short c0 = *(unsigned short*)(source0 + 2 * x);
5696							unsigned short c1 = *(unsigned short*)(source1 + 2 * x);
5697							unsigned short c2 = *(unsigned short*)(source2 + 2 * x);
5698							unsigned short c3 = *(unsigned short*)(source3 + 2 * x);
5699
5700							c0 = AVERAGE(c0, c1);
5701							c2 = AVERAGE(c2, c3);
5702							c0 = AVERAGE(c0, c2);
5703
5704							*(unsigned short*)(source0 + 2 * x) = c0;
5705						}
5706
5707						source0 += pitch;
5708						source1 += pitch;
5709						source2 += pitch;
5710						source3 += pitch;
5711					}
5712				}
5713				else if(internal.depth == 8)
5714				{
5715					for(int y = 0; y < height; y++)
5716					{
5717						for(int x = 0; x < width; x++)
5718						{
5719							unsigned short c0 = *(unsigned short*)(source0 + 2 * x);
5720							unsigned short c1 = *(unsigned short*)(source1 + 2 * x);
5721							unsigned short c2 = *(unsigned short*)(source2 + 2 * x);
5722							unsigned short c3 = *(unsigned short*)(source3 + 2 * x);
5723							unsigned short c4 = *(unsigned short*)(source4 + 2 * x);
5724							unsigned short c5 = *(unsigned short*)(source5 + 2 * x);
5725							unsigned short c6 = *(unsigned short*)(source6 + 2 * x);
5726							unsigned short c7 = *(unsigned short*)(source7 + 2 * x);
5727
5728							c0 = AVERAGE(c0, c1);
5729							c2 = AVERAGE(c2, c3);
5730							c4 = AVERAGE(c4, c5);
5731							c6 = AVERAGE(c6, c7);
5732							c0 = AVERAGE(c0, c2);
5733							c4 = AVERAGE(c4, c6);
5734							c0 = AVERAGE(c0, c4);
5735
5736							*(unsigned short*)(source0 + 2 * x) = c0;
5737						}
5738
5739						source0 += pitch;
5740						source1 += pitch;
5741						source2 += pitch;
5742						source3 += pitch;
5743						source4 += pitch;
5744						source5 += pitch;
5745						source6 += pitch;
5746						source7 += pitch;
5747					}
5748				}
5749				else if(internal.depth == 16)
5750				{
5751					for(int y = 0; y < height; y++)
5752					{
5753						for(int x = 0; x < width; x++)
5754						{
5755							unsigned short c0 = *(unsigned short*)(source0 + 2 * x);
5756							unsigned short c1 = *(unsigned short*)(source1 + 2 * x);
5757							unsigned short c2 = *(unsigned short*)(source2 + 2 * x);
5758							unsigned short c3 = *(unsigned short*)(source3 + 2 * x);
5759							unsigned short c4 = *(unsigned short*)(source4 + 2 * x);
5760							unsigned short c5 = *(unsigned short*)(source5 + 2 * x);
5761							unsigned short c6 = *(unsigned short*)(source6 + 2 * x);
5762							unsigned short c7 = *(unsigned short*)(source7 + 2 * x);
5763							unsigned short c8 = *(unsigned short*)(source8 + 2 * x);
5764							unsigned short c9 = *(unsigned short*)(source9 + 2 * x);
5765							unsigned short cA = *(unsigned short*)(sourceA + 2 * x);
5766							unsigned short cB = *(unsigned short*)(sourceB + 2 * x);
5767							unsigned short cC = *(unsigned short*)(sourceC + 2 * x);
5768							unsigned short cD = *(unsigned short*)(sourceD + 2 * x);
5769							unsigned short cE = *(unsigned short*)(sourceE + 2 * x);
5770							unsigned short cF = *(unsigned short*)(sourceF + 2 * x);
5771
5772							c0 = AVERAGE(c0, c1);
5773							c2 = AVERAGE(c2, c3);
5774							c4 = AVERAGE(c4, c5);
5775							c6 = AVERAGE(c6, c7);
5776							c8 = AVERAGE(c8, c9);
5777							cA = AVERAGE(cA, cB);
5778							cC = AVERAGE(cC, cD);
5779							cE = AVERAGE(cE, cF);
5780							c0 = AVERAGE(c0, c2);
5781							c4 = AVERAGE(c4, c6);
5782							c8 = AVERAGE(c8, cA);
5783							cC = AVERAGE(cC, cE);
5784							c0 = AVERAGE(c0, c4);
5785							c8 = AVERAGE(c8, cC);
5786							c0 = AVERAGE(c0, c8);
5787
5788							*(unsigned short*)(source0 + 2 * x) = c0;
5789						}
5790
5791						source0 += pitch;
5792						source1 += pitch;
5793						source2 += pitch;
5794						source3 += pitch;
5795						source4 += pitch;
5796						source5 += pitch;
5797						source6 += pitch;
5798						source7 += pitch;
5799						source8 += pitch;
5800						source9 += pitch;
5801						sourceA += pitch;
5802						sourceB += pitch;
5803						sourceC += pitch;
5804						sourceD += pitch;
5805						sourceE += pitch;
5806						sourceF += pitch;
5807					}
5808				}
5809				else ASSERT(false);
5810
5811				#undef AVERAGE
5812			}
5813		}
5814		else
5815		{
5816		//	UNIMPLEMENTED();
5817		}
5818	}
5819}
5820