1// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//    http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15#include "PixelRoutine.hpp"
16
17#include "Renderer.hpp"
18#include "QuadRasterizer.hpp"
19#include "Surface.hpp"
20#include "Primitive.hpp"
21#include "SamplerCore.hpp"
22#include "Constants.hpp"
23#include "Debug.hpp"
24
25namespace sw
26{
27	extern bool complementaryDepthBuffer;
28	extern bool postBlendSRGB;
29	extern bool exactColorRounding;
30	extern bool forceClearRegisters;
31
32	PixelRoutine::PixelRoutine(const PixelProcessor::State &state, const PixelShader *shader) : QuadRasterizer(state, shader), v(shader && shader->dynamicallyIndexedInput)
33	{
34		if(!shader || shader->getVersion() < 0x0200 || forceClearRegisters)
35		{
36			for(int i = 0; i < MAX_FRAGMENT_INPUTS; i++)
37			{
38				v[i].x = Float4(0.0f);
39				v[i].y = Float4(0.0f);
40				v[i].z = Float4(0.0f);
41				v[i].w = Float4(0.0f);
42			}
43		}
44	}
45
46	PixelRoutine::~PixelRoutine()
47	{
48		for(int i = 0; i < TEXTURE_IMAGE_UNITS; i++)
49		{
50			delete sampler[i];
51		}
52	}
53
54	void PixelRoutine::quad(Pointer<Byte> cBuffer[RENDERTARGETS], Pointer<Byte> &zBuffer, Pointer<Byte> &sBuffer, Int cMask[4], Int &x, Int &y)
55	{
56		#if PERF_PROFILE
57			Long pipeTime = Ticks();
58		#endif
59
60		for(int i = 0; i < TEXTURE_IMAGE_UNITS; i++)
61		{
62			sampler[i] = new SamplerCore(constants, state.sampler[i]);
63		}
64
65		const bool earlyDepthTest = !state.depthOverride && !state.alphaTestActive();
66
67		Int zMask[4];   // Depth mask
68		Int sMask[4];   // Stencil mask
69
70		for(unsigned int q = 0; q < state.multiSample; q++)
71		{
72			zMask[q] = cMask[q];
73			sMask[q] = cMask[q];
74		}
75
76		for(unsigned int q = 0; q < state.multiSample; q++)
77		{
78			stencilTest(sBuffer, q, x, sMask[q], cMask[q]);
79		}
80
81		Float4 f;
82		Float4 rhwCentroid;
83
84		Float4 xxxx = Float4(Float(x)) + *Pointer<Float4>(primitive + OFFSET(Primitive,xQuad), 16);
85
86		if(interpolateZ())
87		{
88			for(unsigned int q = 0; q < state.multiSample; q++)
89			{
90				Float4 x = xxxx;
91
92				if(state.multiSample > 1)
93				{
94					x -= *Pointer<Float4>(constants + OFFSET(Constants,X) + q * sizeof(float4));
95				}
96
97				z[q] = interpolate(x, Dz[q], z[q], primitive + OFFSET(Primitive,z), false, false);
98			}
99		}
100
101		Bool depthPass = false;
102
103		if(earlyDepthTest)
104		{
105			for(unsigned int q = 0; q < state.multiSample; q++)
106			{
107				depthPass = depthPass || depthTest(zBuffer, q, x, z[q], sMask[q], zMask[q], cMask[q]);
108			}
109		}
110
111		If(depthPass || Bool(!earlyDepthTest))
112		{
113			#if PERF_PROFILE
114				Long interpTime = Ticks();
115			#endif
116
117			Float4 yyyy = Float4(Float(y)) + *Pointer<Float4>(primitive + OFFSET(Primitive,yQuad), 16);
118
119			// Centroid locations
120			Float4 XXXX = Float4(0.0f);
121			Float4 YYYY = Float4(0.0f);
122
123			if(state.centroid)
124			{
125				Float4 WWWW(1.0e-9f);
126
127				for(unsigned int q = 0; q < state.multiSample; q++)
128				{
129					XXXX += *Pointer<Float4>(constants + OFFSET(Constants,sampleX[q]) + 16 * cMask[q]);
130					YYYY += *Pointer<Float4>(constants + OFFSET(Constants,sampleY[q]) + 16 * cMask[q]);
131					WWWW += *Pointer<Float4>(constants + OFFSET(Constants,weight) + 16 * cMask[q]);
132				}
133
134				WWWW = Rcp_pp(WWWW);
135				XXXX *= WWWW;
136				YYYY *= WWWW;
137
138				XXXX += xxxx;
139				YYYY += yyyy;
140			}
141
142			if(interpolateW())
143			{
144				w = interpolate(xxxx, Dw, rhw, primitive + OFFSET(Primitive,w), false, false);
145				rhw = reciprocal(w, false, false, true);
146
147				if(state.centroid)
148				{
149					rhwCentroid = reciprocal(interpolateCentroid(XXXX, YYYY, rhwCentroid, primitive + OFFSET(Primitive,w), false, false));
150				}
151			}
152
153			for(int interpolant = 0; interpolant < MAX_FRAGMENT_INPUTS; interpolant++)
154			{
155				for(int component = 0; component < 4; component++)
156				{
157					if(state.interpolant[interpolant].component & (1 << component))
158					{
159						if(!state.interpolant[interpolant].centroid)
160						{
161							v[interpolant][component] = interpolate(xxxx, Dv[interpolant][component], rhw, primitive + OFFSET(Primitive, V[interpolant][component]), (state.interpolant[interpolant].flat & (1 << component)) != 0, state.perspective);
162						}
163						else
164						{
165							v[interpolant][component] = interpolateCentroid(XXXX, YYYY, rhwCentroid, primitive + OFFSET(Primitive, V[interpolant][component]), (state.interpolant[interpolant].flat & (1 << component)) != 0, state.perspective);
166						}
167					}
168				}
169
170				Float4 rcp;
171
172				switch(state.interpolant[interpolant].project)
173				{
174				case 0:
175					break;
176				case 1:
177					rcp = reciprocal(v[interpolant].y);
178					v[interpolant].x = v[interpolant].x * rcp;
179					break;
180				case 2:
181					rcp = reciprocal(v[interpolant].z);
182					v[interpolant].x = v[interpolant].x * rcp;
183					v[interpolant].y = v[interpolant].y * rcp;
184					break;
185				case 3:
186					rcp = reciprocal(v[interpolant].w);
187					v[interpolant].x = v[interpolant].x * rcp;
188					v[interpolant].y = v[interpolant].y * rcp;
189					v[interpolant].z = v[interpolant].z * rcp;
190					break;
191				}
192			}
193
194			if(state.fog.component)
195			{
196				f = interpolate(xxxx, Df, rhw, primitive + OFFSET(Primitive,f), state.fog.flat & 0x01, state.perspective);
197			}
198
199			setBuiltins(x, y, z, w);
200
201			#if PERF_PROFILE
202				cycles[PERF_INTERP] += Ticks() - interpTime;
203			#endif
204
205			Bool alphaPass = true;
206
207			if(colorUsed())
208			{
209				#if PERF_PROFILE
210					Long shaderTime = Ticks();
211				#endif
212
213				applyShader(cMask);
214
215				#if PERF_PROFILE
216					cycles[PERF_SHADER] += Ticks() - shaderTime;
217				#endif
218
219				alphaPass = alphaTest(cMask);
220
221				if((shader && shader->containsKill()) || state.alphaTestActive())
222				{
223					for(unsigned int q = 0; q < state.multiSample; q++)
224					{
225						zMask[q] &= cMask[q];
226						sMask[q] &= cMask[q];
227					}
228				}
229			}
230
231			If(alphaPass)
232			{
233				if(!earlyDepthTest)
234				{
235					for(unsigned int q = 0; q < state.multiSample; q++)
236					{
237						depthPass = depthPass || depthTest(zBuffer, q, x, z[q], sMask[q], zMask[q], cMask[q]);
238					}
239				}
240
241				#if PERF_PROFILE
242					Long ropTime = Ticks();
243				#endif
244
245				If(depthPass || Bool(earlyDepthTest))
246				{
247					for(unsigned int q = 0; q < state.multiSample; q++)
248					{
249						if(state.multiSampleMask & (1 << q))
250						{
251							writeDepth(zBuffer, q, x, z[q], zMask[q]);
252
253							if(state.occlusionEnabled)
254							{
255								occlusion += *Pointer<UInt>(constants + OFFSET(Constants,occlusionCount) + 4 * (zMask[q] & sMask[q]));
256							}
257						}
258					}
259
260					if(colorUsed())
261					{
262						#if PERF_PROFILE
263							AddAtomic(Pointer<Long>(&profiler.ropOperations), 4);
264						#endif
265
266						rasterOperation(f, cBuffer, x, sMask, zMask, cMask);
267					}
268				}
269
270				#if PERF_PROFILE
271					cycles[PERF_ROP] += Ticks() - ropTime;
272				#endif
273			}
274		}
275
276		for(unsigned int q = 0; q < state.multiSample; q++)
277		{
278			if(state.multiSampleMask & (1 << q))
279			{
280				writeStencil(sBuffer, q, x, sMask[q], zMask[q], cMask[q]);
281			}
282		}
283
284		#if PERF_PROFILE
285			cycles[PERF_PIPE] += Ticks() - pipeTime;
286		#endif
287	}
288
289	Float4 PixelRoutine::interpolateCentroid(Float4 &x, Float4 &y, Float4 &rhw, Pointer<Byte> planeEquation, bool flat, bool perspective)
290	{
291		Float4 interpolant = *Pointer<Float4>(planeEquation + OFFSET(PlaneEquation,C), 16);
292
293		if(!flat)
294		{
295			interpolant += x * *Pointer<Float4>(planeEquation + OFFSET(PlaneEquation,A), 16) +
296			               y * *Pointer<Float4>(planeEquation + OFFSET(PlaneEquation,B), 16);
297
298			if(perspective)
299			{
300				interpolant *= rhw;
301			}
302		}
303
304		return interpolant;
305	}
306
307	void PixelRoutine::stencilTest(Pointer<Byte> &sBuffer, int q, Int &x, Int &sMask, Int &cMask)
308	{
309		if(!state.stencilActive)
310		{
311			return;
312		}
313
314		// (StencilRef & StencilMask) CompFunc (StencilBufferValue & StencilMask)
315
316		Pointer<Byte> buffer = sBuffer + 2 * x;
317
318		if(q > 0)
319		{
320			buffer += q * *Pointer<Int>(data + OFFSET(DrawData,stencilSliceB));
321		}
322
323		Byte8 value = *Pointer<Byte8>(buffer);
324		Byte8 valueCCW = value;
325
326		if(!state.noStencilMask)
327		{
328			value &= *Pointer<Byte8>(data + OFFSET(DrawData,stencil[0].testMaskQ));
329		}
330
331		stencilTest(value, state.stencilCompareMode, false);
332
333		if(state.twoSidedStencil)
334		{
335			if(!state.noStencilMaskCCW)
336			{
337				valueCCW &= *Pointer<Byte8>(data + OFFSET(DrawData,stencil[1].testMaskQ));
338			}
339
340			stencilTest(valueCCW, state.stencilCompareModeCCW, true);
341
342			value &= *Pointer<Byte8>(primitive + OFFSET(Primitive,clockwiseMask));
343			valueCCW &= *Pointer<Byte8>(primitive + OFFSET(Primitive,invClockwiseMask));
344			value |= valueCCW;
345		}
346
347		sMask = SignMask(value) & cMask;
348	}
349
350	void PixelRoutine::stencilTest(Byte8 &value, StencilCompareMode stencilCompareMode, bool CCW)
351	{
352		Byte8 equal;
353
354		switch(stencilCompareMode)
355		{
356		case STENCIL_ALWAYS:
357			value = Byte8(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
358			break;
359		case STENCIL_NEVER:
360			value = Byte8(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
361			break;
362		case STENCIL_LESS:			// a < b ~ b > a
363			value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
364			value = CmpGT(As<SByte8>(value), *Pointer<SByte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedSignedQ)));
365			break;
366		case STENCIL_EQUAL:
367			value = CmpEQ(value, *Pointer<Byte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedQ)));
368			break;
369		case STENCIL_NOTEQUAL:		// a != b ~ !(a == b)
370			value = CmpEQ(value, *Pointer<Byte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedQ)));
371			value ^= Byte8(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
372			break;
373		case STENCIL_LESSEQUAL:	// a <= b ~ (b > a) || (a == b)
374			equal = value;
375			equal = CmpEQ(equal, *Pointer<Byte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedQ)));
376			value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
377			value = CmpGT(As<SByte8>(value), *Pointer<SByte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedSignedQ)));
378			value |= equal;
379			break;
380		case STENCIL_GREATER:		// a > b
381			equal = *Pointer<Byte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedSignedQ));
382			value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
383			equal = CmpGT(As<SByte8>(equal), As<SByte8>(value));
384			value = equal;
385			break;
386		case STENCIL_GREATEREQUAL:	// a >= b ~ !(a < b) ~ !(b > a)
387			value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
388			value = CmpGT(As<SByte8>(value), *Pointer<SByte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedSignedQ)));
389			value ^= Byte8(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
390			break;
391		default:
392			ASSERT(false);
393		}
394	}
395
396	Bool PixelRoutine::depthTest(Pointer<Byte> &zBuffer, int q, Int &x, Float4 &z, Int &sMask, Int &zMask, Int &cMask)
397	{
398		if(!state.depthTestActive)
399		{
400			return true;
401		}
402
403		Float4 Z = z;
404
405		if(shader && shader->depthOverride())
406		{
407			if(complementaryDepthBuffer)
408			{
409				Z = Float4(1.0f) - oDepth;
410			}
411			else
412			{
413				Z = oDepth;
414			}
415		}
416
417		Pointer<Byte> buffer;
418		Int pitch;
419
420		if(!state.quadLayoutDepthBuffer)
421		{
422			buffer = zBuffer + 4 * x;
423			pitch = *Pointer<Int>(data + OFFSET(DrawData,depthPitchB));
424		}
425		else
426		{
427			buffer = zBuffer + 8 * x;
428		}
429
430		if(q > 0)
431		{
432			buffer += q * *Pointer<Int>(data + OFFSET(DrawData,depthSliceB));
433		}
434
435		Float4 zValue;
436
437		if(state.depthCompareMode != DEPTH_NEVER || (state.depthCompareMode != DEPTH_ALWAYS && !state.depthWriteEnable))
438		{
439			if(!state.quadLayoutDepthBuffer)
440			{
441				// FIXME: Properly optimizes?
442				zValue.xy = *Pointer<Float4>(buffer);
443				zValue.zw = *Pointer<Float4>(buffer + pitch - 8);
444			}
445			else
446			{
447				zValue = *Pointer<Float4>(buffer, 16);
448			}
449		}
450
451		Int4 zTest;
452
453		switch(state.depthCompareMode)
454		{
455		case DEPTH_ALWAYS:
456			// Optimized
457			break;
458		case DEPTH_NEVER:
459			// Optimized
460			break;
461		case DEPTH_EQUAL:
462			zTest = CmpEQ(zValue, Z);
463			break;
464		case DEPTH_NOTEQUAL:
465			zTest = CmpNEQ(zValue, Z);
466			break;
467		case DEPTH_LESS:
468			if(complementaryDepthBuffer)
469			{
470				zTest = CmpLT(zValue, Z);
471			}
472			else
473			{
474				zTest = CmpNLE(zValue, Z);
475			}
476			break;
477		case DEPTH_GREATEREQUAL:
478			if(complementaryDepthBuffer)
479			{
480				zTest = CmpNLT(zValue, Z);
481			}
482			else
483			{
484				zTest = CmpLE(zValue, Z);
485			}
486			break;
487		case DEPTH_LESSEQUAL:
488			if(complementaryDepthBuffer)
489			{
490				zTest = CmpLE(zValue, Z);
491			}
492			else
493			{
494				zTest = CmpNLT(zValue, Z);
495			}
496			break;
497		case DEPTH_GREATER:
498			if(complementaryDepthBuffer)
499			{
500				zTest = CmpNLE(zValue, Z);
501			}
502			else
503			{
504				zTest = CmpLT(zValue, Z);
505			}
506			break;
507		default:
508			ASSERT(false);
509		}
510
511		switch(state.depthCompareMode)
512		{
513		case DEPTH_ALWAYS:
514			zMask = cMask;
515			break;
516		case DEPTH_NEVER:
517			zMask = 0x0;
518			break;
519		default:
520			zMask = SignMask(zTest) & cMask;
521			break;
522		}
523
524		if(state.stencilActive)
525		{
526			zMask &= sMask;
527		}
528
529		return zMask != 0;
530	}
531
532	void PixelRoutine::alphaTest(Int &aMask, Short4 &alpha)
533	{
534		Short4 cmp;
535		Short4 equal;
536
537		switch(state.alphaCompareMode)
538		{
539		case ALPHA_ALWAYS:
540			aMask = 0xF;
541			break;
542		case ALPHA_NEVER:
543			aMask = 0x0;
544			break;
545		case ALPHA_EQUAL:
546			cmp = CmpEQ(alpha, *Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4)));
547			aMask = SignMask(Pack(cmp, Short4(0x0000)));
548			break;
549		case ALPHA_NOTEQUAL:       // a != b ~ !(a == b)
550			cmp = CmpEQ(alpha, *Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4))) ^ Short4(0xFFFFu);   // FIXME
551			aMask = SignMask(Pack(cmp, Short4(0x0000)));
552			break;
553		case ALPHA_LESS:           // a < b ~ b > a
554			cmp = CmpGT(*Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4)), alpha);
555			aMask = SignMask(Pack(cmp, Short4(0x0000)));
556			break;
557		case ALPHA_GREATEREQUAL:   // a >= b ~ (a > b) || (a == b) ~ !(b > a)   // TODO: Approximate
558			equal = CmpEQ(alpha, *Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4)));
559			cmp = CmpGT(alpha, *Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4)));
560			cmp |= equal;
561			aMask = SignMask(Pack(cmp, Short4(0x0000)));
562			break;
563		case ALPHA_LESSEQUAL:      // a <= b ~ !(a > b)
564			cmp = CmpGT(alpha, *Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4))) ^ Short4(0xFFFFu);   // FIXME
565			aMask = SignMask(Pack(cmp, Short4(0x0000)));
566			break;
567		case ALPHA_GREATER:        // a > b
568			cmp = CmpGT(alpha, *Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4)));
569			aMask = SignMask(Pack(cmp, Short4(0x0000)));
570			break;
571		default:
572			ASSERT(false);
573		}
574	}
575
576	void PixelRoutine::alphaToCoverage(Int cMask[4], Float4 &alpha)
577	{
578		Int4 coverage0 = CmpNLT(alpha, *Pointer<Float4>(data + OFFSET(DrawData,a2c0)));
579		Int4 coverage1 = CmpNLT(alpha, *Pointer<Float4>(data + OFFSET(DrawData,a2c1)));
580		Int4 coverage2 = CmpNLT(alpha, *Pointer<Float4>(data + OFFSET(DrawData,a2c2)));
581		Int4 coverage3 = CmpNLT(alpha, *Pointer<Float4>(data + OFFSET(DrawData,a2c3)));
582
583		Int aMask0 = SignMask(coverage0);
584		Int aMask1 = SignMask(coverage1);
585		Int aMask2 = SignMask(coverage2);
586		Int aMask3 = SignMask(coverage3);
587
588		cMask[0] &= aMask0;
589		cMask[1] &= aMask1;
590		cMask[2] &= aMask2;
591		cMask[3] &= aMask3;
592	}
593
594	void PixelRoutine::fogBlend(Vector4f &c0, Float4 &fog)
595	{
596		if(!state.fogActive)
597		{
598			return;
599		}
600
601		if(state.pixelFogMode != FOG_NONE)
602		{
603			pixelFog(fog);
604
605			fog = Min(fog, Float4(1.0f));
606			fog = Max(fog, Float4(0.0f));
607		}
608
609		c0.x -= *Pointer<Float4>(data + OFFSET(DrawData,fog.colorF[0]));
610		c0.y -= *Pointer<Float4>(data + OFFSET(DrawData,fog.colorF[1]));
611		c0.z -= *Pointer<Float4>(data + OFFSET(DrawData,fog.colorF[2]));
612
613		c0.x *= fog;
614		c0.y *= fog;
615		c0.z *= fog;
616
617		c0.x += *Pointer<Float4>(data + OFFSET(DrawData,fog.colorF[0]));
618		c0.y += *Pointer<Float4>(data + OFFSET(DrawData,fog.colorF[1]));
619		c0.z += *Pointer<Float4>(data + OFFSET(DrawData,fog.colorF[2]));
620	}
621
622	void PixelRoutine::pixelFog(Float4 &visibility)
623	{
624		Float4 &zw = visibility;
625
626		if(state.pixelFogMode != FOG_NONE)
627		{
628			if(state.wBasedFog)
629			{
630				zw = rhw;
631			}
632			else
633			{
634				if(complementaryDepthBuffer)
635				{
636					zw = Float4(1.0f) - z[0];
637				}
638				else
639				{
640					zw = z[0];
641				}
642			}
643		}
644
645		switch(state.pixelFogMode)
646		{
647		case FOG_NONE:
648			break;
649		case FOG_LINEAR:
650			zw *= *Pointer<Float4>(data + OFFSET(DrawData,fog.scale));
651			zw += *Pointer<Float4>(data + OFFSET(DrawData,fog.offset));
652			break;
653		case FOG_EXP:
654			zw *= *Pointer<Float4>(data + OFFSET(DrawData,fog.densityE));
655			zw = exponential2(zw, true);
656			break;
657		case FOG_EXP2:
658			zw *= zw;
659			zw *= *Pointer<Float4>(data + OFFSET(DrawData,fog.density2E));
660			zw = exponential2(zw, true);
661			break;
662		default:
663			ASSERT(false);
664		}
665	}
666
667	void PixelRoutine::writeDepth(Pointer<Byte> &zBuffer, int q, Int &x, Float4 &z, Int &zMask)
668	{
669		if(!state.depthWriteEnable)
670		{
671			return;
672		}
673
674		Float4 Z = z;
675
676		if(shader && shader->depthOverride())
677		{
678			if(complementaryDepthBuffer)
679			{
680				Z = Float4(1.0f) - oDepth;
681			}
682			else
683			{
684				Z = oDepth;
685			}
686		}
687
688		Pointer<Byte> buffer;
689		Int pitch;
690
691		if(!state.quadLayoutDepthBuffer)
692		{
693			buffer = zBuffer + 4 * x;
694			pitch = *Pointer<Int>(data + OFFSET(DrawData,depthPitchB));
695		}
696		else
697		{
698			buffer = zBuffer + 8 * x;
699		}
700
701		if(q > 0)
702		{
703			buffer += q * *Pointer<Int>(data + OFFSET(DrawData,depthSliceB));
704		}
705
706		Float4 zValue;
707
708		if(state.depthCompareMode != DEPTH_NEVER || (state.depthCompareMode != DEPTH_ALWAYS && !state.depthWriteEnable))
709		{
710			if(!state.quadLayoutDepthBuffer)
711			{
712				// FIXME: Properly optimizes?
713				zValue.xy = *Pointer<Float4>(buffer);
714				zValue.zw = *Pointer<Float4>(buffer + pitch - 8);
715			}
716			else
717			{
718				zValue = *Pointer<Float4>(buffer, 16);
719			}
720		}
721
722		Z = As<Float4>(As<Int4>(Z) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X) + zMask * 16, 16));
723		zValue = As<Float4>(As<Int4>(zValue) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X) + zMask * 16, 16));
724		Z = As<Float4>(As<Int4>(Z) | As<Int4>(zValue));
725
726		if(!state.quadLayoutDepthBuffer)
727		{
728			// FIXME: Properly optimizes?
729			*Pointer<Float2>(buffer) = Float2(Z.xy);
730			*Pointer<Float2>(buffer + pitch) = Float2(Z.zw);
731		}
732		else
733		{
734			*Pointer<Float4>(buffer, 16) = Z;
735		}
736	}
737
738	void PixelRoutine::writeStencil(Pointer<Byte> &sBuffer, int q, Int &x, Int &sMask, Int &zMask, Int &cMask)
739	{
740		if(!state.stencilActive)
741		{
742			return;
743		}
744
745		if(state.stencilPassOperation == OPERATION_KEEP && state.stencilZFailOperation == OPERATION_KEEP && state.stencilFailOperation == OPERATION_KEEP)
746		{
747			if(!state.twoSidedStencil || (state.stencilPassOperationCCW == OPERATION_KEEP && state.stencilZFailOperationCCW == OPERATION_KEEP && state.stencilFailOperationCCW == OPERATION_KEEP))
748			{
749				return;
750			}
751		}
752
753		if(state.stencilWriteMasked && (!state.twoSidedStencil || state.stencilWriteMaskedCCW))
754		{
755			return;
756		}
757
758		Pointer<Byte> buffer = sBuffer + 2 * x;
759
760		if(q > 0)
761		{
762			buffer += q * *Pointer<Int>(data + OFFSET(DrawData,stencilSliceB));
763		}
764
765		Byte8 bufferValue = *Pointer<Byte8>(buffer);
766
767		Byte8 newValue;
768		stencilOperation(newValue, bufferValue, state.stencilPassOperation, state.stencilZFailOperation, state.stencilFailOperation, false, zMask, sMask);
769
770		if(!state.noStencilWriteMask)
771		{
772			Byte8 maskedValue = bufferValue;
773			newValue &= *Pointer<Byte8>(data + OFFSET(DrawData,stencil[0].writeMaskQ));
774			maskedValue &= *Pointer<Byte8>(data + OFFSET(DrawData,stencil[0].invWriteMaskQ));
775			newValue |= maskedValue;
776		}
777
778		if(state.twoSidedStencil)
779		{
780			Byte8 newValueCCW;
781
782			stencilOperation(newValueCCW, bufferValue, state.stencilPassOperationCCW, state.stencilZFailOperationCCW, state.stencilFailOperationCCW, true, zMask, sMask);
783
784			if(!state.noStencilWriteMaskCCW)
785			{
786				Byte8 maskedValue = bufferValue;
787				newValueCCW &= *Pointer<Byte8>(data + OFFSET(DrawData,stencil[1].writeMaskQ));
788				maskedValue &= *Pointer<Byte8>(data + OFFSET(DrawData,stencil[1].invWriteMaskQ));
789				newValueCCW |= maskedValue;
790			}
791
792			newValue &= *Pointer<Byte8>(primitive + OFFSET(Primitive,clockwiseMask));
793			newValueCCW &= *Pointer<Byte8>(primitive + OFFSET(Primitive,invClockwiseMask));
794			newValue |= newValueCCW;
795		}
796
797		newValue &= *Pointer<Byte8>(constants + OFFSET(Constants,maskB4Q) + 8 * cMask);
798		bufferValue &= *Pointer<Byte8>(constants + OFFSET(Constants,invMaskB4Q) + 8 * cMask);
799		newValue |= bufferValue;
800
801		*Pointer<Byte4>(buffer) = Byte4(newValue);
802	}
803
804	void PixelRoutine::stencilOperation(Byte8 &newValue, Byte8 &bufferValue, StencilOperation stencilPassOperation, StencilOperation stencilZFailOperation, StencilOperation stencilFailOperation, bool CCW, Int &zMask, Int &sMask)
805	{
806		Byte8 &pass = newValue;
807		Byte8 fail;
808		Byte8 zFail;
809
810		stencilOperation(pass, bufferValue, stencilPassOperation, CCW);
811
812		if(stencilZFailOperation != stencilPassOperation)
813		{
814			stencilOperation(zFail, bufferValue, stencilZFailOperation, CCW);
815		}
816
817		if(stencilFailOperation != stencilPassOperation || stencilFailOperation != stencilZFailOperation)
818		{
819			stencilOperation(fail, bufferValue, stencilFailOperation, CCW);
820		}
821
822		if(stencilFailOperation != stencilPassOperation || stencilFailOperation != stencilZFailOperation)
823		{
824			if(state.depthTestActive && stencilZFailOperation != stencilPassOperation)   // zMask valid and values not the same
825			{
826				pass &= *Pointer<Byte8>(constants + OFFSET(Constants,maskB4Q) + 8 * zMask);
827				zFail &= *Pointer<Byte8>(constants + OFFSET(Constants,invMaskB4Q) + 8 * zMask);
828				pass |= zFail;
829			}
830
831			pass &= *Pointer<Byte8>(constants + OFFSET(Constants,maskB4Q) + 8 * sMask);
832			fail &= *Pointer<Byte8>(constants + OFFSET(Constants,invMaskB4Q) + 8 * sMask);
833			pass |= fail;
834		}
835	}
836
837	void PixelRoutine::stencilOperation(Byte8 &output, Byte8 &bufferValue, StencilOperation operation, bool CCW)
838	{
839		switch(operation)
840		{
841		case OPERATION_KEEP:
842			output = bufferValue;
843			break;
844		case OPERATION_ZERO:
845			output = Byte8(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
846			break;
847		case OPERATION_REPLACE:
848			output = *Pointer<Byte8>(data + OFFSET(DrawData,stencil[CCW].referenceQ));
849			break;
850		case OPERATION_INCRSAT:
851			output = AddSat(bufferValue, Byte8(1, 1, 1, 1, 1, 1, 1, 1));
852			break;
853		case OPERATION_DECRSAT:
854			output = SubSat(bufferValue, Byte8(1, 1, 1, 1, 1, 1, 1, 1));
855			break;
856		case OPERATION_INVERT:
857			output = bufferValue ^ Byte8(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
858			break;
859		case OPERATION_INCR:
860			output = bufferValue + Byte8(1, 1, 1, 1, 1, 1, 1, 1);
861			break;
862		case OPERATION_DECR:
863			output = bufferValue - Byte8(1, 1, 1, 1, 1, 1, 1, 1);
864			break;
865		default:
866			ASSERT(false);
867		}
868	}
869
870	void PixelRoutine::blendFactor(Vector4s &blendFactor, const Vector4s &current, const Vector4s &pixel, BlendFactor blendFactorActive)
871	{
872		switch(blendFactorActive)
873		{
874		case BLEND_ZERO:
875			// Optimized
876			break;
877		case BLEND_ONE:
878			// Optimized
879			break;
880		case BLEND_SOURCE:
881			blendFactor.x = current.x;
882			blendFactor.y = current.y;
883			blendFactor.z = current.z;
884			break;
885		case BLEND_INVSOURCE:
886			blendFactor.x = Short4(0xFFFFu) - current.x;
887			blendFactor.y = Short4(0xFFFFu) - current.y;
888			blendFactor.z = Short4(0xFFFFu) - current.z;
889			break;
890		case BLEND_DEST:
891			blendFactor.x = pixel.x;
892			blendFactor.y = pixel.y;
893			blendFactor.z = pixel.z;
894			break;
895		case BLEND_INVDEST:
896			blendFactor.x = Short4(0xFFFFu) - pixel.x;
897			blendFactor.y = Short4(0xFFFFu) - pixel.y;
898			blendFactor.z = Short4(0xFFFFu) - pixel.z;
899			break;
900		case BLEND_SOURCEALPHA:
901			blendFactor.x = current.w;
902			blendFactor.y = current.w;
903			blendFactor.z = current.w;
904			break;
905		case BLEND_INVSOURCEALPHA:
906			blendFactor.x = Short4(0xFFFFu) - current.w;
907			blendFactor.y = Short4(0xFFFFu) - current.w;
908			blendFactor.z = Short4(0xFFFFu) - current.w;
909			break;
910		case BLEND_DESTALPHA:
911			blendFactor.x = pixel.w;
912			blendFactor.y = pixel.w;
913			blendFactor.z = pixel.w;
914			break;
915		case BLEND_INVDESTALPHA:
916			blendFactor.x = Short4(0xFFFFu) - pixel.w;
917			blendFactor.y = Short4(0xFFFFu) - pixel.w;
918			blendFactor.z = Short4(0xFFFFu) - pixel.w;
919			break;
920		case BLEND_SRCALPHASAT:
921			blendFactor.x = Short4(0xFFFFu) - pixel.w;
922			blendFactor.x = Min(As<UShort4>(blendFactor.x), As<UShort4>(current.w));
923			blendFactor.y = blendFactor.x;
924			blendFactor.z = blendFactor.x;
925			break;
926		case BLEND_CONSTANT:
927			blendFactor.x = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[0]));
928			blendFactor.y = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[1]));
929			blendFactor.z = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[2]));
930			break;
931		case BLEND_INVCONSTANT:
932			blendFactor.x = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[0]));
933			blendFactor.y = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[1]));
934			blendFactor.z = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[2]));
935			break;
936		case BLEND_CONSTANTALPHA:
937			blendFactor.x = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[3]));
938			blendFactor.y = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[3]));
939			blendFactor.z = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[3]));
940			break;
941		case BLEND_INVCONSTANTALPHA:
942			blendFactor.x = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[3]));
943			blendFactor.y = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[3]));
944			blendFactor.z = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[3]));
945			break;
946		default:
947			ASSERT(false);
948		}
949	}
950
951	void PixelRoutine::blendFactorAlpha(Vector4s &blendFactor, const Vector4s &current, const Vector4s &pixel, BlendFactor blendFactorAlphaActive)
952	{
953		switch(blendFactorAlphaActive)
954		{
955		case BLEND_ZERO:
956			// Optimized
957			break;
958		case BLEND_ONE:
959			// Optimized
960			break;
961		case BLEND_SOURCE:
962			blendFactor.w = current.w;
963			break;
964		case BLEND_INVSOURCE:
965			blendFactor.w = Short4(0xFFFFu) - current.w;
966			break;
967		case BLEND_DEST:
968			blendFactor.w = pixel.w;
969			break;
970		case BLEND_INVDEST:
971			blendFactor.w = Short4(0xFFFFu) - pixel.w;
972			break;
973		case BLEND_SOURCEALPHA:
974			blendFactor.w = current.w;
975			break;
976		case BLEND_INVSOURCEALPHA:
977			blendFactor.w = Short4(0xFFFFu) - current.w;
978			break;
979		case BLEND_DESTALPHA:
980			blendFactor.w = pixel.w;
981			break;
982		case BLEND_INVDESTALPHA:
983			blendFactor.w = Short4(0xFFFFu) - pixel.w;
984			break;
985		case BLEND_SRCALPHASAT:
986			blendFactor.w = Short4(0xFFFFu);
987			break;
988		case BLEND_CONSTANT:
989		case BLEND_CONSTANTALPHA:
990			blendFactor.w = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[3]));
991			break;
992		case BLEND_INVCONSTANT:
993		case BLEND_INVCONSTANTALPHA:
994			blendFactor.w = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[3]));
995			break;
996		default:
997			ASSERT(false);
998		}
999	}
1000
1001	bool PixelRoutine::isSRGB(int index) const
1002	{
1003		return state.targetFormat[index] == FORMAT_SRGB8_A8 || state.targetFormat[index] == FORMAT_SRGB8_X8;
1004	}
1005
1006	void PixelRoutine::readPixel(int index, Pointer<Byte> &cBuffer, Int &x, Vector4s &pixel)
1007	{
1008		Short4 c01;
1009		Short4 c23;
1010		Pointer<Byte> buffer;
1011		Pointer<Byte> buffer2;
1012
1013		switch(state.targetFormat[index])
1014		{
1015		case FORMAT_R5G6B5:
1016			buffer = cBuffer + 2 * x;
1017			buffer2 = buffer + *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1018			c01 = As<Short4>(Int2(*Pointer<Int>(buffer), *Pointer<Int>(buffer2)));
1019
1020			pixel.x = c01 & Short4(0xF800u);
1021			pixel.y = (c01 & Short4(0x07E0u)) << 5;
1022			pixel.z = (c01 & Short4(0x001Fu)) << 11;
1023			pixel.w = Short4(0xFFFFu);
1024			break;
1025		case FORMAT_A8R8G8B8:
1026			buffer = cBuffer + 4 * x;
1027			c01 = *Pointer<Short4>(buffer);
1028			buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1029			c23 = *Pointer<Short4>(buffer);
1030			pixel.z = c01;
1031			pixel.y = c01;
1032			pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(c23));
1033			pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(c23));
1034			pixel.x = pixel.z;
1035			pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.y));
1036			pixel.x = UnpackHigh(As<Byte8>(pixel.x), As<Byte8>(pixel.y));
1037			pixel.y = pixel.z;
1038			pixel.w = pixel.x;
1039			pixel.x = UnpackLow(As<Byte8>(pixel.x), As<Byte8>(pixel.x));
1040			pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(pixel.y));
1041			pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.z));
1042			pixel.w = UnpackHigh(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
1043			break;
1044		case FORMAT_A8B8G8R8:
1045		case FORMAT_SRGB8_A8:
1046			buffer = cBuffer + 4 * x;
1047			c01 = *Pointer<Short4>(buffer);
1048			buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1049			c23 = *Pointer<Short4>(buffer);
1050			pixel.z = c01;
1051			pixel.y = c01;
1052			pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(c23));
1053			pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(c23));
1054			pixel.x = pixel.z;
1055			pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.y));
1056			pixel.x = UnpackHigh(As<Byte8>(pixel.x), As<Byte8>(pixel.y));
1057			pixel.y = pixel.z;
1058			pixel.w = pixel.x;
1059			pixel.x = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.z));
1060			pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(pixel.y));
1061			pixel.z = UnpackLow(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
1062			pixel.w = UnpackHigh(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
1063			break;
1064		case FORMAT_A8:
1065			buffer = cBuffer + 1 * x;
1066			pixel.w = Insert(pixel.w, *Pointer<Short>(buffer), 0);
1067			buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1068			pixel.w = Insert(pixel.w, *Pointer<Short>(buffer), 1);
1069			pixel.w = UnpackLow(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
1070			pixel.x = Short4(0x0000);
1071			pixel.y = Short4(0x0000);
1072			pixel.z = Short4(0x0000);
1073			break;
1074		case FORMAT_X8R8G8B8:
1075			buffer = cBuffer + 4 * x;
1076			c01 = *Pointer<Short4>(buffer);
1077			buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1078			c23 = *Pointer<Short4>(buffer);
1079			pixel.z = c01;
1080			pixel.y = c01;
1081			pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(c23));
1082			pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(c23));
1083			pixel.x = pixel.z;
1084			pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.y));
1085			pixel.x = UnpackHigh(As<Byte8>(pixel.x), As<Byte8>(pixel.y));
1086			pixel.y = pixel.z;
1087			pixel.x = UnpackLow(As<Byte8>(pixel.x), As<Byte8>(pixel.x));
1088			pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(pixel.y));
1089			pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.z));
1090			pixel.w = Short4(0xFFFFu);
1091			break;
1092		case FORMAT_X8B8G8R8:
1093		case FORMAT_SRGB8_X8:
1094			buffer = cBuffer + 4 * x;
1095			c01 = *Pointer<Short4>(buffer);
1096			buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1097			c23 = *Pointer<Short4>(buffer);
1098			pixel.z = c01;
1099			pixel.y = c01;
1100			pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(c23));
1101			pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(c23));
1102			pixel.x = pixel.z;
1103			pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.y));
1104			pixel.x = UnpackHigh(As<Byte8>(pixel.x), As<Byte8>(pixel.y));
1105			pixel.y = pixel.z;
1106			pixel.w = pixel.x;
1107			pixel.x = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.z));
1108			pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(pixel.y));
1109			pixel.z = UnpackLow(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
1110			pixel.w = Short4(0xFFFFu);
1111			break;
1112		case FORMAT_A8G8R8B8Q:
1113			UNIMPLEMENTED();
1114		//	pixel.z = UnpackLow(As<Byte8>(pixel.z), *Pointer<Byte8>(cBuffer + 8 * x + 0));
1115		//	pixel.x = UnpackHigh(As<Byte8>(pixel.x), *Pointer<Byte8>(cBuffer + 8 * x + 0));
1116		//	pixel.y = UnpackLow(As<Byte8>(pixel.y), *Pointer<Byte8>(cBuffer + 8 * x + 8));
1117		//	pixel.w = UnpackHigh(As<Byte8>(pixel.w), *Pointer<Byte8>(cBuffer + 8 * x + 8));
1118			break;
1119		case FORMAT_X8G8R8B8Q:
1120			UNIMPLEMENTED();
1121		//	pixel.z = UnpackLow(As<Byte8>(pixel.z), *Pointer<Byte8>(cBuffer + 8 * x + 0));
1122		//	pixel.x = UnpackHigh(As<Byte8>(pixel.x), *Pointer<Byte8>(cBuffer + 8 * x + 0));
1123		//	pixel.y = UnpackLow(As<Byte8>(pixel.y), *Pointer<Byte8>(cBuffer + 8 * x + 8));
1124		//	pixel.w = Short4(0xFFFFu);
1125			break;
1126		case FORMAT_A16B16G16R16:
1127			buffer = cBuffer;
1128			pixel.x = *Pointer<Short4>(buffer + 8 * x);
1129			pixel.y = *Pointer<Short4>(buffer + 8 * x + 8);
1130			buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1131			pixel.z = *Pointer<Short4>(buffer + 8 * x);
1132			pixel.w = *Pointer<Short4>(buffer + 8 * x + 8);
1133			transpose4x4(pixel.x, pixel.y, pixel.z, pixel.w);
1134			break;
1135		case FORMAT_G16R16:
1136			buffer = cBuffer;
1137			pixel.x = *Pointer<Short4>(buffer + 4 * x);
1138			buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1139			pixel.y = *Pointer<Short4>(buffer + 4 * x);
1140			pixel.z = pixel.x;
1141			pixel.x = As<Short4>(UnpackLow(pixel.x, pixel.y));
1142			pixel.z = As<Short4>(UnpackHigh(pixel.z, pixel.y));
1143			pixel.y = pixel.z;
1144			pixel.x = As<Short4>(UnpackLow(pixel.x, pixel.z));
1145			pixel.y = As<Short4>(UnpackHigh(pixel.y, pixel.z));
1146			pixel.z = Short4(0xFFFFu);
1147			pixel.w = Short4(0xFFFFu);
1148			break;
1149		default:
1150			ASSERT(false);
1151		}
1152
1153		if((postBlendSRGB && state.writeSRGB) || isSRGB(index))
1154		{
1155			sRGBtoLinear16_12_16(pixel);
1156		}
1157	}
1158
1159	void PixelRoutine::alphaBlend(int index, Pointer<Byte> &cBuffer, Vector4s &current, Int &x)
1160	{
1161		if(!state.alphaBlendActive)
1162		{
1163			return;
1164		}
1165
1166		Vector4s pixel;
1167		readPixel(index, cBuffer, x, pixel);
1168
1169		// Final Color = ObjectColor * SourceBlendFactor + PixelColor * DestinationBlendFactor
1170		Vector4s sourceFactor;
1171		Vector4s destFactor;
1172
1173		blendFactor(sourceFactor, current, pixel, state.sourceBlendFactor);
1174		blendFactor(destFactor, current, pixel, state.destBlendFactor);
1175
1176		if(state.sourceBlendFactor != BLEND_ONE && state.sourceBlendFactor != BLEND_ZERO)
1177		{
1178			current.x = MulHigh(As<UShort4>(current.x), As<UShort4>(sourceFactor.x));
1179			current.y = MulHigh(As<UShort4>(current.y), As<UShort4>(sourceFactor.y));
1180			current.z = MulHigh(As<UShort4>(current.z), As<UShort4>(sourceFactor.z));
1181		}
1182
1183		if(state.destBlendFactor != BLEND_ONE && state.destBlendFactor != BLEND_ZERO)
1184		{
1185			pixel.x = MulHigh(As<UShort4>(pixel.x), As<UShort4>(destFactor.x));
1186			pixel.y = MulHigh(As<UShort4>(pixel.y), As<UShort4>(destFactor.y));
1187			pixel.z = MulHigh(As<UShort4>(pixel.z), As<UShort4>(destFactor.z));
1188		}
1189
1190		switch(state.blendOperation)
1191		{
1192		case BLENDOP_ADD:
1193			current.x = AddSat(As<UShort4>(current.x), As<UShort4>(pixel.x));
1194			current.y = AddSat(As<UShort4>(current.y), As<UShort4>(pixel.y));
1195			current.z = AddSat(As<UShort4>(current.z), As<UShort4>(pixel.z));
1196			break;
1197		case BLENDOP_SUB:
1198			current.x = SubSat(As<UShort4>(current.x), As<UShort4>(pixel.x));
1199			current.y = SubSat(As<UShort4>(current.y), As<UShort4>(pixel.y));
1200			current.z = SubSat(As<UShort4>(current.z), As<UShort4>(pixel.z));
1201			break;
1202		case BLENDOP_INVSUB:
1203			current.x = SubSat(As<UShort4>(pixel.x), As<UShort4>(current.x));
1204			current.y = SubSat(As<UShort4>(pixel.y), As<UShort4>(current.y));
1205			current.z = SubSat(As<UShort4>(pixel.z), As<UShort4>(current.z));
1206			break;
1207		case BLENDOP_MIN:
1208			current.x = Min(As<UShort4>(current.x), As<UShort4>(pixel.x));
1209			current.y = Min(As<UShort4>(current.y), As<UShort4>(pixel.y));
1210			current.z = Min(As<UShort4>(current.z), As<UShort4>(pixel.z));
1211			break;
1212		case BLENDOP_MAX:
1213			current.x = Max(As<UShort4>(current.x), As<UShort4>(pixel.x));
1214			current.y = Max(As<UShort4>(current.y), As<UShort4>(pixel.y));
1215			current.z = Max(As<UShort4>(current.z), As<UShort4>(pixel.z));
1216			break;
1217		case BLENDOP_SOURCE:
1218			// No operation
1219			break;
1220		case BLENDOP_DEST:
1221			current.x = pixel.x;
1222			current.y = pixel.y;
1223			current.z = pixel.z;
1224			break;
1225		case BLENDOP_NULL:
1226			current.x = Short4(0x0000);
1227			current.y = Short4(0x0000);
1228			current.z = Short4(0x0000);
1229			break;
1230		default:
1231			ASSERT(false);
1232		}
1233
1234		blendFactorAlpha(sourceFactor, current, pixel, state.sourceBlendFactorAlpha);
1235		blendFactorAlpha(destFactor, current, pixel, state.destBlendFactorAlpha);
1236
1237		if(state.sourceBlendFactorAlpha != BLEND_ONE && state.sourceBlendFactorAlpha != BLEND_ZERO)
1238		{
1239			current.w = MulHigh(As<UShort4>(current.w), As<UShort4>(sourceFactor.w));
1240		}
1241
1242		if(state.destBlendFactorAlpha != BLEND_ONE && state.destBlendFactorAlpha != BLEND_ZERO)
1243		{
1244			pixel.w = MulHigh(As<UShort4>(pixel.w), As<UShort4>(destFactor.w));
1245		}
1246
1247		switch(state.blendOperationAlpha)
1248		{
1249		case BLENDOP_ADD:
1250			current.w = AddSat(As<UShort4>(current.w), As<UShort4>(pixel.w));
1251			break;
1252		case BLENDOP_SUB:
1253			current.w = SubSat(As<UShort4>(current.w), As<UShort4>(pixel.w));
1254			break;
1255		case BLENDOP_INVSUB:
1256			current.w = SubSat(As<UShort4>(pixel.w), As<UShort4>(current.w));
1257			break;
1258		case BLENDOP_MIN:
1259			current.w = Min(As<UShort4>(current.w), As<UShort4>(pixel.w));
1260			break;
1261		case BLENDOP_MAX:
1262			current.w = Max(As<UShort4>(current.w), As<UShort4>(pixel.w));
1263			break;
1264		case BLENDOP_SOURCE:
1265			// No operation
1266			break;
1267		case BLENDOP_DEST:
1268			current.w = pixel.w;
1269			break;
1270		case BLENDOP_NULL:
1271			current.w = Short4(0x0000);
1272			break;
1273		default:
1274			ASSERT(false);
1275		}
1276	}
1277
1278	void PixelRoutine::logicOperation(int index, Pointer<Byte> &cBuffer, Vector4s &current, Int &x)
1279	{
1280		if(state.logicalOperation == LOGICALOP_COPY)
1281		{
1282			return;
1283		}
1284
1285		Vector4s pixel;
1286		readPixel(index, cBuffer, x, pixel);
1287
1288		switch(state.logicalOperation)
1289		{
1290		case LOGICALOP_CLEAR:
1291			current.x = UShort4(0);
1292			current.y = UShort4(0);
1293			current.z = UShort4(0);
1294			break;
1295		case LOGICALOP_SET:
1296			current.x = UShort4(0xFFFFu);
1297			current.y = UShort4(0xFFFFu);
1298			current.z = UShort4(0xFFFFu);
1299			break;
1300		case LOGICALOP_COPY:
1301			ASSERT(false);   // Optimized out
1302			break;
1303		case LOGICALOP_COPY_INVERTED:
1304			current.x = ~current.x;
1305			current.y = ~current.y;
1306			current.z = ~current.z;
1307			break;
1308		case LOGICALOP_NOOP:
1309			current.x = pixel.x;
1310			current.y = pixel.y;
1311			current.z = pixel.z;
1312			break;
1313		case LOGICALOP_INVERT:
1314			current.x = ~pixel.x;
1315			current.y = ~pixel.y;
1316			current.z = ~pixel.z;
1317			break;
1318		case LOGICALOP_AND:
1319			current.x = pixel.x & current.x;
1320			current.y = pixel.y & current.y;
1321			current.z = pixel.z & current.z;
1322			break;
1323		case LOGICALOP_NAND:
1324			current.x = ~(pixel.x & current.x);
1325			current.y = ~(pixel.y & current.y);
1326			current.z = ~(pixel.z & current.z);
1327			break;
1328		case LOGICALOP_OR:
1329			current.x = pixel.x | current.x;
1330			current.y = pixel.y | current.y;
1331			current.z = pixel.z | current.z;
1332			break;
1333		case LOGICALOP_NOR:
1334			current.x = ~(pixel.x | current.x);
1335			current.y = ~(pixel.y | current.y);
1336			current.z = ~(pixel.z | current.z);
1337			break;
1338		case LOGICALOP_XOR:
1339			current.x = pixel.x ^ current.x;
1340			current.y = pixel.y ^ current.y;
1341			current.z = pixel.z ^ current.z;
1342			break;
1343		case LOGICALOP_EQUIV:
1344			current.x = ~(pixel.x ^ current.x);
1345			current.y = ~(pixel.y ^ current.y);
1346			current.z = ~(pixel.z ^ current.z);
1347			break;
1348		case LOGICALOP_AND_REVERSE:
1349			current.x = ~pixel.x & current.x;
1350			current.y = ~pixel.y & current.y;
1351			current.z = ~pixel.z & current.z;
1352			break;
1353		case LOGICALOP_AND_INVERTED:
1354			current.x = pixel.x & ~current.x;
1355			current.y = pixel.y & ~current.y;
1356			current.z = pixel.z & ~current.z;
1357			break;
1358		case LOGICALOP_OR_REVERSE:
1359			current.x = ~pixel.x | current.x;
1360			current.y = ~pixel.y | current.y;
1361			current.z = ~pixel.z | current.z;
1362			break;
1363		case LOGICALOP_OR_INVERTED:
1364			current.x = pixel.x | ~current.x;
1365			current.y = pixel.y | ~current.y;
1366			current.z = pixel.z | ~current.z;
1367			break;
1368		default:
1369			ASSERT(false);
1370		}
1371	}
1372
1373	void PixelRoutine::writeColor(int index, Pointer<Byte> &cBuffer, Int &x, Vector4s &current, Int &sMask, Int &zMask, Int &cMask)
1374	{
1375		if((postBlendSRGB && state.writeSRGB) || isSRGB(index))
1376		{
1377			linearToSRGB16_12_16(current);
1378		}
1379
1380		if(exactColorRounding)
1381		{
1382			switch(state.targetFormat[index])
1383			{
1384			case FORMAT_R5G6B5:
1385				current.x = AddSat(As<UShort4>(current.x), UShort4(0x0400));
1386				current.y = AddSat(As<UShort4>(current.y), UShort4(0x0200));
1387				current.z = AddSat(As<UShort4>(current.z), UShort4(0x0400));
1388				break;
1389			case FORMAT_X8G8R8B8Q:
1390			case FORMAT_A8G8R8B8Q:
1391			case FORMAT_X8R8G8B8:
1392			case FORMAT_X8B8G8R8:
1393			case FORMAT_A8R8G8B8:
1394			case FORMAT_A8B8G8R8:
1395			case FORMAT_SRGB8_X8:
1396			case FORMAT_SRGB8_A8:
1397			case FORMAT_G8R8:
1398			case FORMAT_R8:
1399				current.x = current.x - As<Short4>(As<UShort4>(current.x) >> 8) + Short4(0x0080);
1400				current.y = current.y - As<Short4>(As<UShort4>(current.y) >> 8) + Short4(0x0080);
1401				current.z = current.z - As<Short4>(As<UShort4>(current.z) >> 8) + Short4(0x0080);
1402				current.w = current.w - As<Short4>(As<UShort4>(current.w) >> 8) + Short4(0x0080);
1403				break;
1404			default:
1405				break;
1406			}
1407		}
1408
1409		int rgbaWriteMask = state.colorWriteActive(index);
1410		int bgraWriteMask = (rgbaWriteMask & 0x0000000A) | (rgbaWriteMask & 0x00000001) << 2 | (rgbaWriteMask & 0x00000004) >> 2;
1411
1412		switch(state.targetFormat[index])
1413		{
1414		case FORMAT_R5G6B5:
1415			{
1416				current.x = current.x & Short4(0xF800u);
1417				current.y = As<UShort4>(current.y & Short4(0xFC00u)) >> 5;
1418				current.z = As<UShort4>(current.z) >> 11;
1419
1420				current.x = current.x | current.y | current.z;
1421			}
1422			break;
1423		case FORMAT_X8G8R8B8Q:
1424			UNIMPLEMENTED();
1425		//	current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1426		//	current.y = As<Short4>(As<UShort4>(current.y) >> 8);
1427		//	current.z = As<Short4>(As<UShort4>(current.z) >> 8);
1428
1429		//	current.z = As<Short4>(Pack(As<UShort4>(current.z), As<UShort4>(current.x)));
1430		//	current.y = As<Short4>(Pack(As<UShort4>(current.y), As<UShort4>(current.y)));
1431			break;
1432		case FORMAT_A8G8R8B8Q:
1433			UNIMPLEMENTED();
1434		//	current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1435		//	current.y = As<Short4>(As<UShort4>(current.y) >> 8);
1436		//	current.z = As<Short4>(As<UShort4>(current.z) >> 8);
1437		//	current.w = As<Short4>(As<UShort4>(current.w) >> 8);
1438
1439		//	current.z = As<Short4>(Pack(As<UShort4>(current.z), As<UShort4>(current.x)));
1440		//	current.y = As<Short4>(Pack(As<UShort4>(current.y), As<UShort4>(current.w)));
1441			break;
1442		case FORMAT_X8R8G8B8:
1443		case FORMAT_A8R8G8B8:
1444			if(state.targetFormat[index] == FORMAT_X8R8G8B8 || rgbaWriteMask == 0x7)
1445			{
1446				current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1447				current.y = As<Short4>(As<UShort4>(current.y) >> 8);
1448				current.z = As<Short4>(As<UShort4>(current.z) >> 8);
1449
1450				current.z = As<Short4>(Pack(As<UShort4>(current.z), As<UShort4>(current.x)));
1451				current.y = As<Short4>(Pack(As<UShort4>(current.y), As<UShort4>(current.y)));
1452
1453				current.x = current.z;
1454				current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y));
1455				current.x = UnpackHigh(As<Byte8>(current.x), As<Byte8>(current.y));
1456				current.y = current.z;
1457				current.z = As<Short4>(UnpackLow(current.z, current.x));
1458				current.y = As<Short4>(UnpackHigh(current.y, current.x));
1459			}
1460			else
1461			{
1462				current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1463				current.y = As<Short4>(As<UShort4>(current.y) >> 8);
1464				current.z = As<Short4>(As<UShort4>(current.z) >> 8);
1465				current.w = As<Short4>(As<UShort4>(current.w) >> 8);
1466
1467				current.z = As<Short4>(Pack(As<UShort4>(current.z), As<UShort4>(current.x)));
1468				current.y = As<Short4>(Pack(As<UShort4>(current.y), As<UShort4>(current.w)));
1469
1470				current.x = current.z;
1471				current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y));
1472				current.x = UnpackHigh(As<Byte8>(current.x), As<Byte8>(current.y));
1473				current.y = current.z;
1474				current.z = As<Short4>(UnpackLow(current.z, current.x));
1475				current.y = As<Short4>(UnpackHigh(current.y, current.x));
1476			}
1477			break;
1478		case FORMAT_X8B8G8R8:
1479		case FORMAT_A8B8G8R8:
1480		case FORMAT_SRGB8_X8:
1481		case FORMAT_SRGB8_A8:
1482			if(state.targetFormat[index] == FORMAT_X8B8G8R8 || state.targetFormat[index] == FORMAT_SRGB8_X8 || rgbaWriteMask == 0x7)
1483			{
1484				current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1485				current.y = As<Short4>(As<UShort4>(current.y) >> 8);
1486				current.z = As<Short4>(As<UShort4>(current.z) >> 8);
1487
1488				current.z = As<Short4>(Pack(As<UShort4>(current.x), As<UShort4>(current.z)));
1489				current.y = As<Short4>(Pack(As<UShort4>(current.y), As<UShort4>(current.y)));
1490
1491				current.x = current.z;
1492				current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y));
1493				current.x = UnpackHigh(As<Byte8>(current.x), As<Byte8>(current.y));
1494				current.y = current.z;
1495				current.z = As<Short4>(UnpackLow(current.z, current.x));
1496				current.y = As<Short4>(UnpackHigh(current.y, current.x));
1497			}
1498			else
1499			{
1500				current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1501				current.y = As<Short4>(As<UShort4>(current.y) >> 8);
1502				current.z = As<Short4>(As<UShort4>(current.z) >> 8);
1503				current.w = As<Short4>(As<UShort4>(current.w) >> 8);
1504
1505				current.z = As<Short4>(Pack(As<UShort4>(current.x), As<UShort4>(current.z)));
1506				current.y = As<Short4>(Pack(As<UShort4>(current.y), As<UShort4>(current.w)));
1507
1508				current.x = current.z;
1509				current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y));
1510				current.x = UnpackHigh(As<Byte8>(current.x), As<Byte8>(current.y));
1511				current.y = current.z;
1512				current.z = As<Short4>(UnpackLow(current.z, current.x));
1513				current.y = As<Short4>(UnpackHigh(current.y, current.x));
1514			}
1515			break;
1516		case FORMAT_G8R8:
1517			current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1518			current.y = As<Short4>(As<UShort4>(current.y) >> 8);
1519			current.x = As<Short4>(Pack(As<UShort4>(current.x), As<UShort4>(current.x)));
1520			current.y = As<Short4>(Pack(As<UShort4>(current.y), As<UShort4>(current.y)));
1521			current.x = UnpackLow(As<Byte8>(current.x), As<Byte8>(current.y));
1522			break;
1523		case FORMAT_R8:
1524			current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1525			current.x = As<Short4>(Pack(As<UShort4>(current.x), As<UShort4>(current.x)));
1526			break;
1527		case FORMAT_A8:
1528			current.w = As<Short4>(As<UShort4>(current.w) >> 8);
1529			current.w = As<Short4>(Pack(As<UShort4>(current.w), As<UShort4>(current.w)));
1530			break;
1531		case FORMAT_G16R16:
1532			current.z = current.x;
1533			current.x = As<Short4>(UnpackLow(current.x, current.y));
1534			current.z = As<Short4>(UnpackHigh(current.z, current.y));
1535			current.y = current.z;
1536			break;
1537		case FORMAT_A16B16G16R16:
1538			transpose4x4(current.x, current.y, current.z, current.w);
1539			break;
1540		default:
1541			ASSERT(false);
1542		}
1543
1544		Short4 c01 = current.z;
1545		Short4 c23 = current.y;
1546
1547		Int xMask;   // Combination of all masks
1548
1549		if(state.depthTestActive)
1550		{
1551			xMask = zMask;
1552		}
1553		else
1554		{
1555			xMask = cMask;
1556		}
1557
1558		if(state.stencilActive)
1559		{
1560			xMask &= sMask;
1561		}
1562
1563		switch(state.targetFormat[index])
1564		{
1565		case FORMAT_R5G6B5:
1566			{
1567				Pointer<Byte> buffer = cBuffer + 2 * x;
1568				Int value = *Pointer<Int>(buffer);
1569
1570				Int c01 = Extract(As<Int2>(current.x), 0);
1571
1572				if((bgraWriteMask & 0x00000007) != 0x00000007)
1573				{
1574					Int masked = value;
1575					c01 &= *Pointer<Int>(constants + OFFSET(Constants,mask565Q[bgraWriteMask & 0x7][0]));
1576					masked &= *Pointer<Int>(constants + OFFSET(Constants,mask565Q[~bgraWriteMask & 0x7][0]));
1577					c01 |= masked;
1578				}
1579
1580				c01 &= *Pointer<Int>(constants + OFFSET(Constants,maskW4Q[0][0]) + xMask * 8);
1581				value &= *Pointer<Int>(constants + OFFSET(Constants,invMaskW4Q[0][0]) + xMask * 8);
1582				c01 |= value;
1583				*Pointer<Int>(buffer) = c01;
1584
1585				buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
1586				value = *Pointer<Int>(buffer);
1587
1588				Int c23 = Extract(As<Int2>(current.x), 1);
1589
1590				if((bgraWriteMask & 0x00000007) != 0x00000007)
1591				{
1592					Int masked = value;
1593					c23 &= *Pointer<Int>(constants + OFFSET(Constants,mask565Q[bgraWriteMask & 0x7][0]));
1594					masked &= *Pointer<Int>(constants + OFFSET(Constants,mask565Q[~bgraWriteMask & 0x7][0]));
1595					c23 |= masked;
1596				}
1597
1598				c23 &= *Pointer<Int>(constants + OFFSET(Constants,maskW4Q[0][2]) + xMask * 8);
1599				value &= *Pointer<Int>(constants + OFFSET(Constants,invMaskW4Q[0][2]) + xMask * 8);
1600				c23 |= value;
1601				*Pointer<Int>(buffer) = c23;
1602			}
1603			break;
1604		case FORMAT_A8G8R8B8Q:
1605		case FORMAT_X8G8R8B8Q:   // FIXME: Don't touch alpha?
1606			UNIMPLEMENTED();
1607		//	value = *Pointer<Short4>(cBuffer + 8 * x + 0);
1608
1609		//	if((state.targetFormat[index] == FORMAT_A8G8R8B8Q && bgraWriteMask != 0x0000000F) ||
1610		//	   ((state.targetFormat[index] == FORMAT_X8G8R8B8Q && bgraWriteMask != 0x00000007) &&
1611		//	    (state.targetFormat[index] == FORMAT_X8G8R8B8Q && bgraWriteMask != 0x0000000F)))   // FIXME: Need for masking when XRGB && Fh?
1612		//	{
1613		//		Short4 masked = value;
1614		//		c01 &= *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q[bgraWriteMask][0]));
1615		//		masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskB4Q[bgraWriteMask][0]));
1616		//		c01 |= masked;
1617		//	}
1618
1619		//	c01 &= *Pointer<Short4>(constants + OFFSET(Constants,maskD01Q) + xMask * 8);
1620		//	value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD01Q) + xMask * 8);
1621		//	c01 |= value;
1622		//	*Pointer<Short4>(cBuffer + 8 * x + 0) = c01;
1623
1624		//	value = *Pointer<Short4>(cBuffer + 8 * x + 8);
1625
1626		//	if((state.targetFormat[index] == FORMAT_A8G8R8B8Q && bgraWriteMask != 0x0000000F) ||
1627		//	   ((state.targetFormat[index] == FORMAT_X8G8R8B8Q && bgraWriteMask != 0x00000007) &&
1628		//	    (state.targetFormat[index] == FORMAT_X8G8R8B8Q && bgraWriteMask != 0x0000000F)))   // FIXME: Need for masking when XRGB && Fh?
1629		//	{
1630		//		Short4 masked = value;
1631		//		c23 &= *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q[bgraWriteMask][0]));
1632		//		masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskB4Q[bgraWriteMask][0]));
1633		//		c23 |= masked;
1634		//	}
1635
1636		//	c23 &= *Pointer<Short4>(constants + OFFSET(Constants,maskD23Q) + xMask * 8);
1637		//	value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD23Q) + xMask * 8);
1638		//	c23 |= value;
1639		//	*Pointer<Short4>(cBuffer + 8 * x + 8) = c23;
1640			break;
1641		case FORMAT_A8R8G8B8:
1642		case FORMAT_X8R8G8B8:   // FIXME: Don't touch alpha?
1643			{
1644				Pointer<Byte> buffer = cBuffer + x * 4;
1645				Short4 value = *Pointer<Short4>(buffer);
1646
1647				if((state.targetFormat[index] == FORMAT_A8R8G8B8 && bgraWriteMask != 0x0000000F) ||
1648				   ((state.targetFormat[index] == FORMAT_X8R8G8B8 && bgraWriteMask != 0x00000007) &&
1649					(state.targetFormat[index] == FORMAT_X8R8G8B8 && bgraWriteMask != 0x0000000F)))   // FIXME: Need for masking when XRGB && Fh?
1650				{
1651					Short4 masked = value;
1652					c01 &= *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q[bgraWriteMask][0]));
1653					masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskB4Q[bgraWriteMask][0]));
1654					c01 |= masked;
1655				}
1656
1657				c01 &= *Pointer<Short4>(constants + OFFSET(Constants,maskD01Q) + xMask * 8);
1658				value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD01Q) + xMask * 8);
1659				c01 |= value;
1660				*Pointer<Short4>(buffer) = c01;
1661
1662				buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
1663				value = *Pointer<Short4>(buffer);
1664
1665				if((state.targetFormat[index] == FORMAT_A8R8G8B8 && bgraWriteMask != 0x0000000F) ||
1666				   ((state.targetFormat[index] == FORMAT_X8R8G8B8 && bgraWriteMask != 0x00000007) &&
1667					(state.targetFormat[index] == FORMAT_X8R8G8B8 && bgraWriteMask != 0x0000000F)))   // FIXME: Need for masking when XRGB && Fh?
1668				{
1669					Short4 masked = value;
1670					c23 &= *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q[bgraWriteMask][0]));
1671					masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskB4Q[bgraWriteMask][0]));
1672					c23 |= masked;
1673				}
1674
1675				c23 &= *Pointer<Short4>(constants + OFFSET(Constants,maskD23Q) + xMask * 8);
1676				value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD23Q) + xMask * 8);
1677				c23 |= value;
1678				*Pointer<Short4>(buffer) = c23;
1679			}
1680			break;
1681		case FORMAT_A8B8G8R8:
1682		case FORMAT_X8B8G8R8:   // FIXME: Don't touch alpha?
1683		case FORMAT_SRGB8_X8:
1684		case FORMAT_SRGB8_A8:
1685			{
1686				Pointer<Byte> buffer = cBuffer + x * 4;
1687				Short4 value = *Pointer<Short4>(buffer);
1688
1689				bool masked = (((state.targetFormat[index] == FORMAT_A8B8G8R8 || state.targetFormat[index] == FORMAT_SRGB8_A8) && rgbaWriteMask != 0x0000000F) ||
1690				              (((state.targetFormat[index] == FORMAT_X8B8G8R8 || state.targetFormat[index] == FORMAT_SRGB8_X8) && rgbaWriteMask != 0x00000007) &&
1691				               ((state.targetFormat[index] == FORMAT_X8B8G8R8 || state.targetFormat[index] == FORMAT_SRGB8_X8) && rgbaWriteMask != 0x0000000F))); // FIXME: Need for masking when XBGR && Fh?
1692
1693				if(masked)
1694				{
1695					Short4 masked = value;
1696					c01 &= *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q[rgbaWriteMask][0]));
1697					masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskB4Q[rgbaWriteMask][0]));
1698					c01 |= masked;
1699				}
1700
1701				c01 &= *Pointer<Short4>(constants + OFFSET(Constants,maskD01Q) + xMask * 8);
1702				value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD01Q) + xMask * 8);
1703				c01 |= value;
1704				*Pointer<Short4>(buffer) = c01;
1705
1706				buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
1707				value = *Pointer<Short4>(buffer);
1708
1709				if(masked)
1710				{
1711					Short4 masked = value;
1712					c23 &= *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q[rgbaWriteMask][0]));
1713					masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskB4Q[rgbaWriteMask][0]));
1714					c23 |= masked;
1715				}
1716
1717				c23 &= *Pointer<Short4>(constants + OFFSET(Constants,maskD23Q) + xMask * 8);
1718				value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD23Q) + xMask * 8);
1719				c23 |= value;
1720				*Pointer<Short4>(buffer) = c23;
1721			}
1722			break;
1723		case FORMAT_G8R8:
1724			if((rgbaWriteMask & 0x00000003) != 0x0)
1725			{
1726				Pointer<Byte> buffer = cBuffer + 2 * x;
1727				Int2 value;
1728				value = Insert(value, *Pointer<Int>(buffer), 0);
1729				Int pitch = *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1730				value = Insert(value, *Pointer<Int>(buffer + pitch), 1);
1731
1732				Int2 packedCol = As<Int2>(current.x);
1733
1734				UInt2 mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskW4Q) + xMask * 8);
1735				if((rgbaWriteMask & 0x3) != 0x3)
1736				{
1737					Int tmpMask = *Pointer<Int>(constants + OFFSET(Constants, maskB4Q[5 * (rgbaWriteMask & 0x3)][0]));
1738					UInt2 rgbaMask = As<UInt2>(Int2(tmpMask, tmpMask));
1739					mergedMask &= rgbaMask;
1740				}
1741
1742				packedCol = As<Int2>((As<UInt2>(packedCol) & mergedMask) | (As<UInt2>(value) & ~mergedMask));
1743
1744				*Pointer<UInt>(buffer) = As<UInt>(Extract(packedCol, 0));
1745				*Pointer<UInt>(buffer + pitch) = As<UInt>(Extract(packedCol, 1));
1746			}
1747			break;
1748		case FORMAT_R8:
1749			if(rgbaWriteMask & 0x00000001)
1750			{
1751				Pointer<Byte> buffer = cBuffer + 1 * x;
1752				Short4 value;
1753				value = Insert(value, *Pointer<Short>(buffer), 0);
1754				Int pitch = *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1755				value = Insert(value, *Pointer<Short>(buffer + pitch), 1);
1756				value = UnpackLow(As<Byte8>(value), As<Byte8>(value));
1757
1758				current.x &= *Pointer<Short4>(constants + OFFSET(Constants, maskB4Q) + 8 * xMask);
1759				value &= *Pointer<Short4>(constants + OFFSET(Constants, invMaskB4Q) + 8 * xMask);
1760				current.x |= value;
1761
1762				*Pointer<Short>(buffer) = Extract(current.x, 0);
1763				*Pointer<Short>(buffer + pitch) = Extract(current.x, 1);
1764			}
1765			break;
1766		case FORMAT_A8:
1767			if(rgbaWriteMask & 0x00000008)
1768			{
1769				Pointer<Byte> buffer = cBuffer + 1 * x;
1770				Short4 value;
1771				value = Insert(value, *Pointer<Short>(buffer), 0);
1772				Int pitch = *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
1773				value = Insert(value, *Pointer<Short>(buffer + pitch), 1);
1774				value = UnpackLow(As<Byte8>(value), As<Byte8>(value));
1775
1776				current.w &= *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q) + 8 * xMask);
1777				value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskB4Q) + 8 * xMask);
1778				current.w |= value;
1779
1780				*Pointer<Short>(buffer) = Extract(current.w, 0);
1781				*Pointer<Short>(buffer + pitch) = Extract(current.w, 1);
1782			}
1783			break;
1784		case FORMAT_G16R16:
1785			{
1786				Pointer<Byte> buffer = cBuffer + 4 * x;
1787
1788				Short4 value = *Pointer<Short4>(buffer);
1789
1790				if((rgbaWriteMask & 0x00000003) != 0x00000003)
1791				{
1792					Short4 masked = value;
1793					current.x &= *Pointer<Short4>(constants + OFFSET(Constants,maskW01Q[rgbaWriteMask & 0x3][0]));
1794					masked &= *Pointer<Short4>(constants + OFFSET(Constants,maskW01Q[~rgbaWriteMask & 0x3][0]));
1795					current.x |= masked;
1796				}
1797
1798				current.x &= *Pointer<Short4>(constants + OFFSET(Constants,maskD01Q) + xMask * 8);
1799				value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD01Q) + xMask * 8);
1800				current.x |= value;
1801				*Pointer<Short4>(buffer) = current.x;
1802
1803				buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
1804
1805				value = *Pointer<Short4>(buffer);
1806
1807				if((rgbaWriteMask & 0x00000003) != 0x00000003)
1808				{
1809					Short4 masked = value;
1810					current.y &= *Pointer<Short4>(constants + OFFSET(Constants,maskW01Q[rgbaWriteMask & 0x3][0]));
1811					masked &= *Pointer<Short4>(constants + OFFSET(Constants,maskW01Q[~rgbaWriteMask & 0x3][0]));
1812					current.y |= masked;
1813				}
1814
1815				current.y &= *Pointer<Short4>(constants + OFFSET(Constants,maskD23Q) + xMask * 8);
1816				value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD23Q) + xMask * 8);
1817				current.y |= value;
1818				*Pointer<Short4>(buffer) = current.y;
1819			}
1820			break;
1821		case FORMAT_A16B16G16R16:
1822			{
1823				Pointer<Byte> buffer = cBuffer + 8 * x;
1824
1825				{
1826					Short4 value = *Pointer<Short4>(buffer);
1827
1828					if(rgbaWriteMask != 0x0000000F)
1829					{
1830						Short4 masked = value;
1831						current.x &= *Pointer<Short4>(constants + OFFSET(Constants,maskW4Q[rgbaWriteMask][0]));
1832						masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskW4Q[rgbaWriteMask][0]));
1833						current.x |= masked;
1834					}
1835
1836					current.x &= *Pointer<Short4>(constants + OFFSET(Constants,maskQ0Q) + xMask * 8);
1837					value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskQ0Q) + xMask * 8);
1838					current.x |= value;
1839					*Pointer<Short4>(buffer) = current.x;
1840				}
1841
1842				{
1843					Short4 value = *Pointer<Short4>(buffer + 8);
1844
1845					if(rgbaWriteMask != 0x0000000F)
1846					{
1847						Short4 masked = value;
1848						current.y &= *Pointer<Short4>(constants + OFFSET(Constants,maskW4Q[rgbaWriteMask][0]));
1849						masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskW4Q[rgbaWriteMask][0]));
1850						current.y |= masked;
1851					}
1852
1853					current.y &= *Pointer<Short4>(constants + OFFSET(Constants,maskQ1Q) + xMask * 8);
1854					value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskQ1Q) + xMask * 8);
1855					current.y |= value;
1856					*Pointer<Short4>(buffer + 8) = current.y;
1857				}
1858
1859				buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
1860
1861				{
1862					Short4 value = *Pointer<Short4>(buffer);
1863
1864					if(rgbaWriteMask != 0x0000000F)
1865					{
1866						Short4 masked = value;
1867						current.z &= *Pointer<Short4>(constants + OFFSET(Constants,maskW4Q[rgbaWriteMask][0]));
1868						masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskW4Q[rgbaWriteMask][0]));
1869						current.z |= masked;
1870					}
1871
1872					current.z &= *Pointer<Short4>(constants + OFFSET(Constants,maskQ2Q) + xMask * 8);
1873					value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskQ2Q) + xMask * 8);
1874					current.z |= value;
1875					*Pointer<Short4>(buffer) = current.z;
1876				}
1877
1878				{
1879					Short4 value = *Pointer<Short4>(buffer + 8);
1880
1881					if(rgbaWriteMask != 0x0000000F)
1882					{
1883						Short4 masked = value;
1884						current.w &= *Pointer<Short4>(constants + OFFSET(Constants,maskW4Q[rgbaWriteMask][0]));
1885						masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskW4Q[rgbaWriteMask][0]));
1886						current.w |= masked;
1887					}
1888
1889					current.w &= *Pointer<Short4>(constants + OFFSET(Constants,maskQ3Q) + xMask * 8);
1890					value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskQ3Q) + xMask * 8);
1891					current.w |= value;
1892					*Pointer<Short4>(buffer + 8) = current.w;
1893				}
1894			}
1895			break;
1896		default:
1897			ASSERT(false);
1898		}
1899	}
1900
1901	void PixelRoutine::blendFactor(Vector4f &blendFactor, const Vector4f &oC, const Vector4f &pixel, BlendFactor blendFactorActive)
1902	{
1903		switch(blendFactorActive)
1904		{
1905		case BLEND_ZERO:
1906			// Optimized
1907			break;
1908		case BLEND_ONE:
1909			// Optimized
1910			break;
1911		case BLEND_SOURCE:
1912			blendFactor.x = oC.x;
1913			blendFactor.y = oC.y;
1914			blendFactor.z = oC.z;
1915			break;
1916		case BLEND_INVSOURCE:
1917			blendFactor.x = Float4(1.0f) - oC.x;
1918			blendFactor.y = Float4(1.0f) - oC.y;
1919			blendFactor.z = Float4(1.0f) - oC.z;
1920			break;
1921		case BLEND_DEST:
1922			blendFactor.x = pixel.x;
1923			blendFactor.y = pixel.y;
1924			blendFactor.z = pixel.z;
1925			break;
1926		case BLEND_INVDEST:
1927			blendFactor.x = Float4(1.0f) - pixel.x;
1928			blendFactor.y = Float4(1.0f) - pixel.y;
1929			blendFactor.z = Float4(1.0f) - pixel.z;
1930			break;
1931		case BLEND_SOURCEALPHA:
1932			blendFactor.x = oC.w;
1933			blendFactor.y = oC.w;
1934			blendFactor.z = oC.w;
1935			break;
1936		case BLEND_INVSOURCEALPHA:
1937			blendFactor.x = Float4(1.0f) - oC.w;
1938			blendFactor.y = Float4(1.0f) - oC.w;
1939			blendFactor.z = Float4(1.0f) - oC.w;
1940			break;
1941		case BLEND_DESTALPHA:
1942			blendFactor.x = pixel.w;
1943			blendFactor.y = pixel.w;
1944			blendFactor.z = pixel.w;
1945			break;
1946		case BLEND_INVDESTALPHA:
1947			blendFactor.x = Float4(1.0f) - pixel.w;
1948			blendFactor.y = Float4(1.0f) - pixel.w;
1949			blendFactor.z = Float4(1.0f) - pixel.w;
1950			break;
1951		case BLEND_SRCALPHASAT:
1952			blendFactor.x = Float4(1.0f) - pixel.w;
1953			blendFactor.x = Min(blendFactor.x, oC.w);
1954			blendFactor.y = blendFactor.x;
1955			blendFactor.z = blendFactor.x;
1956			break;
1957		case BLEND_CONSTANT:
1958			blendFactor.x = *Pointer<Float4>(data + OFFSET(DrawData,factor.blendConstant4F[0]));
1959			blendFactor.y = *Pointer<Float4>(data + OFFSET(DrawData,factor.blendConstant4F[1]));
1960			blendFactor.z = *Pointer<Float4>(data + OFFSET(DrawData,factor.blendConstant4F[2]));
1961			break;
1962		case BLEND_INVCONSTANT:
1963			blendFactor.x = *Pointer<Float4>(data + OFFSET(DrawData,factor.invBlendConstant4F[0]));
1964			blendFactor.y = *Pointer<Float4>(data + OFFSET(DrawData,factor.invBlendConstant4F[1]));
1965			blendFactor.z = *Pointer<Float4>(data + OFFSET(DrawData,factor.invBlendConstant4F[2]));
1966			break;
1967		default:
1968			ASSERT(false);
1969		}
1970	}
1971
1972	void PixelRoutine::blendFactorAlpha(Vector4f &blendFactor, const Vector4f &oC, const Vector4f &pixel, BlendFactor blendFactorAlphaActive)
1973	{
1974		switch(blendFactorAlphaActive)
1975		{
1976		case BLEND_ZERO:
1977			// Optimized
1978			break;
1979		case BLEND_ONE:
1980			// Optimized
1981			break;
1982		case BLEND_SOURCE:
1983			blendFactor.w = oC.w;
1984			break;
1985		case BLEND_INVSOURCE:
1986			blendFactor.w = Float4(1.0f) - oC.w;
1987			break;
1988		case BLEND_DEST:
1989			blendFactor.w = pixel.w;
1990			break;
1991		case BLEND_INVDEST:
1992			blendFactor.w = Float4(1.0f) - pixel.w;
1993			break;
1994		case BLEND_SOURCEALPHA:
1995			blendFactor.w = oC.w;
1996			break;
1997		case BLEND_INVSOURCEALPHA:
1998			blendFactor.w = Float4(1.0f) - oC.w;
1999			break;
2000		case BLEND_DESTALPHA:
2001			blendFactor.w = pixel.w;
2002			break;
2003		case BLEND_INVDESTALPHA:
2004			blendFactor.w = Float4(1.0f) - pixel.w;
2005			break;
2006		case BLEND_SRCALPHASAT:
2007			blendFactor.w = Float4(1.0f);
2008			break;
2009		case BLEND_CONSTANT:
2010			blendFactor.w = *Pointer<Float4>(data + OFFSET(DrawData,factor.blendConstant4F[3]));
2011			break;
2012		case BLEND_INVCONSTANT:
2013			blendFactor.w = *Pointer<Float4>(data + OFFSET(DrawData,factor.invBlendConstant4F[3]));
2014			break;
2015		default:
2016			ASSERT(false);
2017		}
2018	}
2019
2020	void PixelRoutine::alphaBlend(int index, Pointer<Byte> &cBuffer, Vector4f &oC, Int &x)
2021	{
		// Blends the source color oC with the color currently stored in render target
		// 'index'. The destination is read from cBuffer at column x, and from the row
		// colorPitchB[index] bytes further. Blend factors and operations come from 'state'.
		// The blended result is left in oC; nothing is stored to cBuffer here.
		// Returns immediately when alpha blending is not active.
2022		if(!state.alphaBlendActive)
2023		{
2024			return;
2025		}
2026
2027		Pointer<Byte> buffer;
2028		Vector4f pixel;
2029
		// NOTE(review): color, c01 and c23 are not referenced anywhere else in this function.
2030		Vector4s color;
2031		Short4 c01;
2032		Short4 c23;
2033
		// Value used to fill the channels which the target format does not store:
		// 1.0f for floating-point formats; for non-normalized integer formats the bit
		// pattern 0xFFFFFFFF (unsigned) or 0x7FFFFFFF (signed), reinterpreted as float.
		// For any other format 'one' is left default-constructed.
2034		Float4 one;
2035		if(Surface::isFloatFormat(state.targetFormat[index]))
2036		{
2037			one = Float4(1.0f);
2038		}
2039		else if(Surface::isNonNormalizedInteger(state.targetFormat[index]))
2040		{
2041			one = As<Float4>(Surface::isUnsignedComponent(state.targetFormat[index], 0) ? Int4(0xFFFFFFFF) : Int4(0x7FFFFFFF));
2042		}
2043
		// Read the destination texels into pixel.x/y/z/w, one register per channel.
		// Only 32-bit-per-channel formats are handled; anything else asserts.
2044		switch(state.targetFormat[index])
2045		{
2046		case FORMAT_R32I:
2047		case FORMAT_R32UI:
2048		case FORMAT_R32F:
			// Single channel: two adjacent texels from the first row, two from the next row
2049			buffer = cBuffer;
2050			// FIXME: movlps
2051			pixel.x.x = *Pointer<Float>(buffer + 4 * x + 0);
2052			pixel.x.y = *Pointer<Float>(buffer + 4 * x + 4);
2053			buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
2054			// FIXME: movhps
2055			pixel.x.z = *Pointer<Float>(buffer + 4 * x + 0);
2056			pixel.x.w = *Pointer<Float>(buffer + 4 * x + 4);
2057			pixel.y = pixel.z = pixel.w = one;
2058			break;
2059		case FORMAT_G32R32I:
2060		case FORMAT_G32R32UI:
2061		case FORMAT_G32R32F:
			// Two channels: each 16-byte load covers two adjacent texels of one row.
			// NOTE(review): the 0x88 / 0xDD shuffles presumably gather elements 0,2 and 1,3
			// of both rows, i.e. de-interleave the two channels - confirm against ShuffleLowHigh.
2062			buffer = cBuffer;
2063			pixel.x = *Pointer<Float4>(buffer + 8 * x, 16);
2064			buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
2065			pixel.y = *Pointer<Float4>(buffer + 8 * x, 16);
2066			pixel.z = pixel.x;
2067			pixel.x = ShuffleLowHigh(pixel.x, pixel.y, 0x88);
2068			pixel.z = ShuffleLowHigh(pixel.z, pixel.y, 0xDD);
2069			pixel.y = pixel.z;
2070			pixel.z = pixel.w = one;
2071			break;
2072		case FORMAT_X32B32G32R32F:
2073		case FORMAT_A32B32G32R32F:
2074		case FORMAT_A32B32G32R32I:
2075		case FORMAT_A32B32G32R32UI:
			// Four channels: one 16-byte load per texel (two per row), then a transpose
			// so that each of pixel.x/y/z/w holds one channel of all four texels
2076			buffer = cBuffer;
2077			pixel.x = *Pointer<Float4>(buffer + 16 * x, 16);
2078			pixel.y = *Pointer<Float4>(buffer + 16 * x + 16, 16);
2079			buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
2080			pixel.z = *Pointer<Float4>(buffer + 16 * x, 16);
2081			pixel.w = *Pointer<Float4>(buffer + 16 * x + 16, 16);
2082			transpose4x4(pixel.x, pixel.y, pixel.z, pixel.w);
2083			if(state.targetFormat[index] == FORMAT_X32B32G32R32F)
2084			{
				// The fourth channel of the X32 format is replaced by 1.0f
2085				pixel.w = Float4(1.0f);
2086			}
2087			break;
2088		default:
2089			ASSERT(false);
2090		}
2091
		// NOTE(review): the sRGBtoLinear() defined in this file takes a const Float4 & and
		// returns the converted value. The return values are discarded here, so with that
		// definition these calls leave pixel unchanged. Confirm whether an assignment
		// (pixel.x = sRGBtoLinear(pixel.x), ...) was intended, and whether the result would
		// then be converted back before it is written.
2092		if((postBlendSRGB && state.writeSRGB) || isSRGB(index))
2093		{
2094			sRGBtoLinear(pixel.x);
2095			sRGBtoLinear(pixel.y);
2096			sRGBtoLinear(pixel.z);
2097		}
2098
2099		// Final Color = ObjectColor * SourceBlendFactor + PixelColor * DestinationBlendFactor
2100		Vector4f sourceFactor;
2101		Vector4f destFactor;
2102
2103		blendFactor(sourceFactor, oC, pixel, state.sourceBlendFactor);
2104		blendFactor(destFactor, oC, pixel, state.destBlendFactor);
2105
		// blendFactor() writes nothing for BLEND_ONE and BLEND_ZERO, so the
		// multiplication is skipped for those factors
2106		if(state.sourceBlendFactor != BLEND_ONE && state.sourceBlendFactor != BLEND_ZERO)
2107		{
2108			oC.x *= sourceFactor.x;
2109			oC.y *= sourceFactor.y;
2110			oC.z *= sourceFactor.z;
2111		}
2112
2113		if(state.destBlendFactor != BLEND_ONE && state.destBlendFactor != BLEND_ZERO)
2114		{
2115			pixel.x *= destFactor.x;
2116			pixel.y *= destFactor.y;
2117			pixel.z *= destFactor.z;
2118		}
2119
		// Combine the weighted source and destination colors (x, y, z only)
2120		switch(state.blendOperation)
2121		{
2122		case BLENDOP_ADD:
2123			oC.x += pixel.x;
2124			oC.y += pixel.y;
2125			oC.z += pixel.z;
2126			break;
2127		case BLENDOP_SUB:
2128			oC.x -= pixel.x;
2129			oC.y -= pixel.y;
2130			oC.z -= pixel.z;
2131			break;
2132		case BLENDOP_INVSUB:
2133			oC.x = pixel.x - oC.x;
2134			oC.y = pixel.y - oC.y;
2135			oC.z = pixel.z - oC.z;
2136			break;
2137		case BLENDOP_MIN:
2138			oC.x = Min(oC.x, pixel.x);
2139			oC.y = Min(oC.y, pixel.y);
2140			oC.z = Min(oC.z, pixel.z);
2141			break;
2142		case BLENDOP_MAX:
2143			oC.x = Max(oC.x, pixel.x);
2144			oC.y = Max(oC.y, pixel.y);
2145			oC.z = Max(oC.z, pixel.z);
2146			break;
2147		case BLENDOP_SOURCE:
2148			// No operation
2149			break;
2150		case BLENDOP_DEST:
2151			oC.x = pixel.x;
2152			oC.y = pixel.y;
2153			oC.z = pixel.z;
2154			break;
2155		case BLENDOP_NULL:
2156			oC.x = Float4(0.0f);
2157			oC.y = Float4(0.0f);
2158			oC.z = Float4(0.0f);
2159			break;
2160		default:
2161			ASSERT(false);
2162		}
2163
		// The alpha channel (w) is blended separately, with its own factors and operation.
		// oC.w and pixel.w have not been modified by the color blending above.
2164		blendFactorAlpha(sourceFactor, oC, pixel, state.sourceBlendFactorAlpha);
2165		blendFactorAlpha(destFactor, oC, pixel, state.destBlendFactorAlpha);
2166
2167		if(state.sourceBlendFactorAlpha != BLEND_ONE && state.sourceBlendFactorAlpha != BLEND_ZERO)
2168		{
2169			oC.w *= sourceFactor.w;
2170		}
2171
2172		if(state.destBlendFactorAlpha != BLEND_ONE && state.destBlendFactorAlpha != BLEND_ZERO)
2173		{
2174			pixel.w *= destFactor.w;
2175		}
2176
2177		switch(state.blendOperationAlpha)
2178		{
2179		case BLENDOP_ADD:
2180			oC.w += pixel.w;
2181			break;
2182		case BLENDOP_SUB:
2183			oC.w -= pixel.w;
2184			break;
2185		case BLENDOP_INVSUB:
			// pixel.w is overwritten here; it is not read again afterwards
2186			pixel.w -= oC.w;
2187			oC.w = pixel.w;
2188			break;
2189		case BLENDOP_MIN:
2190			oC.w = Min(oC.w, pixel.w);
2191			break;
2192		case BLENDOP_MAX:
2193			oC.w = Max(oC.w, pixel.w);
2194			break;
2195		case BLENDOP_SOURCE:
2196			// No operation
2197			break;
2198		case BLENDOP_DEST:
2199			oC.w = pixel.w;
2200			break;
2201		case BLENDOP_NULL:
2202			oC.w = Float4(0.0f);
2203			break;
2204		default:
2205			ASSERT(false);
2206		}
2207	}
2208
2209	void PixelRoutine::writeColor(int index, Pointer<Byte> &cBuffer, Int &x, Vector4f &oC, Int &sMask, Int &zMask, Int &cMask)
2210	{
		// Stores the four pixels held in oC to render target 'index': two adjacent texels
		// at column x of the row at cBuffer and two of the row colorPitchB[index] bytes
		// further. Channels disabled in the color write mask, and pixels not selected by
		// the mask combined from cMask/zMask/sMask, keep their current destination value.
		// oC is modified in the process. Unsupported target formats assert.

		// Rearrange oC from one-register-per-channel into the layout that is stored below
2211		switch(state.targetFormat[index])
2212		{
2213		case FORMAT_R32F:
2214		case FORMAT_R32I:
2215		case FORMAT_R32UI:
2216		case FORMAT_R16I:
2217		case FORMAT_R16UI:
2218		case FORMAT_R8I:
2219		case FORMAT_R8UI:
			// Single channel: oC.x is used as is
2220			break;
2221		case FORMAT_G32R32F:
2222		case FORMAT_G32R32I:
2223		case FORMAT_G32R32UI:
2224		case FORMAT_G16R16I:
2225		case FORMAT_G16R16UI:
2226		case FORMAT_G8R8I:
2227		case FORMAT_G8R8UI:
			// Two channels: oC.x and oC.y are combined with UnpackLow/UnpackHigh (presumably
			// interleaving them per pixel - confirm against Reactor). Afterwards oC.x is
			// what gets stored to the first row and oC.y to the second row.
2228			oC.z = oC.x;
2229			oC.x = UnpackLow(oC.x, oC.y);
2230			oC.z = UnpackHigh(oC.z, oC.y);
2231			oC.y = oC.z;
2232			break;
2233		case FORMAT_X32B32G32R32F:
2234		case FORMAT_A32B32G32R32F:
2235		case FORMAT_A32B32G32R32I:
2236		case FORMAT_A32B32G32R32UI:
2237		case FORMAT_A16B16G16R16I:
2238		case FORMAT_A16B16G16R16UI:
2239		case FORMAT_A8B8G8R8I:
2240		case FORMAT_A8B8G8R8UI:
			// Four channels: after the transpose oC.x and oC.y are stored as the two texels
			// of the first row, oC.z and oC.w as those of the second row
2241			transpose4x4(oC.x, oC.y, oC.z, oC.w);
2242			break;
2243		default:
2244			ASSERT(false);
2245		}
2246
		// Per-channel write enable bits. Below, bit 0 is tested for single-channel formats,
		// bits 0-1 for two-channel formats and bits 0-3 for four-channel formats.
2247		int rgbaWriteMask = state.colorWriteActive(index);
2248
2249		Int xMask;   // Combination of all masks
2250
		// Start from the depth mask when depth testing is active, else from the coverage mask
2251		if(state.depthTestActive)
2252		{
2253			xMask = zMask;
2254		}
2255		else
2256		{
2257			xMask = cMask;
2258		}
2259
2260		if(state.stencilActive)
2261		{
2262			xMask &= sMask;
2263		}
2264
2265		Pointer<Byte> buffer;
2266		Float4 value;
2267
		// In every case below xMask is used as an index into tables in 'constants' that
		// provide a lane mask and (where used) its inverse for merging source and destination
2268		switch(state.targetFormat[index])
2269		{
2270		case FORMAT_R32F:
2271		case FORMAT_R32I:
2272		case FORMAT_R32UI:
2273			if(rgbaWriteMask & 0x00000001)
2274			{
2275				buffer = cBuffer + 4 * x;
2276
				// Read the current 2x2 block: two texels from this row...
2277				// FIXME: movlps
2278				value.x = *Pointer<Float>(buffer + 0);
2279				value.y = *Pointer<Float>(buffer + 4);
2280
2281				buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
2282
				// ...and two from the next row
2283				// FIXME: movhps
2284				value.z = *Pointer<Float>(buffer + 0);
2285				value.w = *Pointer<Float>(buffer + 4);
2286
				// Bitwise select: source where the mask is set, destination elsewhere
2287				oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X) + xMask * 16, 16));
2288				value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X) + xMask * 16, 16));
2289				oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(value));
2290
				// buffer still points at the second row, so that row is stored first
2291				// FIXME: movhps
2292				*Pointer<Float>(buffer + 0) = oC.x.z;
2293				*Pointer<Float>(buffer + 4) = oC.x.w;
2294
2295				buffer -= *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
2296
2297				// FIXME: movlps
2298				*Pointer<Float>(buffer + 0) = oC.x.x;
2299				*Pointer<Float>(buffer + 4) = oC.x.y;
2300			}
2301			break;
2302		case FORMAT_R16I:
2303		case FORMAT_R16UI:
2304			if(rgbaWriteMask & 0x00000001)
2305			{
2306				buffer = cBuffer + 2 * x;
2307
				// Each 32-bit load fetches two adjacent 16-bit texels of one row.
				// NOTE(review): xyzw is passed to the first Insert before it has been assigned;
				// both of its 32-bit halves are then replaced by the two inserts.
2308				UShort4 xyzw;
2309				xyzw = As<UShort4>(Insert(As<Int2>(xyzw), *Pointer<Int>(buffer), 0));
2310
2311				buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
2312
2313				xyzw = As<UShort4>(Insert(As<Int2>(xyzw), *Pointer<Int>(buffer), 1));
				// Widen the four 16-bit destination texels to one 32-bit lane each
2314				value = As<Float4>(Int4(xyzw));
2315
2316				oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants, maskD4X) + xMask * 16, 16));
2317				value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskD4X) + xMask * 16, 16));
2318				oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(value));
2319
				// Each lane's bits are reinterpreted as an integer and converted to a 16-bit
				// value for the store (presumably by truncation - confirm). The second row is
				// written first because buffer still points at it.
2320				if(state.targetFormat[index] == FORMAT_R16I)
2321				{
2322					Float component = oC.x.z;
2323					*Pointer<Short>(buffer + 0) = Short(As<Int>(component));
2324					component = oC.x.w;
2325					*Pointer<Short>(buffer + 2) = Short(As<Int>(component));
2326
2327					buffer -= *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
2328
2329					component = oC.x.x;
2330					*Pointer<Short>(buffer + 0) = Short(As<Int>(component));
2331					component = oC.x.y;
2332					*Pointer<Short>(buffer + 2) = Short(As<Int>(component));
2333				}
2334				else // FORMAT_R16UI
2335				{
2336					Float component = oC.x.z;
2337					*Pointer<UShort>(buffer + 0) = UShort(As<Int>(component));
2338					component = oC.x.w;
2339					*Pointer<UShort>(buffer + 2) = UShort(As<Int>(component));
2340
2341					buffer -= *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
2342
2343					component = oC.x.x;
2344					*Pointer<UShort>(buffer + 0) = UShort(As<Int>(component));
2345					component = oC.x.y;
2346					*Pointer<UShort>(buffer + 2) = UShort(As<Int>(component));
2347				}
2348			}
2349			break;
2350		case FORMAT_R8I:
2351		case FORMAT_R8UI:
2352			if(rgbaWriteMask & 0x00000001)
2353			{
2354				buffer = cBuffer + x;
2355
2356				UInt xyzw, packedCol;
2357
				// Two adjacent bytes from each row, combined into one 32-bit value
				// (first row in the low half, second row in the high half)
2358				xyzw = UInt(*Pointer<UShort>(buffer)) & 0xFFFF;
2359				buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
2360				xyzw |= UInt(*Pointer<UShort>(buffer)) << 16;
2361
				// Narrow the four lanes of oC.x to bytes. Pack is applied to signed or unsigned
				// shorts depending on the format (presumably saturating accordingly - confirm).
2362				Short4 tmpCol = Short4(As<Int4>(oC.x));
2363				if(state.targetFormat[index] == FORMAT_R8I)
2364				{
2365					tmpCol = As<Short4>(Pack(tmpCol, tmpCol));
2366				}
2367				else
2368				{
2369					tmpCol = As<Short4>(Pack(As<UShort4>(tmpCol), As<UShort4>(tmpCol)));
2370				}
2371				packedCol = Extract(As<Int2>(tmpCol), 0);
2372
2373				packedCol = (packedCol & *Pointer<UInt>(constants + OFFSET(Constants, maskB4Q) + 8 * xMask)) |
2374				            (xyzw & *Pointer<UInt>(constants + OFFSET(Constants, invMaskB4Q) + 8 * xMask));
2375
				// High half goes to the second row (where buffer points), low half to the first
2376				*Pointer<UShort>(buffer) = UShort(packedCol >> 16);
2377				buffer -= *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
2378				*Pointer<UShort>(buffer) = UShort(packedCol);
2379			}
2380			break;
2381		case FORMAT_G32R32F:
2382		case FORMAT_G32R32I:
2383		case FORMAT_G32R32UI:
2384			buffer = cBuffer + 8 * x;
2385
			// First row: two texels of two 32-bit channels each
2386			value = *Pointer<Float4>(buffer);
2387
2388			if((rgbaWriteMask & 0x00000003) != 0x00000003)
2389			{
				// Not both channels are written: keep the source only in the enabled channels
				// and take the others from the destination. maskD01X is indexed with the write
				// bits for the source and with their complement for the destination.
2390				Float4 masked = value;
2391				oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants,maskD01X[rgbaWriteMask & 0x3][0])));
2392				masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,maskD01X[~rgbaWriteMask & 0x3][0])));
2393				oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(masked));
2394			}
2395
			// Pixels not selected by xMask keep their destination value
2396			oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants,maskQ01X) + xMask * 16, 16));
2397			value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskQ01X) + xMask * 16, 16));
2398			oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(value));
2399			*Pointer<Float4>(buffer) = oC.x;
2400
2401			buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
2402
			// Second row: same procedure with oC.y and the maskQ23X tables
2403			value = *Pointer<Float4>(buffer);
2404
2405			if((rgbaWriteMask & 0x00000003) != 0x00000003)
2406			{
2407				Float4 masked;
2408
2409				masked = value;
2410				oC.y = As<Float4>(As<Int4>(oC.y) & *Pointer<Int4>(constants + OFFSET(Constants,maskD01X[rgbaWriteMask & 0x3][0])));
2411				masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,maskD01X[~rgbaWriteMask & 0x3][0])));
2412				oC.y = As<Float4>(As<Int4>(oC.y) | As<Int4>(masked));
2413			}
2414
2415			oC.y = As<Float4>(As<Int4>(oC.y) & *Pointer<Int4>(constants + OFFSET(Constants,maskQ23X) + xMask * 16, 16));
2416			value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskQ23X) + xMask * 16, 16));
2417			oC.y = As<Float4>(As<Int4>(oC.y) | As<Int4>(value));
2418			*Pointer<Float4>(buffer) = oC.y;
2419			break;
2420		case FORMAT_G16R16I:
2421		case FORMAT_G16R16UI:
2422			if((rgbaWriteMask & 0x00000003) != 0x0)
2423			{
2424				buffer = cBuffer + 4 * x;
2425
				// First row: oC.x narrowed to four 16-bit values (two texels). The pixel mask and,
				// when not both channels are written, the channel mask are combined into
				// mergedMask, which selects between the source and the destination bits.
				// rgbaMask is only assigned, and only used, under that same condition.
2426				UInt2 rgbaMask;
2427				UShort4 packedCol = UShort4(As<Int4>(oC.x));
2428				UShort4 value = *Pointer<UShort4>(buffer);
2429				UInt2 mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD01Q) + xMask * 8);
2430				if((rgbaWriteMask & 0x3) != 0x3)
2431				{
					// The 32-bit (one texel) channel mask is replicated for both texels of a row
2432					Int tmpMask = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[rgbaWriteMask & 0x3][0]));
2433					rgbaMask = As<UInt2>(Int2(tmpMask, tmpMask));
2434					mergedMask &= rgbaMask;
2435				}
2436				*Pointer<UInt2>(buffer) = (As<UInt2>(packedCol) & mergedMask) | (As<UInt2>(value) & ~mergedMask);
2437
2438				buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
2439
				// Second row: oC.y with the maskD23Q table
2440				packedCol = UShort4(As<Int4>(oC.y));
2441				value = *Pointer<UShort4>(buffer);
2442				mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD23Q) + xMask * 8);
2443				if((rgbaWriteMask & 0x3) != 0x3)
2444				{
2445					mergedMask &= rgbaMask;
2446				}
2447				*Pointer<UInt2>(buffer) = (As<UInt2>(packedCol) & mergedMask) | (As<UInt2>(value) & ~mergedMask);
2448			}
2449			break;
2450		case FORMAT_G8R8I:
2451		case FORMAT_G8R8UI:
2452			if((rgbaWriteMask & 0x00000003) != 0x0)
2453			{
2454				buffer = cBuffer + 2 * x;
2455
2456				Int2 xyzw, packedCol;
2457
				// One 32-bit load per row (two texels of two bytes each); first row in element 0,
				// second row in element 1.
				// NOTE(review): xyzw is passed to the first Insert before it has been assigned;
				// both of its elements are then replaced by the two inserts.
2458				xyzw = Insert(xyzw, *Pointer<Int>(buffer), 0);
2459				buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
2460				xyzw = Insert(xyzw, *Pointer<Int>(buffer), 1);
2461
				// Narrow oC.x (first row) and oC.y (second row) to bytes
2462				if(state.targetFormat[index] == FORMAT_G8R8I)
2463				{
2464					packedCol = As<Int2>(Pack(Short4(As<Int4>(oC.x)), Short4(As<Int4>(oC.y))));
2465				}
2466				else
2467				{
2468					packedCol = As<Int2>(Pack(UShort4(As<Int4>(oC.x)), UShort4(As<Int4>(oC.y))));
2469				}
2470
2471				UInt2 mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskW4Q) + xMask * 8);
2472				if((rgbaWriteMask & 0x3) != 0x3)
2473				{
					// 5 * m equals m | (m << 2) for a two-bit m, i.e. the two channel write bits
					// are repeated to form the maskB4Q index (presumably one copy per texel of
					// a row - confirm against the table layout)
2474					Int tmpMask = *Pointer<Int>(constants + OFFSET(Constants, maskB4Q[5 * (rgbaWriteMask & 0x3)][0]));
2475					UInt2 rgbaMask = As<UInt2>(Int2(tmpMask, tmpMask));
2476					mergedMask &= rgbaMask;
2477				}
2478
2479				packedCol = As<Int2>((As<UInt2>(packedCol) & mergedMask) | (As<UInt2>(xyzw) & ~mergedMask));
2480
				// Element 1 goes to the second row (where buffer points), element 0 to the first
2481				*Pointer<UInt>(buffer) = As<UInt>(Extract(packedCol, 1));
2482				buffer -= *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
2483				*Pointer<UInt>(buffer) = As<UInt>(Extract(packedCol, 0));
2484			}
2485			break;
2486		case FORMAT_X32B32G32R32F:
2487		case FORMAT_A32B32G32R32F:
2488		case FORMAT_A32B32G32R32I:
2489		case FORMAT_A32B32G32R32UI:
2490			buffer = cBuffer + 16 * x;
2491
			// Four identical steps, one 16-byte texel each: read the destination, apply the
			// channel write mask (if not all four channels are written), apply the per-pixel
			// mask (maskX0X..maskX3X select the table for the respective texel), and store.

			// First row, first texel (oC.x)
2492			{
2493				value = *Pointer<Float4>(buffer, 16);
2494
2495				if(rgbaWriteMask != 0x0000000F)
2496				{
2497					Float4 masked = value;
2498					oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X[rgbaWriteMask][0])));
2499					masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X[rgbaWriteMask][0])));
2500					oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(masked));
2501				}
2502
2503				oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants,maskX0X) + xMask * 16, 16));
2504				value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskX0X) + xMask * 16, 16));
2505				oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(value));
2506				*Pointer<Float4>(buffer, 16) = oC.x;
2507			}
2508
			// First row, second texel (oC.y)
2509			{
2510				value = *Pointer<Float4>(buffer + 16, 16);
2511
2512				if(rgbaWriteMask != 0x0000000F)
2513				{
2514					Float4 masked = value;
2515					oC.y = As<Float4>(As<Int4>(oC.y) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X[rgbaWriteMask][0])));
2516					masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X[rgbaWriteMask][0])));
2517					oC.y = As<Float4>(As<Int4>(oC.y) | As<Int4>(masked));
2518				}
2519
2520				oC.y = As<Float4>(As<Int4>(oC.y) & *Pointer<Int4>(constants + OFFSET(Constants,maskX1X) + xMask * 16, 16));
2521				value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskX1X) + xMask * 16, 16));
2522				oC.y = As<Float4>(As<Int4>(oC.y) | As<Int4>(value));
2523				*Pointer<Float4>(buffer + 16, 16) = oC.y;
2524			}
2525
2526			buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
2527
			// Second row, first texel (oC.z)
2528			{
2529				value = *Pointer<Float4>(buffer, 16);
2530
2531				if(rgbaWriteMask != 0x0000000F)
2532				{
2533					Float4 masked = value;
2534					oC.z = As<Float4>(As<Int4>(oC.z) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X[rgbaWriteMask][0])));
2535					masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X[rgbaWriteMask][0])));
2536					oC.z = As<Float4>(As<Int4>(oC.z) | As<Int4>(masked));
2537				}
2538
2539				oC.z = As<Float4>(As<Int4>(oC.z) & *Pointer<Int4>(constants + OFFSET(Constants,maskX2X) + xMask * 16, 16));
2540				value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskX2X) + xMask * 16, 16));
2541				oC.z = As<Float4>(As<Int4>(oC.z) | As<Int4>(value));
2542				*Pointer<Float4>(buffer, 16) = oC.z;
2543			}
2544
			// Second row, second texel (oC.w)
2545			{
2546				value = *Pointer<Float4>(buffer + 16, 16);
2547
2548				if(rgbaWriteMask != 0x0000000F)
2549				{
2550					Float4 masked = value;
2551					oC.w = As<Float4>(As<Int4>(oC.w) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X[rgbaWriteMask][0])));
2552					masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X[rgbaWriteMask][0])));
2553					oC.w = As<Float4>(As<Int4>(oC.w) | As<Int4>(masked));
2554				}
2555
2556				oC.w = As<Float4>(As<Int4>(oC.w) & *Pointer<Int4>(constants + OFFSET(Constants,maskX3X) + xMask * 16, 16));
2557				value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskX3X) + xMask * 16, 16));
2558				oC.w = As<Float4>(As<Int4>(oC.w) | As<Int4>(value));
2559				*Pointer<Float4>(buffer + 16, 16) = oC.w;
2560			}
2561			break;
2562		case FORMAT_A16B16G16R16I:
2563		case FORMAT_A16B16G16R16UI:
2564			if((rgbaWriteMask & 0x0000000F) != 0x0)
2565			{
2566				buffer = cBuffer + 8 * x;
2567
				// First row: oC.x and oC.y narrowed to 16 bits per channel form the two texels.
				// rgbaMask is only assigned, and only used, when not all four channels are written.
2568				UInt4 rgbaMask;
2569				UShort8 value = *Pointer<UShort8>(buffer);
2570				UShort8 packedCol = UShort8(UShort4(As<Int4>(oC.x)), UShort4(As<Int4>(oC.y)));
2571				UInt4 mergedMask = *Pointer<UInt4>(constants + OFFSET(Constants, maskQ01X) + xMask * 16);
2572				if((rgbaWriteMask & 0xF) != 0xF)
2573				{
					// The 64-bit (one texel) channel mask is replicated for both texels of a row
2574					UInt2 tmpMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskW4Q[rgbaWriteMask][0]));
2575					rgbaMask = UInt4(tmpMask, tmpMask);
2576					mergedMask &= rgbaMask;
2577				}
2578				*Pointer<UInt4>(buffer) = (As<UInt4>(packedCol) & mergedMask) | (As<UInt4>(value) & ~mergedMask);
2579
2580				buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
2581
				// Second row: oC.z and oC.w with the maskQ23X table
2582				value = *Pointer<UShort8>(buffer);
2583				packedCol = UShort8(UShort4(As<Int4>(oC.z)), UShort4(As<Int4>(oC.w)));
2584				mergedMask = *Pointer<UInt4>(constants + OFFSET(Constants, maskQ23X) + xMask * 16);
2585				if((rgbaWriteMask & 0xF) != 0xF)
2586				{
2587					mergedMask &= rgbaMask;
2588				}
2589				*Pointer<UInt4>(buffer) = (As<UInt4>(packedCol) & mergedMask) | (As<UInt4>(value) & ~mergedMask);
2590			}
2591			break;
2592		case FORMAT_A8B8G8R8I:
2593		case FORMAT_A8B8G8R8UI:
2594			if((rgbaWriteMask & 0x0000000F) != 0x0)
2595			{
2596				UInt2 value, packedCol, mergedMask;
2597
2598				buffer = cBuffer + 4 * x;
2599
				// First row: oC.x and oC.y narrowed to one byte per channel form the two texels
2600				if(state.targetFormat[index] == FORMAT_A8B8G8R8I)
2601				{
2602					packedCol = As<UInt2>(Pack(Short4(As<Int4>(oC.x)), Short4(As<Int4>(oC.y))));
2603				}
2604				else
2605				{
2606					packedCol = As<UInt2>(Pack(UShort4(As<Int4>(oC.x)), UShort4(As<Int4>(oC.y))));
2607				}
				// NOTE(review): the loads in this case pass 16 as second Pointer argument
				// (presumably an alignment) while the stores to the same address do not;
				// buffer is cBuffer + 4 * x - confirm that this alignment actually holds.
2608				value = *Pointer<UInt2>(buffer, 16);
2609				mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD01Q) + xMask * 8);
2610				if(rgbaWriteMask != 0xF)
2611				{
2612					mergedMask &= *Pointer<UInt2>(constants + OFFSET(Constants, maskB4Q[rgbaWriteMask][0]));
2613				}
2614				*Pointer<UInt2>(buffer) = (packedCol & mergedMask) | (value & ~mergedMask);
2615
2616				buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
2617
				// Second row: oC.z and oC.w with the maskD23Q table
2618				if(state.targetFormat[index] == FORMAT_A8B8G8R8I)
2619				{
2620					packedCol = As<UInt2>(Pack(Short4(As<Int4>(oC.z)), Short4(As<Int4>(oC.w))));
2621				}
2622				else
2623				{
2624					packedCol = As<UInt2>(Pack(UShort4(As<Int4>(oC.z)), UShort4(As<Int4>(oC.w))));
2625				}
2626				value = *Pointer<UInt2>(buffer, 16);
2627				mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD23Q) + xMask * 8);
2628				if(rgbaWriteMask != 0xF)
2629				{
2630					mergedMask &= *Pointer<UInt2>(constants + OFFSET(Constants, maskB4Q[rgbaWriteMask][0]));
2631				}
2632				*Pointer<UInt2>(buffer) = (packedCol & mergedMask) | (value & ~mergedMask);
2633			}
2634			break;
2635		default:
2636			ASSERT(false);
2637		}
2638	}
2639
2640	UShort4 PixelRoutine::convertFixed16(Float4 &cf, bool saturate)
2641	{
2642		return UShort4(cf * Float4(0xFFFF), saturate);
2643	}
2644
2645	void PixelRoutine::sRGBtoLinear16_12_16(Vector4s &c)
2646	{
2647		c.x = As<UShort4>(c.x) >> 4;
2648		c.y = As<UShort4>(c.y) >> 4;
2649		c.z = As<UShort4>(c.z) >> 4;
2650
2651		sRGBtoLinear12_16(c);
2652	}
2653
2654	void PixelRoutine::sRGBtoLinear12_16(Vector4s &c)
2655	{
2656		Pointer<Byte> LUT = constants + OFFSET(Constants,sRGBtoLinear12_16);
2657
2658		c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 0))), 0);
2659		c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 1))), 1);
2660		c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 2))), 2);
2661		c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 3))), 3);
2662
2663		c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 0))), 0);
2664		c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 1))), 1);
2665		c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 2))), 2);
2666		c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 3))), 3);
2667
2668		c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 0))), 0);
2669		c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 1))), 1);
2670		c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 2))), 2);
2671		c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 3))), 3);
2672	}
2673
2674	void PixelRoutine::linearToSRGB16_12_16(Vector4s &c)
2675	{
2676		c.x = As<UShort4>(c.x) >> 4;
2677		c.y = As<UShort4>(c.y) >> 4;
2678		c.z = As<UShort4>(c.z) >> 4;
2679
2680		linearToSRGB12_16(c);
2681	}
2682
2683	void PixelRoutine::linearToSRGB12_16(Vector4s &c)
2684	{
2685		Pointer<Byte> LUT = constants + OFFSET(Constants,linearToSRGB12_16);
2686
2687		c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 0))), 0);
2688		c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 1))), 1);
2689		c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 2))), 2);
2690		c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 3))), 3);
2691
2692		c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 0))), 0);
2693		c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 1))), 1);
2694		c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 2))), 2);
2695		c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 3))), 3);
2696
2697		c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 0))), 0);
2698		c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 1))), 1);
2699		c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 2))), 2);
2700		c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 3))), 3);
2701	}
2702
2703	Float4 PixelRoutine::sRGBtoLinear(const Float4 &x)   // Approximates x^2.2
2704	{
2705		Float4 linear = x * x;
2706		linear = linear * Float4(0.73f) + linear * x * Float4(0.27f);
2707
2708		return Min(Max(linear, Float4(0.0f)), Float4(1.0f));
2709	}
2710
2711	bool PixelRoutine::colorUsed()
2712	{
2713		return state.colorWriteMask || state.alphaTestActive() || state.shaderContainsKill;
2714	}
2715}
2716