1// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//    http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15#include "VertexRoutine.hpp"
16
17#include "VertexShader.hpp"
18#include "Constants.hpp"
19#include "Renderer/Vertex.hpp"
20#include "Renderer/Renderer.hpp"
21#include "Common/Half.hpp"
22#include "Common/Debug.hpp"
23
24namespace sw
25{
26	extern bool halfIntegerCoordinates;     // Pixel centers are not at integer coordinates
27	extern bool symmetricNormalizedDepth;   // [-1, 1] instead of [0, 1]
28
29	VertexRoutine::VertexRoutine(const VertexProcessor::State &state, const VertexShader *shader)
30		: v(shader && shader->dynamicallyIndexedInput),
31		  o(shader && shader->dynamicallyIndexedOutput),
32		  state(state)
33	{
34	}
35
36	VertexRoutine::~VertexRoutine()
37	{
38	}
39
40	void VertexRoutine::generate()
41	{
42		const bool textureSampling = state.textureSampling;
43
44		Pointer<Byte> cache = task + OFFSET(VertexTask,vertexCache);
45		Pointer<Byte> vertexCache = cache + OFFSET(VertexCache,vertex);
46		Pointer<Byte> tagCache = cache + OFFSET(VertexCache,tag);
47
48		UInt vertexCount = *Pointer<UInt>(task + OFFSET(VertexTask,vertexCount));
49		UInt primitiveNumber = *Pointer<UInt>(task + OFFSET(VertexTask, primitiveStart));
50		UInt indexInPrimitive = 0;
51
52		constants = *Pointer<Pointer<Byte>>(data + OFFSET(DrawData,constants));
53
54		Do
55		{
56			UInt index = *Pointer<UInt>(batch);
57			UInt tagIndex = index & 0x0000003C;
58			UInt indexQ = !textureSampling ? UInt(index & 0xFFFFFFFC) : index;   // FIXME: TEXLDL hack to have independent LODs, hurts performance.
59
60			If(*Pointer<UInt>(tagCache + tagIndex) != indexQ)
61			{
62				*Pointer<UInt>(tagCache + tagIndex) = indexQ;
63
64				readInput(indexQ);
65				pipeline(indexQ);
66				postTransform();
67				computeClipFlags();
68
69				Pointer<Byte> cacheLine0 = vertexCache + tagIndex * UInt((int)sizeof(Vertex));
70				writeCache(cacheLine0);
71			}
72
73			UInt cacheIndex = index & 0x0000003F;
74			Pointer<Byte> cacheLine = vertexCache + cacheIndex * UInt((int)sizeof(Vertex));
75			writeVertex(vertex, cacheLine);
76
77			if(state.transformFeedbackEnabled != 0)
78			{
79				transformFeedback(vertex, primitiveNumber, indexInPrimitive);
80
81				indexInPrimitive++;
82				If(indexInPrimitive == 3)
83				{
84					primitiveNumber++;
85					indexInPrimitive = 0;
86				}
87			}
88
89			vertex += sizeof(Vertex);
90			batch += sizeof(unsigned int);
91			vertexCount--;
92		}
93		Until(vertexCount == 0)
94
95		Return();
96	}
97
98	void VertexRoutine::readInput(UInt &index)
99	{
100		for(int i = 0; i < MAX_VERTEX_INPUTS; i++)
101		{
102			Pointer<Byte> input = *Pointer<Pointer<Byte>>(data + OFFSET(DrawData,input) + sizeof(void*) * i);
103			UInt stride = *Pointer<UInt>(data + OFFSET(DrawData,stride) + sizeof(unsigned int) * i);
104
105			v[i] = readStream(input, stride, state.input[i], index);
106		}
107	}
108
109	void VertexRoutine::computeClipFlags()
110	{
111		int pos = state.positionRegister;
112
113		Int4 maxX = CmpLT(o[pos].w, o[pos].x);
114		Int4 maxY = CmpLT(o[pos].w, o[pos].y);
115		Int4 maxZ = CmpLT(o[pos].w, o[pos].z);
116		Int4 minX = CmpNLE(-o[pos].w, o[pos].x);
117		Int4 minY = CmpNLE(-o[pos].w, o[pos].y);
118		Int4 minZ = symmetricNormalizedDepth ? CmpNLE(-o[pos].w, o[pos].z) : CmpNLE(Float4(0.0f), o[pos].z);
119
120		clipFlags = *Pointer<Int>(constants + OFFSET(Constants,maxX) + SignMask(maxX) * 4);   // FIXME: Array indexing
121		clipFlags |= *Pointer<Int>(constants + OFFSET(Constants,maxY) + SignMask(maxY) * 4);
122		clipFlags |= *Pointer<Int>(constants + OFFSET(Constants,maxZ) + SignMask(maxZ) * 4);
123		clipFlags |= *Pointer<Int>(constants + OFFSET(Constants,minX) + SignMask(minX) * 4);
124		clipFlags |= *Pointer<Int>(constants + OFFSET(Constants,minY) + SignMask(minY) * 4);
125		clipFlags |= *Pointer<Int>(constants + OFFSET(Constants,minZ) + SignMask(minZ) * 4);
126
127		Int4 finiteX = CmpLE(Abs(o[pos].x), *Pointer<Float4>(constants + OFFSET(Constants,maxPos)));
128		Int4 finiteY = CmpLE(Abs(o[pos].y), *Pointer<Float4>(constants + OFFSET(Constants,maxPos)));
129		Int4 finiteZ = CmpLE(Abs(o[pos].z), *Pointer<Float4>(constants + OFFSET(Constants,maxPos)));
130
131		Int4 finiteXYZ = finiteX & finiteY & finiteZ;
132		clipFlags |= *Pointer<Int>(constants + OFFSET(Constants,fini) + SignMask(finiteXYZ) * 4);
133
134		if(state.preTransformed)
135		{
136			clipFlags &= 0xFBFBFBFB;   // Don't clip against far clip plane
137		}
138	}
139
140	Vector4f VertexRoutine::readStream(Pointer<Byte> &buffer, UInt &stride, const Stream &stream, const UInt &index)
141	{
142		const bool textureSampling = state.textureSampling;
143
144		Vector4f v;
145
146		Pointer<Byte> source0 = buffer + index * stride;
147		Pointer<Byte> source1 = source0 + (!textureSampling ? stride : 0);
148		Pointer<Byte> source2 = source1 + (!textureSampling ? stride : 0);
149		Pointer<Byte> source3 = source2 + (!textureSampling ? stride : 0);
150
151		bool isNativeFloatAttrib = (stream.attribType == VertexShader::ATTRIBTYPE_FLOAT) || stream.normalized;
152
153		switch(stream.type)
154		{
155		case STREAMTYPE_FLOAT:
156			{
157				if(stream.count == 0)
158				{
159					// Null stream, all default components
160				}
161				else
162				{
163					if(stream.count == 1)
164					{
165						v.x.x = *Pointer<Float>(source0);
166						v.x.y = *Pointer<Float>(source1);
167						v.x.z = *Pointer<Float>(source2);
168						v.x.w = *Pointer<Float>(source3);
169					}
170					else
171					{
172						v.x = *Pointer<Float4>(source0);
173						v.y = *Pointer<Float4>(source1);
174						v.z = *Pointer<Float4>(source2);
175						v.w = *Pointer<Float4>(source3);
176
177						transpose4xN(v.x, v.y, v.z, v.w, stream.count);
178					}
179
180					switch(stream.attribType)
181					{
182					case VertexShader::ATTRIBTYPE_INT:
183						if(stream.count >= 1) v.x = As<Float4>(Int4(v.x));
184						if(stream.count >= 2) v.x = As<Float4>(Int4(v.y));
185						if(stream.count >= 3) v.x = As<Float4>(Int4(v.z));
186						if(stream.count >= 4) v.x = As<Float4>(Int4(v.w));
187						break;
188					case VertexShader::ATTRIBTYPE_UINT:
189						if(stream.count >= 1) v.x = As<Float4>(UInt4(v.x));
190						if(stream.count >= 2) v.x = As<Float4>(UInt4(v.y));
191						if(stream.count >= 3) v.x = As<Float4>(UInt4(v.z));
192						if(stream.count >= 4) v.x = As<Float4>(UInt4(v.w));
193						break;
194					default:
195						break;
196					}
197				}
198			}
199			break;
200		case STREAMTYPE_BYTE:
201			if(isNativeFloatAttrib) // Stream: UByte, Shader attrib: Float
202			{
203				v.x = Float4(*Pointer<Byte4>(source0));
204				v.y = Float4(*Pointer<Byte4>(source1));
205				v.z = Float4(*Pointer<Byte4>(source2));
206				v.w = Float4(*Pointer<Byte4>(source3));
207
208				transpose4xN(v.x, v.y, v.z, v.w, stream.count);
209
210				if(stream.normalized)
211				{
212					if(stream.count >= 1) v.x *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleByte));
213					if(stream.count >= 2) v.y *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleByte));
214					if(stream.count >= 3) v.z *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleByte));
215					if(stream.count >= 4) v.w *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleByte));
216				}
217			}
218			else // Stream: UByte, Shader attrib: Int / UInt
219			{
220				v.x = As<Float4>(Int4(*Pointer<Byte4>(source0)));
221				v.y = As<Float4>(Int4(*Pointer<Byte4>(source1)));
222				v.z = As<Float4>(Int4(*Pointer<Byte4>(source2)));
223				v.w = As<Float4>(Int4(*Pointer<Byte4>(source3)));
224
225				transpose4xN(v.x, v.y, v.z, v.w, stream.count);
226			}
227			break;
228		case STREAMTYPE_SBYTE:
229			if(isNativeFloatAttrib) // Stream: SByte, Shader attrib: Float
230			{
231				v.x = Float4(*Pointer<SByte4>(source0));
232				v.y = Float4(*Pointer<SByte4>(source1));
233				v.z = Float4(*Pointer<SByte4>(source2));
234				v.w = Float4(*Pointer<SByte4>(source3));
235
236				transpose4xN(v.x, v.y, v.z, v.w, stream.count);
237
238				if(stream.normalized)
239				{
240					if(stream.count >= 1) v.x *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleSByte));
241					if(stream.count >= 2) v.y *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleSByte));
242					if(stream.count >= 3) v.z *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleSByte));
243					if(stream.count >= 4) v.w *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleSByte));
244				}
245			}
246			else // Stream: SByte, Shader attrib: Int / UInt
247			{
248				v.x = As<Float4>(Int4(*Pointer<SByte4>(source0)));
249				v.y = As<Float4>(Int4(*Pointer<SByte4>(source1)));
250				v.z = As<Float4>(Int4(*Pointer<SByte4>(source2)));
251				v.w = As<Float4>(Int4(*Pointer<SByte4>(source3)));
252
253				transpose4xN(v.x, v.y, v.z, v.w, stream.count);
254			}
255			break;
256		case STREAMTYPE_COLOR:
257			{
258				v.x = Float4(*Pointer<Byte4>(source0)) * *Pointer<Float4>(constants + OFFSET(Constants,unscaleByte));
259				v.y = Float4(*Pointer<Byte4>(source1)) * *Pointer<Float4>(constants + OFFSET(Constants,unscaleByte));
260				v.z = Float4(*Pointer<Byte4>(source2)) * *Pointer<Float4>(constants + OFFSET(Constants,unscaleByte));
261				v.w = Float4(*Pointer<Byte4>(source3)) * *Pointer<Float4>(constants + OFFSET(Constants,unscaleByte));
262
263				transpose4x4(v.x, v.y, v.z, v.w);
264
265				// Swap red and blue
266				Float4 t = v.x;
267				v.x = v.z;
268				v.z = t;
269			}
270			break;
271		case STREAMTYPE_SHORT:
272			if(isNativeFloatAttrib) // Stream: Int, Shader attrib: Float
273			{
274				v.x = Float4(*Pointer<Short4>(source0));
275				v.y = Float4(*Pointer<Short4>(source1));
276				v.z = Float4(*Pointer<Short4>(source2));
277				v.w = Float4(*Pointer<Short4>(source3));
278
279				transpose4xN(v.x, v.y, v.z, v.w, stream.count);
280
281				if(stream.normalized)
282				{
283					if(stream.count >= 1) v.x *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleShort));
284					if(stream.count >= 2) v.y *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleShort));
285					if(stream.count >= 3) v.z *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleShort));
286					if(stream.count >= 4) v.w *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleShort));
287				}
288			}
289			else // Stream: Short, Shader attrib: Int/UInt, no type conversion
290			{
291				v.x = As<Float4>(Int4(*Pointer<Short4>(source0)));
292				v.y = As<Float4>(Int4(*Pointer<Short4>(source1)));
293				v.z = As<Float4>(Int4(*Pointer<Short4>(source2)));
294				v.w = As<Float4>(Int4(*Pointer<Short4>(source3)));
295
296				transpose4xN(v.x, v.y, v.z, v.w, stream.count);
297			}
298			break;
299		case STREAMTYPE_USHORT:
300			if(isNativeFloatAttrib) // Stream: Int, Shader attrib: Float
301			{
302				v.x = Float4(*Pointer<UShort4>(source0));
303				v.y = Float4(*Pointer<UShort4>(source1));
304				v.z = Float4(*Pointer<UShort4>(source2));
305				v.w = Float4(*Pointer<UShort4>(source3));
306
307				transpose4xN(v.x, v.y, v.z, v.w, stream.count);
308
309				if(stream.normalized)
310				{
311					if(stream.count >= 1) v.x *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleUShort));
312					if(stream.count >= 2) v.y *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleUShort));
313					if(stream.count >= 3) v.z *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleUShort));
314					if(stream.count >= 4) v.w *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleUShort));
315				}
316			}
317			else // Stream: UShort, Shader attrib: Int/UInt, no type conversion
318			{
319				v.x = As<Float4>(Int4(*Pointer<UShort4>(source0)));
320				v.y = As<Float4>(Int4(*Pointer<UShort4>(source1)));
321				v.z = As<Float4>(Int4(*Pointer<UShort4>(source2)));
322				v.w = As<Float4>(Int4(*Pointer<UShort4>(source3)));
323
324				transpose4xN(v.x, v.y, v.z, v.w, stream.count);
325			}
326			break;
327		case STREAMTYPE_INT:
328			if(isNativeFloatAttrib) // Stream: Int, Shader attrib: Float
329			{
330				v.x = Float4(*Pointer<Int4>(source0));
331				v.y = Float4(*Pointer<Int4>(source1));
332				v.z = Float4(*Pointer<Int4>(source2));
333				v.w = Float4(*Pointer<Int4>(source3));
334
335				transpose4xN(v.x, v.y, v.z, v.w, stream.count);
336
337				if(stream.normalized)
338				{
339					if(stream.count >= 1) v.x *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleInt));
340					if(stream.count >= 2) v.y *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleInt));
341					if(stream.count >= 3) v.z *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleInt));
342					if(stream.count >= 4) v.w *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleInt));
343				}
344			}
345			else // Stream: Int, Shader attrib: Int/UInt, no type conversion
346			{
347				v.x = *Pointer<Float4>(source0);
348				v.y = *Pointer<Float4>(source1);
349				v.z = *Pointer<Float4>(source2);
350				v.w = *Pointer<Float4>(source3);
351
352				transpose4xN(v.x, v.y, v.z, v.w, stream.count);
353			}
354			break;
355		case STREAMTYPE_UINT:
356			if(isNativeFloatAttrib) // Stream: UInt, Shader attrib: Float
357			{
358				v.x = Float4(*Pointer<UInt4>(source0));
359				v.y = Float4(*Pointer<UInt4>(source1));
360				v.z = Float4(*Pointer<UInt4>(source2));
361				v.w = Float4(*Pointer<UInt4>(source3));
362
363				transpose4xN(v.x, v.y, v.z, v.w, stream.count);
364
365				if(stream.normalized)
366				{
367					if(stream.count >= 1) v.x *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleUInt));
368					if(stream.count >= 2) v.y *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleUInt));
369					if(stream.count >= 3) v.z *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleUInt));
370					if(stream.count >= 4) v.w *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleUInt));
371				}
372			}
373			else // Stream: UInt, Shader attrib: Int/UInt, no type conversion
374			{
375				v.x = *Pointer<Float4>(source0);
376				v.y = *Pointer<Float4>(source1);
377				v.z = *Pointer<Float4>(source2);
378				v.w = *Pointer<Float4>(source3);
379
380				transpose4xN(v.x, v.y, v.z, v.w, stream.count);
381			}
382			break;
383		case STREAMTYPE_UDEC3:
384			{
385				// FIXME: Vectorize
386				{
387					Int x, y, z;
388
389					x = y = z = *Pointer<Int>(source0);
390
391					v.x.x = Float(x & 0x000003FF);
392					v.x.y = Float(y & 0x000FFC00);
393					v.x.z = Float(z & 0x3FF00000);
394				}
395
396				{
397					Int x, y, z;
398
399					x = y = z = *Pointer<Int>(source1);
400
401					v.y.x = Float(x & 0x000003FF);
402					v.y.y = Float(y & 0x000FFC00);
403					v.y.z = Float(z & 0x3FF00000);
404				}
405
406				{
407					Int x, y, z;
408
409					x = y = z = *Pointer<Int>(source2);
410
411					v.z.x = Float(x & 0x000003FF);
412					v.z.y = Float(y & 0x000FFC00);
413					v.z.z = Float(z & 0x3FF00000);
414				}
415
416				{
417					Int x, y, z;
418
419					x = y = z = *Pointer<Int>(source3);
420
421					v.w.x = Float(x & 0x000003FF);
422					v.w.y = Float(y & 0x000FFC00);
423					v.w.z = Float(z & 0x3FF00000);
424				}
425
426				transpose4x3(v.x, v.y, v.z, v.w);
427
428				v.y *= Float4(1.0f / 0x00000400);
429				v.z *= Float4(1.0f / 0x00100000);
430			}
431			break;
432		case STREAMTYPE_DEC3N:
433			{
434				// FIXME: Vectorize
435				{
436					Int x, y, z;
437
438					x = y = z = *Pointer<Int>(source0);
439
440					v.x.x = Float((x << 22) & 0xFFC00000);
441					v.x.y = Float((y << 12) & 0xFFC00000);
442					v.x.z = Float((z << 2)  & 0xFFC00000);
443				}
444
445				{
446					Int x, y, z;
447
448					x = y = z = *Pointer<Int>(source1);
449
450					v.y.x = Float((x << 22) & 0xFFC00000);
451					v.y.y = Float((y << 12) & 0xFFC00000);
452					v.y.z = Float((z << 2)  & 0xFFC00000);
453				}
454
455				{
456					Int x, y, z;
457
458					x = y = z = *Pointer<Int>(source2);
459
460					v.z.x = Float((x << 22) & 0xFFC00000);
461					v.z.y = Float((y << 12) & 0xFFC00000);
462					v.z.z = Float((z << 2)  & 0xFFC00000);
463				}
464
465				{
466					Int x, y, z;
467
468					x = y = z = *Pointer<Int>(source3);
469
470					v.w.x = Float((x << 22) & 0xFFC00000);
471					v.w.y = Float((y << 12) & 0xFFC00000);
472					v.w.z = Float((z << 2)  & 0xFFC00000);
473				}
474
475				transpose4x3(v.x, v.y, v.z, v.w);
476
477				v.x *= Float4(1.0f / 0x00400000 / 511.0f);
478				v.y *= Float4(1.0f / 0x00400000 / 511.0f);
479				v.z *= Float4(1.0f / 0x00400000 / 511.0f);
480			}
481			break;
482		case STREAMTYPE_FIXED:
483			{
484				v.x = Float4(*Pointer<Int4>(source0)) * *Pointer<Float4>(constants + OFFSET(Constants,unscaleFixed));
485				v.y = Float4(*Pointer<Int4>(source1)) * *Pointer<Float4>(constants + OFFSET(Constants,unscaleFixed));
486				v.z = Float4(*Pointer<Int4>(source2)) * *Pointer<Float4>(constants + OFFSET(Constants,unscaleFixed));
487				v.w = Float4(*Pointer<Int4>(source3)) * *Pointer<Float4>(constants + OFFSET(Constants,unscaleFixed));
488
489				transpose4xN(v.x, v.y, v.z, v.w, stream.count);
490			}
491			break;
492		case STREAMTYPE_HALF:
493			{
494				if(stream.count >= 1)
495				{
496					UShort x0 = *Pointer<UShort>(source0 + 0);
497					UShort x1 = *Pointer<UShort>(source1 + 0);
498					UShort x2 = *Pointer<UShort>(source2 + 0);
499					UShort x3 = *Pointer<UShort>(source3 + 0);
500
501					v.x.x = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(x0) * 4);
502					v.x.y = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(x1) * 4);
503					v.x.z = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(x2) * 4);
504					v.x.w = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(x3) * 4);
505				}
506
507				if(stream.count >= 2)
508				{
509					UShort y0 = *Pointer<UShort>(source0 + 2);
510					UShort y1 = *Pointer<UShort>(source1 + 2);
511					UShort y2 = *Pointer<UShort>(source2 + 2);
512					UShort y3 = *Pointer<UShort>(source3 + 2);
513
514					v.y.x = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(y0) * 4);
515					v.y.y = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(y1) * 4);
516					v.y.z = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(y2) * 4);
517					v.y.w = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(y3) * 4);
518				}
519
520				if(stream.count >= 3)
521				{
522					UShort z0 = *Pointer<UShort>(source0 + 4);
523					UShort z1 = *Pointer<UShort>(source1 + 4);
524					UShort z2 = *Pointer<UShort>(source2 + 4);
525					UShort z3 = *Pointer<UShort>(source3 + 4);
526
527					v.z.x = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(z0) * 4);
528					v.z.y = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(z1) * 4);
529					v.z.z = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(z2) * 4);
530					v.z.w = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(z3) * 4);
531				}
532
533				if(stream.count >= 4)
534				{
535					UShort w0 = *Pointer<UShort>(source0 + 6);
536					UShort w1 = *Pointer<UShort>(source1 + 6);
537					UShort w2 = *Pointer<UShort>(source2 + 6);
538					UShort w3 = *Pointer<UShort>(source3 + 6);
539
540					v.w.x = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(w0) * 4);
541					v.w.y = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(w1) * 4);
542					v.w.z = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(w2) * 4);
543					v.w.w = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(w3) * 4);
544				}
545			}
546			break;
547		case STREAMTYPE_INDICES:
548			{
549				v.x.x = *Pointer<Float>(source0);
550				v.x.y = *Pointer<Float>(source1);
551				v.x.z = *Pointer<Float>(source2);
552				v.x.w = *Pointer<Float>(source3);
553			}
554			break;
555		case STREAMTYPE_2_10_10_10_INT:
556			{
557				Int4 src;
558				src = Insert(src, *Pointer<Int>(source0), 0);
559				src = Insert(src, *Pointer<Int>(source1), 1);
560				src = Insert(src, *Pointer<Int>(source2), 2);
561				src = Insert(src, *Pointer<Int>(source3), 3);
562
563				v.x = Float4((src << 22) >> 22);
564				v.y = Float4((src << 12) >> 22);
565				v.z = Float4((src << 02) >> 22);
566				v.w = Float4(src >> 30);
567
568				if(stream.normalized)
569				{
570					v.x = Max(v.x * Float4(1.0f / 0x1FF), Float4(-1.0f));
571					v.y = Max(v.y * Float4(1.0f / 0x1FF), Float4(-1.0f));
572					v.z = Max(v.z * Float4(1.0f / 0x1FF), Float4(-1.0f));
573					v.w = Max(v.w, Float4(-1.0f));
574				}
575			}
576			break;
577		case STREAMTYPE_2_10_10_10_UINT:
578			{
579				Int4 src;
580				src = Insert(src, *Pointer<Int>(source0), 0);
581				src = Insert(src, *Pointer<Int>(source1), 1);
582				src = Insert(src, *Pointer<Int>(source2), 2);
583				src = Insert(src, *Pointer<Int>(source3), 3);
584
585				v.x = Float4(src & Int4(0x3FF));
586				v.y = Float4((src >> 10) & Int4(0x3FF));
587				v.z = Float4((src >> 20) & Int4(0x3FF));
588				v.w = Float4((src >> 30) & Int4(0x3));
589
590				if(stream.normalized)
591				{
592					v.x *= Float4(1.0f / 0x3FF);
593					v.y *= Float4(1.0f / 0x3FF);
594					v.z *= Float4(1.0f / 0x3FF);
595					v.w *= Float4(1.0f / 0x3);
596				}
597			}
598			break;
599		default:
600			ASSERT(false);
601		}
602
603		if(stream.count < 1) v.x = Float4(0.0f);
604		if(stream.count < 2) v.y = Float4(0.0f);
605		if(stream.count < 3) v.z = Float4(0.0f);
606		if(stream.count < 4) v.w = isNativeFloatAttrib ? As<Float4>(Float4(1.0f)) : As<Float4>(Int4(0));
607
608		return v;
609	}
610
611	void VertexRoutine::postTransform()
612	{
613		int pos = state.positionRegister;
614
615		// Backtransform
616		if(state.preTransformed)
617		{
618			Float4 rhw = Float4(1.0f) / o[pos].w;
619
620			Float4 W = *Pointer<Float4>(data + OFFSET(DrawData,Wx16)) * Float4(1.0f / 16.0f);
621			Float4 H = *Pointer<Float4>(data + OFFSET(DrawData,Hx16)) * Float4(1.0f / 16.0f);
622			Float4 L = *Pointer<Float4>(data + OFFSET(DrawData,X0x16)) * Float4(1.0f / 16.0f);
623			Float4 T = *Pointer<Float4>(data + OFFSET(DrawData,Y0x16)) * Float4(1.0f / 16.0f);
624
625			o[pos].x = (o[pos].x - L) / W * rhw;
626			o[pos].y = (o[pos].y - T) / H * rhw;
627			o[pos].z = o[pos].z * rhw;
628			o[pos].w = rhw;
629		}
630
631		if(!halfIntegerCoordinates && !state.preTransformed)
632		{
633			o[pos].x = o[pos].x + *Pointer<Float4>(data + OFFSET(DrawData,halfPixelX)) * o[pos].w;
634			o[pos].y = o[pos].y + *Pointer<Float4>(data + OFFSET(DrawData,halfPixelY)) * o[pos].w;
635		}
636
637		if(state.superSampling)
638		{
639			o[pos].x = o[pos].x + *Pointer<Float4>(data + OFFSET(DrawData,XXXX)) * o[pos].w;
640			o[pos].y = o[pos].y + *Pointer<Float4>(data + OFFSET(DrawData,YYYY)) * o[pos].w;
641		}
642	}
643
644	void VertexRoutine::writeCache(Pointer<Byte> &cacheLine)
645	{
646		Vector4f v;
647
648		for(int i = 0; i < MAX_VERTEX_OUTPUTS; i++)
649		{
650			if(state.output[i].write)
651			{
652				v.x = o[i].x;
653				v.y = o[i].y;
654				v.z = o[i].z;
655				v.w = o[i].w;
656
657				if(state.output[i].xClamp)
658				{
659					v.x = Max(v.x, Float4(0.0f));
660					v.x = Min(v.x, Float4(1.0f));
661				}
662
663				if(state.output[i].yClamp)
664				{
665					v.y = Max(v.y, Float4(0.0f));
666					v.y = Min(v.y, Float4(1.0f));
667				}
668
669				if(state.output[i].zClamp)
670				{
671					v.z = Max(v.z, Float4(0.0f));
672					v.z = Min(v.z, Float4(1.0f));
673				}
674
675				if(state.output[i].wClamp)
676				{
677					v.w = Max(v.w, Float4(0.0f));
678					v.w = Min(v.w, Float4(1.0f));
679				}
680
681				if(state.output[i].write == 0x01)
682				{
683					*Pointer<Float>(cacheLine + OFFSET(Vertex,v[i]) + sizeof(Vertex) * 0) = v.x.x;
684					*Pointer<Float>(cacheLine + OFFSET(Vertex,v[i]) + sizeof(Vertex) * 1) = v.x.y;
685					*Pointer<Float>(cacheLine + OFFSET(Vertex,v[i]) + sizeof(Vertex) * 2) = v.x.z;
686					*Pointer<Float>(cacheLine + OFFSET(Vertex,v[i]) + sizeof(Vertex) * 3) = v.x.w;
687				}
688				else
689				{
690					if(state.output[i].write == 0x03)
691					{
692						transpose2x4(v.x, v.y, v.z, v.w);
693					}
694					else
695					{
696						transpose4x4(v.x, v.y, v.z, v.w);
697					}
698
699					*Pointer<Float4>(cacheLine + OFFSET(Vertex,v[i]) + sizeof(Vertex) * 0, 16) = v.x;
700					*Pointer<Float4>(cacheLine + OFFSET(Vertex,v[i]) + sizeof(Vertex) * 1, 16) = v.y;
701					*Pointer<Float4>(cacheLine + OFFSET(Vertex,v[i]) + sizeof(Vertex) * 2, 16) = v.z;
702					*Pointer<Float4>(cacheLine + OFFSET(Vertex,v[i]) + sizeof(Vertex) * 3, 16) = v.w;
703				}
704			}
705		}
706
707		*Pointer<Int>(cacheLine + OFFSET(Vertex,clipFlags) + sizeof(Vertex) * 0) = (clipFlags >> 0)  & 0x0000000FF;
708		*Pointer<Int>(cacheLine + OFFSET(Vertex,clipFlags) + sizeof(Vertex) * 1) = (clipFlags >> 8)  & 0x0000000FF;
709		*Pointer<Int>(cacheLine + OFFSET(Vertex,clipFlags) + sizeof(Vertex) * 2) = (clipFlags >> 16) & 0x0000000FF;
710		*Pointer<Int>(cacheLine + OFFSET(Vertex,clipFlags) + sizeof(Vertex) * 3) = (clipFlags >> 24) & 0x0000000FF;
711
712		// Viewport transform
713		int pos = state.positionRegister;
714
715		v.x = o[pos].x;
716		v.y = o[pos].y;
717		v.z = o[pos].z;
718		v.w = o[pos].w;
719
720		if(symmetricNormalizedDepth)
721		{
722			v.z = (v.z + v.w) * Float4(0.5f);   // [-1, 1] -> [0, 1]
723		}
724
725		Float4 w = As<Float4>(As<Int4>(v.w) | (As<Int4>(CmpEQ(v.w, Float4(0.0f))) & As<Int4>(Float4(1.0f))));
726		Float4 rhw = Float4(1.0f) / w;
727
728		v.x = As<Float4>(RoundInt(*Pointer<Float4>(data + OFFSET(DrawData,X0x16)) + v.x * rhw * *Pointer<Float4>(data + OFFSET(DrawData,Wx16))));
729		v.y = As<Float4>(RoundInt(*Pointer<Float4>(data + OFFSET(DrawData,Y0x16)) + v.y * rhw * *Pointer<Float4>(data + OFFSET(DrawData,Hx16))));
730		v.z = v.z * rhw;
731		v.w = rhw;
732
733		transpose4x4(v.x, v.y, v.z, v.w);
734
735		*Pointer<Float4>(cacheLine + OFFSET(Vertex,X) + sizeof(Vertex) * 0, 16) = v.x;
736		*Pointer<Float4>(cacheLine + OFFSET(Vertex,X) + sizeof(Vertex) * 1, 16) = v.y;
737		*Pointer<Float4>(cacheLine + OFFSET(Vertex,X) + sizeof(Vertex) * 2, 16) = v.z;
738		*Pointer<Float4>(cacheLine + OFFSET(Vertex,X) + sizeof(Vertex) * 3, 16) = v.w;
739	}
740
741	void VertexRoutine::writeVertex(const Pointer<Byte> &vertex, Pointer<Byte> &cache)
742	{
743		for(int i = 0; i < MAX_VERTEX_OUTPUTS; i++)
744		{
745			if(state.output[i].write)
746			{
747				*Pointer<Int4>(vertex + OFFSET(Vertex,v[i]), 16) = *Pointer<Int4>(cache + OFFSET(Vertex,v[i]), 16);
748			}
749		}
750
751		*Pointer<Int4>(vertex + OFFSET(Vertex,X)) = *Pointer<Int4>(cache + OFFSET(Vertex,X));
752		*Pointer<Int>(vertex + OFFSET(Vertex,clipFlags)) = *Pointer<Int>(cache + OFFSET(Vertex,clipFlags));
753	}
754
755	void VertexRoutine::transformFeedback(const Pointer<Byte> &vertex, const UInt &primitiveNumber, const UInt &indexInPrimitive)
756	{
757		If(indexInPrimitive < state.verticesPerPrimitive)
758		{
759			UInt tOffset = primitiveNumber * state.verticesPerPrimitive + indexInPrimitive;
760
761			for(int i = 0; i < MAX_TRANSFORM_FEEDBACK_INTERLEAVED_COMPONENTS; i++)
762			{
763				if(state.transformFeedbackEnabled & (1ULL << i))
764				{
765					UInt reg = *Pointer<UInt>(data + OFFSET(DrawData, vs.reg[i]));
766					UInt row = *Pointer<UInt>(data + OFFSET(DrawData, vs.row[i]));
767					UInt col = *Pointer<UInt>(data + OFFSET(DrawData, vs.col[i]));
768					UInt str = *Pointer<UInt>(data + OFFSET(DrawData, vs.str[i]));
769
770					Pointer<Byte> t = *Pointer<Pointer<Byte>>(data + OFFSET(DrawData, vs.t[i])) + (tOffset * str * sizeof(float));
771					Pointer<Byte> v = vertex + OFFSET(Vertex, v) + reg * sizeof(float);
772
773					For(UInt r = 0, r < row, r++)
774					{
775						UInt rOffsetX = r * col * sizeof(float);
776						UInt rOffset4 = r * sizeof(float4);
777
778						For(UInt c = 0, c < col, c++)
779						{
780							UInt cOffset = c * sizeof(float);
781							*Pointer<Float>(t + rOffsetX + cOffset) = *Pointer<Float>(v + rOffset4 + cOffset);
782						}
783					}
784				}
785			}
786		}
787	}
788}
789