tgsi_exec.c revision c0d941877b410b2402ed853d1d33b0664a3d1445
1/**************************************************************************
2 *
3 * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas.
4 * All Rights Reserved.
5 * Copyright 2009-2010 VMware, Inc.  All rights Reserved.
6 *
7 * Permission is hereby granted, free of charge, to any person obtaining a
8 * copy of this software and associated documentation files (the
9 * "Software"), to deal in the Software without restriction, including
10 * without limitation the rights to use, copy, modify, merge, publish,
11 * distribute, sub license, and/or sell copies of the Software, and to
12 * permit persons to whom the Software is furnished to do so, subject to
13 * the following conditions:
14 *
15 * The above copyright notice and this permission notice (including the
16 * next paragraph) shall be included in all copies or substantial portions
17 * of the Software.
18 *
19 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
20 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
21 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
22 * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
23 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
24 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
25 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 *
27 **************************************************************************/
28
29/**
30 * TGSI interpreter/executor.
31 *
32 * Flow control information:
33 *
34 * Since we operate on 'quads' (4 pixels or 4 vertices in parallel)
35 * flow control statements (IF/ELSE/ENDIF, LOOP/ENDLOOP) require special
36 * care since a condition may be true for some quad components but false
37 * for other components.
38 *
39 * We basically execute all statements (even if they're in the part of
40 * an IF/ELSE clause that's "not taken") and use a special mask to
41 * control writing to destination registers.  This is the ExecMask.
42 * See store_dest().
43 *
 * The ExecMask is computed from five other masks (CondMask, LoopMask,
 * ContMask, Switch.mask and FuncMask; see UPDATE_EXEC_MASK), which are
 * controlled by the flow control instructions (such as IF/ELSE/ENDIF,
 * LOOP/ENDLOOP and CONT).
47 *
48 *
49 * Authors:
50 *   Michal Krol
51 *   Brian Paul
52 */
53
54#include "pipe/p_compiler.h"
55#include "pipe/p_state.h"
56#include "pipe/p_shader_tokens.h"
57#include "tgsi/tgsi_dump.h"
58#include "tgsi/tgsi_parse.h"
59#include "tgsi/tgsi_util.h"
60#include "tgsi_exec.h"
61#include "util/u_memory.h"
62#include "util/u_math.h"
63
64
/*
 * When non-zero, micro_exp2(), micro_lg2() and micro_pow() use the
 * util_fast_*() helpers instead of the libm functions.
 */
#define FAST_MATH 1

/*
 * Indices of the four elements of a channel as used by micro_ddx() and
 * micro_ddy() when taking differences between neighbouring elements.
 */
#define TILE_TOP_LEFT     0
#define TILE_TOP_RIGHT    1
#define TILE_BOTTOM_LEFT  2
#define TILE_BOTTOM_RIGHT 3
71
72static void
73micro_abs(union tgsi_exec_channel *dst,
74          const union tgsi_exec_channel *src)
75{
76   dst->f[0] = fabsf(src->f[0]);
77   dst->f[1] = fabsf(src->f[1]);
78   dst->f[2] = fabsf(src->f[2]);
79   dst->f[3] = fabsf(src->f[3]);
80}
81
82static void
83micro_arl(union tgsi_exec_channel *dst,
84          const union tgsi_exec_channel *src)
85{
86   dst->i[0] = (int)floorf(src->f[0]);
87   dst->i[1] = (int)floorf(src->f[1]);
88   dst->i[2] = (int)floorf(src->f[2]);
89   dst->i[3] = (int)floorf(src->f[3]);
90}
91
92static void
93micro_arr(union tgsi_exec_channel *dst,
94          const union tgsi_exec_channel *src)
95{
96   dst->i[0] = (int)floorf(src->f[0] + 0.5f);
97   dst->i[1] = (int)floorf(src->f[1] + 0.5f);
98   dst->i[2] = (int)floorf(src->f[2] + 0.5f);
99   dst->i[3] = (int)floorf(src->f[3] + 0.5f);
100}
101
102static void
103micro_ceil(union tgsi_exec_channel *dst,
104           const union tgsi_exec_channel *src)
105{
106   dst->f[0] = ceilf(src->f[0]);
107   dst->f[1] = ceilf(src->f[1]);
108   dst->f[2] = ceilf(src->f[2]);
109   dst->f[3] = ceilf(src->f[3]);
110}
111
112static void
113micro_clamp(union tgsi_exec_channel *dst,
114            const union tgsi_exec_channel *src0,
115            const union tgsi_exec_channel *src1,
116            const union tgsi_exec_channel *src2)
117{
118   dst->f[0] = src0->f[0] < src1->f[0] ? src1->f[0] : src0->f[0] > src2->f[0] ? src2->f[0] : src0->f[0];
119   dst->f[1] = src0->f[1] < src1->f[1] ? src1->f[1] : src0->f[1] > src2->f[1] ? src2->f[1] : src0->f[1];
120   dst->f[2] = src0->f[2] < src1->f[2] ? src1->f[2] : src0->f[2] > src2->f[2] ? src2->f[2] : src0->f[2];
121   dst->f[3] = src0->f[3] < src1->f[3] ? src1->f[3] : src0->f[3] > src2->f[3] ? src2->f[3] : src0->f[3];
122}
123
124static void
125micro_cmp(union tgsi_exec_channel *dst,
126          const union tgsi_exec_channel *src0,
127          const union tgsi_exec_channel *src1,
128          const union tgsi_exec_channel *src2)
129{
130   dst->f[0] = src0->f[0] < 0.0f ? src1->f[0] : src2->f[0];
131   dst->f[1] = src0->f[1] < 0.0f ? src1->f[1] : src2->f[1];
132   dst->f[2] = src0->f[2] < 0.0f ? src1->f[2] : src2->f[2];
133   dst->f[3] = src0->f[3] < 0.0f ? src1->f[3] : src2->f[3];
134}
135
136static void
137micro_cnd(union tgsi_exec_channel *dst,
138          const union tgsi_exec_channel *src0,
139          const union tgsi_exec_channel *src1,
140          const union tgsi_exec_channel *src2)
141{
142   dst->f[0] = src2->f[0] > 0.5f ? src0->f[0] : src1->f[0];
143   dst->f[1] = src2->f[1] > 0.5f ? src0->f[1] : src1->f[1];
144   dst->f[2] = src2->f[2] > 0.5f ? src0->f[2] : src1->f[2];
145   dst->f[3] = src2->f[3] > 0.5f ? src0->f[3] : src1->f[3];
146}
147
148static void
149micro_cos(union tgsi_exec_channel *dst,
150          const union tgsi_exec_channel *src)
151{
152   dst->f[0] = cosf(src->f[0]);
153   dst->f[1] = cosf(src->f[1]);
154   dst->f[2] = cosf(src->f[2]);
155   dst->f[3] = cosf(src->f[3]);
156}
157
158static void
159micro_ddx(union tgsi_exec_channel *dst,
160          const union tgsi_exec_channel *src)
161{
162   dst->f[0] =
163   dst->f[1] =
164   dst->f[2] =
165   dst->f[3] = src->f[TILE_BOTTOM_RIGHT] - src->f[TILE_BOTTOM_LEFT];
166}
167
168static void
169micro_ddy(union tgsi_exec_channel *dst,
170          const union tgsi_exec_channel *src)
171{
172   dst->f[0] =
173   dst->f[1] =
174   dst->f[2] =
175   dst->f[3] = src->f[TILE_BOTTOM_LEFT] - src->f[TILE_TOP_LEFT];
176}
177
178static void
179micro_exp2(union tgsi_exec_channel *dst,
180           const union tgsi_exec_channel *src)
181{
182#if FAST_MATH
183   dst->f[0] = util_fast_exp2(src->f[0]);
184   dst->f[1] = util_fast_exp2(src->f[1]);
185   dst->f[2] = util_fast_exp2(src->f[2]);
186   dst->f[3] = util_fast_exp2(src->f[3]);
187#else
188#if DEBUG
189   /* Inf is okay for this instruction, so clamp it to silence assertions. */
190   uint i;
191   union tgsi_exec_channel clamped;
192
193   for (i = 0; i < 4; i++) {
194      if (src->f[i] > 127.99999f) {
195         clamped.f[i] = 127.99999f;
196      } else if (src->f[i] < -126.99999f) {
197         clamped.f[i] = -126.99999f;
198      } else {
199         clamped.f[i] = src->f[i];
200      }
201   }
202   src = &clamped;
203#endif /* DEBUG */
204
205   dst->f[0] = powf(2.0f, src->f[0]);
206   dst->f[1] = powf(2.0f, src->f[1]);
207   dst->f[2] = powf(2.0f, src->f[2]);
208   dst->f[3] = powf(2.0f, src->f[3]);
209#endif /* FAST_MATH */
210}
211
212static void
213micro_flr(union tgsi_exec_channel *dst,
214          const union tgsi_exec_channel *src)
215{
216   dst->f[0] = floorf(src->f[0]);
217   dst->f[1] = floorf(src->f[1]);
218   dst->f[2] = floorf(src->f[2]);
219   dst->f[3] = floorf(src->f[3]);
220}
221
222static void
223micro_frc(union tgsi_exec_channel *dst,
224          const union tgsi_exec_channel *src)
225{
226   dst->f[0] = src->f[0] - floorf(src->f[0]);
227   dst->f[1] = src->f[1] - floorf(src->f[1]);
228   dst->f[2] = src->f[2] - floorf(src->f[2]);
229   dst->f[3] = src->f[3] - floorf(src->f[3]);
230}
231
232static void
233micro_iabs(union tgsi_exec_channel *dst,
234           const union tgsi_exec_channel *src)
235{
236   dst->i[0] = src->i[0] >= 0 ? src->i[0] : -src->i[0];
237   dst->i[1] = src->i[1] >= 0 ? src->i[1] : -src->i[1];
238   dst->i[2] = src->i[2] >= 0 ? src->i[2] : -src->i[2];
239   dst->i[3] = src->i[3] >= 0 ? src->i[3] : -src->i[3];
240}
241
242static void
243micro_ineg(union tgsi_exec_channel *dst,
244           const union tgsi_exec_channel *src)
245{
246   dst->i[0] = -src->i[0];
247   dst->i[1] = -src->i[1];
248   dst->i[2] = -src->i[2];
249   dst->i[3] = -src->i[3];
250}
251
252static void
253micro_lg2(union tgsi_exec_channel *dst,
254          const union tgsi_exec_channel *src)
255{
256#if FAST_MATH
257   dst->f[0] = util_fast_log2(src->f[0]);
258   dst->f[1] = util_fast_log2(src->f[1]);
259   dst->f[2] = util_fast_log2(src->f[2]);
260   dst->f[3] = util_fast_log2(src->f[3]);
261#else
262   dst->f[0] = logf(src->f[0]) * 1.442695f;
263   dst->f[1] = logf(src->f[1]) * 1.442695f;
264   dst->f[2] = logf(src->f[2]) * 1.442695f;
265   dst->f[3] = logf(src->f[3]) * 1.442695f;
266#endif
267}
268
269static void
270micro_lrp(union tgsi_exec_channel *dst,
271          const union tgsi_exec_channel *src0,
272          const union tgsi_exec_channel *src1,
273          const union tgsi_exec_channel *src2)
274{
275   dst->f[0] = src0->f[0] * (src1->f[0] - src2->f[0]) + src2->f[0];
276   dst->f[1] = src0->f[1] * (src1->f[1] - src2->f[1]) + src2->f[1];
277   dst->f[2] = src0->f[2] * (src1->f[2] - src2->f[2]) + src2->f[2];
278   dst->f[3] = src0->f[3] * (src1->f[3] - src2->f[3]) + src2->f[3];
279}
280
281static void
282micro_mad(union tgsi_exec_channel *dst,
283          const union tgsi_exec_channel *src0,
284          const union tgsi_exec_channel *src1,
285          const union tgsi_exec_channel *src2)
286{
287   dst->f[0] = src0->f[0] * src1->f[0] + src2->f[0];
288   dst->f[1] = src0->f[1] * src1->f[1] + src2->f[1];
289   dst->f[2] = src0->f[2] * src1->f[2] + src2->f[2];
290   dst->f[3] = src0->f[3] * src1->f[3] + src2->f[3];
291}
292
293static void
294micro_mov(union tgsi_exec_channel *dst,
295          const union tgsi_exec_channel *src)
296{
297   dst->u[0] = src->u[0];
298   dst->u[1] = src->u[1];
299   dst->u[2] = src->u[2];
300   dst->u[3] = src->u[3];
301}
302
303static void
304micro_rcp(union tgsi_exec_channel *dst,
305          const union tgsi_exec_channel *src)
306{
307#if 0 /* for debugging */
308   assert(src->f[0] != 0.0f);
309   assert(src->f[1] != 0.0f);
310   assert(src->f[2] != 0.0f);
311   assert(src->f[3] != 0.0f);
312#endif
313   dst->f[0] = 1.0f / src->f[0];
314   dst->f[1] = 1.0f / src->f[1];
315   dst->f[2] = 1.0f / src->f[2];
316   dst->f[3] = 1.0f / src->f[3];
317}
318
319static void
320micro_rnd(union tgsi_exec_channel *dst,
321          const union tgsi_exec_channel *src)
322{
323   dst->f[0] = floorf(src->f[0] + 0.5f);
324   dst->f[1] = floorf(src->f[1] + 0.5f);
325   dst->f[2] = floorf(src->f[2] + 0.5f);
326   dst->f[3] = floorf(src->f[3] + 0.5f);
327}
328
329static void
330micro_rsq(union tgsi_exec_channel *dst,
331          const union tgsi_exec_channel *src)
332{
333#if 0 /* for debugging */
334   assert(src->f[0] != 0.0f);
335   assert(src->f[1] != 0.0f);
336   assert(src->f[2] != 0.0f);
337   assert(src->f[3] != 0.0f);
338#endif
339   dst->f[0] = 1.0f / sqrtf(fabsf(src->f[0]));
340   dst->f[1] = 1.0f / sqrtf(fabsf(src->f[1]));
341   dst->f[2] = 1.0f / sqrtf(fabsf(src->f[2]));
342   dst->f[3] = 1.0f / sqrtf(fabsf(src->f[3]));
343}
344
345static void
346micro_seq(union tgsi_exec_channel *dst,
347          const union tgsi_exec_channel *src0,
348          const union tgsi_exec_channel *src1)
349{
350   dst->f[0] = src0->f[0] == src1->f[0] ? 1.0f : 0.0f;
351   dst->f[1] = src0->f[1] == src1->f[1] ? 1.0f : 0.0f;
352   dst->f[2] = src0->f[2] == src1->f[2] ? 1.0f : 0.0f;
353   dst->f[3] = src0->f[3] == src1->f[3] ? 1.0f : 0.0f;
354}
355
356static void
357micro_sge(union tgsi_exec_channel *dst,
358          const union tgsi_exec_channel *src0,
359          const union tgsi_exec_channel *src1)
360{
361   dst->f[0] = src0->f[0] >= src1->f[0] ? 1.0f : 0.0f;
362   dst->f[1] = src0->f[1] >= src1->f[1] ? 1.0f : 0.0f;
363   dst->f[2] = src0->f[2] >= src1->f[2] ? 1.0f : 0.0f;
364   dst->f[3] = src0->f[3] >= src1->f[3] ? 1.0f : 0.0f;
365}
366
367static void
368micro_sgn(union tgsi_exec_channel *dst,
369          const union tgsi_exec_channel *src)
370{
371   dst->f[0] = src->f[0] < 0.0f ? -1.0f : src->f[0] > 0.0f ? 1.0f : 0.0f;
372   dst->f[1] = src->f[1] < 0.0f ? -1.0f : src->f[1] > 0.0f ? 1.0f : 0.0f;
373   dst->f[2] = src->f[2] < 0.0f ? -1.0f : src->f[2] > 0.0f ? 1.0f : 0.0f;
374   dst->f[3] = src->f[3] < 0.0f ? -1.0f : src->f[3] > 0.0f ? 1.0f : 0.0f;
375}
376
377static void
378micro_sgt(union tgsi_exec_channel *dst,
379          const union tgsi_exec_channel *src0,
380          const union tgsi_exec_channel *src1)
381{
382   dst->f[0] = src0->f[0] > src1->f[0] ? 1.0f : 0.0f;
383   dst->f[1] = src0->f[1] > src1->f[1] ? 1.0f : 0.0f;
384   dst->f[2] = src0->f[2] > src1->f[2] ? 1.0f : 0.0f;
385   dst->f[3] = src0->f[3] > src1->f[3] ? 1.0f : 0.0f;
386}
387
388static void
389micro_sin(union tgsi_exec_channel *dst,
390          const union tgsi_exec_channel *src)
391{
392   dst->f[0] = sinf(src->f[0]);
393   dst->f[1] = sinf(src->f[1]);
394   dst->f[2] = sinf(src->f[2]);
395   dst->f[3] = sinf(src->f[3]);
396}
397
398static void
399micro_sle(union tgsi_exec_channel *dst,
400          const union tgsi_exec_channel *src0,
401          const union tgsi_exec_channel *src1)
402{
403   dst->f[0] = src0->f[0] <= src1->f[0] ? 1.0f : 0.0f;
404   dst->f[1] = src0->f[1] <= src1->f[1] ? 1.0f : 0.0f;
405   dst->f[2] = src0->f[2] <= src1->f[2] ? 1.0f : 0.0f;
406   dst->f[3] = src0->f[3] <= src1->f[3] ? 1.0f : 0.0f;
407}
408
409static void
410micro_slt(union tgsi_exec_channel *dst,
411          const union tgsi_exec_channel *src0,
412          const union tgsi_exec_channel *src1)
413{
414   dst->f[0] = src0->f[0] < src1->f[0] ? 1.0f : 0.0f;
415   dst->f[1] = src0->f[1] < src1->f[1] ? 1.0f : 0.0f;
416   dst->f[2] = src0->f[2] < src1->f[2] ? 1.0f : 0.0f;
417   dst->f[3] = src0->f[3] < src1->f[3] ? 1.0f : 0.0f;
418}
419
420static void
421micro_sne(union tgsi_exec_channel *dst,
422          const union tgsi_exec_channel *src0,
423          const union tgsi_exec_channel *src1)
424{
425   dst->f[0] = src0->f[0] != src1->f[0] ? 1.0f : 0.0f;
426   dst->f[1] = src0->f[1] != src1->f[1] ? 1.0f : 0.0f;
427   dst->f[2] = src0->f[2] != src1->f[2] ? 1.0f : 0.0f;
428   dst->f[3] = src0->f[3] != src1->f[3] ? 1.0f : 0.0f;
429}
430
431static void
432micro_sfl(union tgsi_exec_channel *dst)
433{
434   dst->f[0] = 0.0f;
435   dst->f[1] = 0.0f;
436   dst->f[2] = 0.0f;
437   dst->f[3] = 0.0f;
438}
439
440static void
441micro_str(union tgsi_exec_channel *dst)
442{
443   dst->f[0] = 1.0f;
444   dst->f[1] = 1.0f;
445   dst->f[2] = 1.0f;
446   dst->f[3] = 1.0f;
447}
448
449static void
450micro_trunc(union tgsi_exec_channel *dst,
451            const union tgsi_exec_channel *src)
452{
453   dst->f[0] = (float)(int)src->f[0];
454   dst->f[1] = (float)(int)src->f[1];
455   dst->f[2] = (float)(int)src->f[2];
456   dst->f[3] = (float)(int)src->f[3];
457}
458
459
/*
 * Channel numbers, in X, Y, Z, W order (the same order print_temp() uses
 * when labelling the xyzw[] entries of a register).
 */
#define CHAN_X  0
#define CHAN_Y  1
#define CHAN_Z  2
#define CHAN_W  3
464
/*
 * Data type tags: float, signed integer and unsigned integer.
 * NOTE(review): the users of this enum are outside the visible part of the
 * file; presumably it selects how a channel's bits are interpreted --
 * confirm against the callers.
 */
enum tgsi_exec_datatype {
   TGSI_EXEC_DATA_FLOAT,
   TGSI_EXEC_DATA_INT,
   TGSI_EXEC_DATA_UINT
};
470
/*
 * Shorthand locations of various utility registers (_I = Index, _C = Channel).
 * Each name is a local alias for the TGSI_EXEC_TEMP_* constant of the same
 * suffix, which is defined outside this file.
 */
#define TEMP_KILMASK_I     TGSI_EXEC_TEMP_KILMASK_I
#define TEMP_KILMASK_C     TGSI_EXEC_TEMP_KILMASK_C
#define TEMP_OUTPUT_I      TGSI_EXEC_TEMP_OUTPUT_I
#define TEMP_OUTPUT_C      TGSI_EXEC_TEMP_OUTPUT_C
#define TEMP_PRIMITIVE_I   TGSI_EXEC_TEMP_PRIMITIVE_I
#define TEMP_PRIMITIVE_C   TGSI_EXEC_TEMP_PRIMITIVE_C
480
481
/**
 * Recompute the execution mask of a machine.
 * ExecMask is the bitwise AND of five masks: CondMask, LoopMask, ContMask,
 * Switch.mask and FuncMask.
 *
 * MACH is a pointer expression; it is parenthesized in the expansion so
 * that arguments such as "&array[i]" or "base + n" work, but it is
 * evaluated several times and therefore must not have side effects.
 */
#define UPDATE_EXEC_MASK(MACH) \
      ((MACH)->ExecMask = (MACH)->CondMask & (MACH)->LoopMask & \
                          (MACH)->ContMask & (MACH)->Switch.mask & \
                          (MACH)->FuncMask)
485
486
/*
 * Constant channels whose four elements all hold the same value
 * (0, 1, 128 and -128 respectively).  The brace initializer sets the
 * union's first member -- presumably the float array `f`; confirm against
 * the declaration of union tgsi_exec_channel.
 */
static const union tgsi_exec_channel ZeroVec =
   { { 0.0, 0.0, 0.0, 0.0 } };

static const union tgsi_exec_channel OneVec = {
   {1.0f, 1.0f, 1.0f, 1.0f}
};

static const union tgsi_exec_channel P128Vec = {
   {128.0f, 128.0f, 128.0f, 128.0f}
};

static const union tgsi_exec_channel M128Vec = {
   {-128.0f, -128.0f, -128.0f, -128.0f}
};
501
502
503/**
504 * Assert that none of the float values in 'chan' are infinite or NaN.
505 * NaN and Inf may occur normally during program execution and should
506 * not lead to crashes, etc.  But when debugging, it's helpful to catch
507 * them.
508 */
509static INLINE void
510check_inf_or_nan(const union tgsi_exec_channel *chan)
511{
512   assert(!util_is_inf_or_nan((chan)->f[0]));
513   assert(!util_is_inf_or_nan((chan)->f[1]));
514   assert(!util_is_inf_or_nan((chan)->f[2]));
515   assert(!util_is_inf_or_nan((chan)->f[3]));
516}
517
518
#ifdef DEBUG
/** Debug helper: print 'msg' followed by the four float elements of 'chan'. */
static void
print_chan(const char *msg, const union tgsi_exec_channel *chan)
{
   const float *values = chan->f;

   debug_printf("%s = {%f, %f, %f, %f}\n",
                msg,
                values[0],
                values[1],
                values[2],
                values[3]);
}
#endif
527
528
#ifdef DEBUG
/**
 * Debug helper: print temporary register 'index' of 'mach', one line per
 * X/Y/Z/W channel with that channel's four float elements.
 */
static void
print_temp(const struct tgsi_exec_machine *mach, uint index)
{
   static const char channelNames[] = "XYZW";
   const struct tgsi_exec_vector *reg = &mach->Temps[index];
   int chan = 0;

   debug_printf("Temp[%u] =\n", index);
   while (chan < 4) {
      debug_printf("  %c: { %f, %f, %f, %f }\n",
                   channelNames[chan],
                   reg->xyzw[chan].f[0],
                   reg->xyzw[chan].f[1],
                   reg->xyzw[chan].f[2],
                   reg->xyzw[chan].f[3]);
      chan++;
   }
}
#endif
546
547
548void
549tgsi_exec_set_constant_buffers(struct tgsi_exec_machine *mach,
550                               unsigned num_bufs,
551                               const void **bufs,
552                               const unsigned *buf_sizes)
553{
554   unsigned i;
555
556   for (i = 0; i < num_bufs; i++) {
557      mach->Consts[i] = bufs[i];
558      mach->ConstsSize[i] = buf_sizes[i];
559   }
560}
561
562
563/**
564 * Check if there's a potential src/dst register data dependency when
565 * using SOA execution.
566 * Example:
567 *   MOV T, T.yxwz;
568 * This would expand into:
569 *   MOV t0, t1;
570 *   MOV t1, t0;
571 *   MOV t2, t3;
572 *   MOV t3, t2;
573 * The second instruction will have the wrong value for t0 if executed as-is.
574 */
575boolean
576tgsi_check_soa_dependencies(const struct tgsi_full_instruction *inst)
577{
578   uint i, chan;
579
580   uint writemask = inst->Dst[0].Register.WriteMask;
581   if (writemask == TGSI_WRITEMASK_X ||
582       writemask == TGSI_WRITEMASK_Y ||
583       writemask == TGSI_WRITEMASK_Z ||
584       writemask == TGSI_WRITEMASK_W ||
585       writemask == TGSI_WRITEMASK_NONE) {
586      /* no chance of data dependency */
587      return FALSE;
588   }
589
590   /* loop over src regs */
591   for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
592      if ((inst->Src[i].Register.File ==
593           inst->Dst[0].Register.File) &&
594          ((inst->Src[i].Register.Index ==
595            inst->Dst[0].Register.Index) ||
596           inst->Src[i].Register.Indirect ||
597           inst->Dst[0].Register.Indirect)) {
598         /* loop over dest channels */
599         uint channelsWritten = 0x0;
600         for (chan = 0; chan < NUM_CHANNELS; chan++) {
601            if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
602               /* check if we're reading a channel that's been written */
603               uint swizzle = tgsi_util_get_full_src_register_swizzle(&inst->Src[i], chan);
604               if (channelsWritten & (1 << swizzle)) {
605                  return TRUE;
606               }
607
608               channelsWritten |= (1 << chan);
609            }
610         }
611      }
612   }
613   return FALSE;
614}
615
616
617/**
618 * Initialize machine state by expanding tokens to full instructions,
619 * allocating temporary storage, setting up constants, etc.
620 * After this, we can call tgsi_exec_machine_run() many times.
621 */
622void
623tgsi_exec_machine_bind_shader(
624   struct tgsi_exec_machine *mach,
625   const struct tgsi_token *tokens,
626   uint numSamplers,
627   struct tgsi_sampler **samplers)
628{
629   uint k;
630   struct tgsi_parse_context parse;
631   struct tgsi_full_instruction *instructions;
632   struct tgsi_full_declaration *declarations;
633   uint maxInstructions = 10, numInstructions = 0;
634   uint maxDeclarations = 10, numDeclarations = 0;
635
636#if 0
637   tgsi_dump(tokens, 0);
638#endif
639
640   util_init_math();
641
642   if (numSamplers) {
643      assert(samplers);
644   }
645
646   mach->Tokens = tokens;
647   mach->Samplers = samplers;
648
649   if (!tokens) {
650      /* unbind and free all */
651      if (mach->Declarations) {
652         FREE( mach->Declarations );
653      }
654      mach->Declarations = NULL;
655      mach->NumDeclarations = 0;
656
657      if (mach->Instructions) {
658         FREE( mach->Instructions );
659      }
660      mach->Instructions = NULL;
661      mach->NumInstructions = 0;
662
663      return;
664   }
665
666   k = tgsi_parse_init (&parse, mach->Tokens);
667   if (k != TGSI_PARSE_OK) {
668      debug_printf( "Problem parsing!\n" );
669      return;
670   }
671
672   mach->Processor = parse.FullHeader.Processor.Processor;
673   mach->ImmLimit = 0;
674
675   declarations = (struct tgsi_full_declaration *)
676      MALLOC( maxDeclarations * sizeof(struct tgsi_full_declaration) );
677
678   if (!declarations) {
679      return;
680   }
681
682   instructions = (struct tgsi_full_instruction *)
683      MALLOC( maxInstructions * sizeof(struct tgsi_full_instruction) );
684
685   if (!instructions) {
686      FREE( declarations );
687      return;
688   }
689
690   while( !tgsi_parse_end_of_tokens( &parse ) ) {
691      uint i;
692
693      tgsi_parse_token( &parse );
694      switch( parse.FullToken.Token.Type ) {
695      case TGSI_TOKEN_TYPE_DECLARATION:
696         /* save expanded declaration */
697         if (numDeclarations == maxDeclarations) {
698            declarations = REALLOC(declarations,
699                                   maxDeclarations
700                                   * sizeof(struct tgsi_full_declaration),
701                                   (maxDeclarations + 10)
702                                   * sizeof(struct tgsi_full_declaration));
703            maxDeclarations += 10;
704         }
705         if (parse.FullToken.FullDeclaration.Declaration.File == TGSI_FILE_OUTPUT) {
706            unsigned reg;
707            for (reg = parse.FullToken.FullDeclaration.Range.First;
708                 reg <= parse.FullToken.FullDeclaration.Range.Last;
709                 ++reg) {
710               ++mach->NumOutputs;
711            }
712         }
713         if (parse.FullToken.FullDeclaration.Declaration.File ==
714             TGSI_FILE_IMMEDIATE_ARRAY) {
715            unsigned reg;
716            struct tgsi_full_declaration *decl =
717               &parse.FullToken.FullDeclaration;
718            debug_assert(decl->Range.Last < TGSI_EXEC_NUM_IMMEDIATES);
719            for (reg = decl->Range.First; reg <= decl->Range.Last; ++reg) {
720               for( i = 0; i < 4; i++ ) {
721                  int idx = reg * 4 + i;
722                  mach->ImmArray[reg][i] = decl->ImmediateData.u[idx].Float;
723               }
724            }
725         }
726         memcpy(declarations + numDeclarations,
727                &parse.FullToken.FullDeclaration,
728                sizeof(declarations[0]));
729         numDeclarations++;
730         break;
731
732      case TGSI_TOKEN_TYPE_IMMEDIATE:
733         {
734            uint size = parse.FullToken.FullImmediate.Immediate.NrTokens - 1;
735            assert( size <= 4 );
736            assert( mach->ImmLimit + 1 <= TGSI_EXEC_NUM_IMMEDIATES );
737
738            for( i = 0; i < size; i++ ) {
739               mach->Imms[mach->ImmLimit][i] =
740		  parse.FullToken.FullImmediate.u[i].Float;
741            }
742            mach->ImmLimit += 1;
743         }
744         break;
745
746      case TGSI_TOKEN_TYPE_INSTRUCTION:
747
748         /* save expanded instruction */
749         if (numInstructions == maxInstructions) {
750            instructions = REALLOC(instructions,
751                                   maxInstructions
752                                   * sizeof(struct tgsi_full_instruction),
753                                   (maxInstructions + 10)
754                                   * sizeof(struct tgsi_full_instruction));
755            maxInstructions += 10;
756         }
757
758         memcpy(instructions + numInstructions,
759                &parse.FullToken.FullInstruction,
760                sizeof(instructions[0]));
761
762         numInstructions++;
763         break;
764
765      case TGSI_TOKEN_TYPE_PROPERTY:
766         break;
767
768      default:
769         assert( 0 );
770      }
771   }
772   tgsi_parse_free (&parse);
773
774   if (mach->Declarations) {
775      FREE( mach->Declarations );
776   }
777   mach->Declarations = declarations;
778   mach->NumDeclarations = numDeclarations;
779
780   if (mach->Instructions) {
781      FREE( mach->Instructions );
782   }
783   mach->Instructions = instructions;
784   mach->NumInstructions = numInstructions;
785}
786
787
788struct tgsi_exec_machine *
789tgsi_exec_machine_create( void )
790{
791   struct tgsi_exec_machine *mach;
792   uint i;
793
794   mach = align_malloc( sizeof *mach, 16 );
795   if (!mach)
796      goto fail;
797
798   memset(mach, 0, sizeof(*mach));
799
800   mach->Addrs = &mach->Temps[TGSI_EXEC_TEMP_ADDR];
801   mach->MaxGeometryShaderOutputs = TGSI_MAX_TOTAL_VERTICES;
802   mach->Predicates = &mach->Temps[TGSI_EXEC_TEMP_P0];
803
804   /* Setup constants needed by the SSE2 executor. */
805   for( i = 0; i < 4; i++ ) {
806      mach->Temps[TGSI_EXEC_TEMP_00000000_I].xyzw[TGSI_EXEC_TEMP_00000000_C].u[i] = 0x00000000;
807      mach->Temps[TGSI_EXEC_TEMP_7FFFFFFF_I].xyzw[TGSI_EXEC_TEMP_7FFFFFFF_C].u[i] = 0x7FFFFFFF;
808      mach->Temps[TGSI_EXEC_TEMP_80000000_I].xyzw[TGSI_EXEC_TEMP_80000000_C].u[i] = 0x80000000;
809      mach->Temps[TGSI_EXEC_TEMP_FFFFFFFF_I].xyzw[TGSI_EXEC_TEMP_FFFFFFFF_C].u[i] = 0xFFFFFFFF;    /* not used */
810      mach->Temps[TGSI_EXEC_TEMP_ONE_I].xyzw[TGSI_EXEC_TEMP_ONE_C].f[i] = 1.0f;
811      mach->Temps[TGSI_EXEC_TEMP_TWO_I].xyzw[TGSI_EXEC_TEMP_TWO_C].f[i] = 2.0f;    /* not used */
812      mach->Temps[TGSI_EXEC_TEMP_128_I].xyzw[TGSI_EXEC_TEMP_128_C].f[i] = 128.0f;
813      mach->Temps[TGSI_EXEC_TEMP_MINUS_128_I].xyzw[TGSI_EXEC_TEMP_MINUS_128_C].f[i] = -128.0f;
814      mach->Temps[TGSI_EXEC_TEMP_THREE_I].xyzw[TGSI_EXEC_TEMP_THREE_C].f[i] = 3.0f;
815      mach->Temps[TGSI_EXEC_TEMP_HALF_I].xyzw[TGSI_EXEC_TEMP_HALF_C].f[i] = 0.5f;
816   }
817
818#ifdef DEBUG
819   /* silence warnings */
820   (void) print_chan;
821   (void) print_temp;
822#endif
823
824   return mach;
825
826fail:
827   align_free(mach);
828   return NULL;
829}
830
831
832void
833tgsi_exec_machine_destroy(struct tgsi_exec_machine *mach)
834{
835   if (mach) {
836      if (mach->Instructions)
837         FREE(mach->Instructions);
838      if (mach->Declarations)
839      FREE(mach->Declarations);
840   }
841
842   align_free(mach);
843}
844
845static void
846micro_add(union tgsi_exec_channel *dst,
847          const union tgsi_exec_channel *src0,
848          const union tgsi_exec_channel *src1)
849{
850   dst->f[0] = src0->f[0] + src1->f[0];
851   dst->f[1] = src0->f[1] + src1->f[1];
852   dst->f[2] = src0->f[2] + src1->f[2];
853   dst->f[3] = src0->f[3] + src1->f[3];
854}
855
856static void
857micro_div(
858   union tgsi_exec_channel *dst,
859   const union tgsi_exec_channel *src0,
860   const union tgsi_exec_channel *src1 )
861{
862   if (src1->f[0] != 0) {
863      dst->f[0] = src0->f[0] / src1->f[0];
864   }
865   if (src1->f[1] != 0) {
866      dst->f[1] = src0->f[1] / src1->f[1];
867   }
868   if (src1->f[2] != 0) {
869      dst->f[2] = src0->f[2] / src1->f[2];
870   }
871   if (src1->f[3] != 0) {
872      dst->f[3] = src0->f[3] / src1->f[3];
873   }
874}
875
876static void
877micro_rcc(union tgsi_exec_channel *dst,
878          const union tgsi_exec_channel *src)
879{
880   uint i;
881
882   for (i = 0; i < 4; i++) {
883      float recip = 1.0f / src->f[i];
884
885      if (recip > 0.0f) {
886         if (recip > 1.884467e+019f) {
887            dst->f[i] = 1.884467e+019f;
888         }
889         else if (recip < 5.42101e-020f) {
890            dst->f[i] = 5.42101e-020f;
891         }
892         else {
893            dst->f[i] = recip;
894         }
895      }
896      else {
897         if (recip < -1.884467e+019f) {
898            dst->f[i] = -1.884467e+019f;
899         }
900         else if (recip > -5.42101e-020f) {
901            dst->f[i] = -5.42101e-020f;
902         }
903         else {
904            dst->f[i] = recip;
905         }
906      }
907   }
908}
909
910static void
911micro_lt(
912   union tgsi_exec_channel *dst,
913   const union tgsi_exec_channel *src0,
914   const union tgsi_exec_channel *src1,
915   const union tgsi_exec_channel *src2,
916   const union tgsi_exec_channel *src3 )
917{
918   dst->f[0] = src0->f[0] < src1->f[0] ? src2->f[0] : src3->f[0];
919   dst->f[1] = src0->f[1] < src1->f[1] ? src2->f[1] : src3->f[1];
920   dst->f[2] = src0->f[2] < src1->f[2] ? src2->f[2] : src3->f[2];
921   dst->f[3] = src0->f[3] < src1->f[3] ? src2->f[3] : src3->f[3];
922}
923
924static void
925micro_max(union tgsi_exec_channel *dst,
926          const union tgsi_exec_channel *src0,
927          const union tgsi_exec_channel *src1)
928{
929   dst->f[0] = src0->f[0] > src1->f[0] ? src0->f[0] : src1->f[0];
930   dst->f[1] = src0->f[1] > src1->f[1] ? src0->f[1] : src1->f[1];
931   dst->f[2] = src0->f[2] > src1->f[2] ? src0->f[2] : src1->f[2];
932   dst->f[3] = src0->f[3] > src1->f[3] ? src0->f[3] : src1->f[3];
933}
934
935static void
936micro_min(union tgsi_exec_channel *dst,
937          const union tgsi_exec_channel *src0,
938          const union tgsi_exec_channel *src1)
939{
940   dst->f[0] = src0->f[0] < src1->f[0] ? src0->f[0] : src1->f[0];
941   dst->f[1] = src0->f[1] < src1->f[1] ? src0->f[1] : src1->f[1];
942   dst->f[2] = src0->f[2] < src1->f[2] ? src0->f[2] : src1->f[2];
943   dst->f[3] = src0->f[3] < src1->f[3] ? src0->f[3] : src1->f[3];
944}
945
946static void
947micro_mul(union tgsi_exec_channel *dst,
948          const union tgsi_exec_channel *src0,
949          const union tgsi_exec_channel *src1)
950{
951   dst->f[0] = src0->f[0] * src1->f[0];
952   dst->f[1] = src0->f[1] * src1->f[1];
953   dst->f[2] = src0->f[2] * src1->f[2];
954   dst->f[3] = src0->f[3] * src1->f[3];
955}
956
957static void
958micro_neg(
959   union tgsi_exec_channel *dst,
960   const union tgsi_exec_channel *src )
961{
962   dst->f[0] = -src->f[0];
963   dst->f[1] = -src->f[1];
964   dst->f[2] = -src->f[2];
965   dst->f[3] = -src->f[3];
966}
967
968static void
969micro_pow(
970   union tgsi_exec_channel *dst,
971   const union tgsi_exec_channel *src0,
972   const union tgsi_exec_channel *src1 )
973{
974#if FAST_MATH
975   dst->f[0] = util_fast_pow( src0->f[0], src1->f[0] );
976   dst->f[1] = util_fast_pow( src0->f[1], src1->f[1] );
977   dst->f[2] = util_fast_pow( src0->f[2], src1->f[2] );
978   dst->f[3] = util_fast_pow( src0->f[3], src1->f[3] );
979#else
980   dst->f[0] = powf( src0->f[0], src1->f[0] );
981   dst->f[1] = powf( src0->f[1], src1->f[1] );
982   dst->f[2] = powf( src0->f[2], src1->f[2] );
983   dst->f[3] = powf( src0->f[3], src1->f[3] );
984#endif
985}
986
987static void
988micro_sub(union tgsi_exec_channel *dst,
989          const union tgsi_exec_channel *src0,
990          const union tgsi_exec_channel *src1)
991{
992   dst->f[0] = src0->f[0] - src1->f[0];
993   dst->f[1] = src0->f[1] - src1->f[1];
994   dst->f[2] = src0->f[2] - src1->f[2];
995   dst->f[3] = src0->f[3] - src1->f[3];
996}
997
998static void
999fetch_src_file_channel(const struct tgsi_exec_machine *mach,
1000                       const uint file,
1001                       const uint swizzle,
1002                       const union tgsi_exec_channel *index,
1003                       const union tgsi_exec_channel *index2D,
1004                       union tgsi_exec_channel *chan)
1005{
1006   uint i;
1007
1008   assert(swizzle < 4);
1009
1010   switch (file) {
1011   case TGSI_FILE_CONSTANT:
1012      for (i = 0; i < QUAD_SIZE; i++) {
1013         assert(index2D->i[i] >= 0 && index2D->i[i] < PIPE_MAX_CONSTANT_BUFFERS);
1014         assert(mach->Consts[index2D->i[i]]);
1015
1016         if (index->i[i] < 0) {
1017            chan->u[i] = 0;
1018         } else {
1019            /* NOTE: copying the const value as a uint instead of float */
1020            const uint constbuf = index2D->i[i];
1021            const uint *buf = (const uint *)mach->Consts[constbuf];
1022            const int pos = index->i[i] * 4 + swizzle;
1023            /* const buffer bounds check */
1024            if (pos < 0 || pos >= mach->ConstsSize[constbuf]) {
1025               if (0) {
1026                  /* Debug: print warning */
1027                  static int count = 0;
1028                  if (count++ < 100)
1029                     debug_printf("TGSI Exec: const buffer index %d"
1030                                  " out of bounds\n", pos);
1031               }
1032               chan->u[i] = 0;
1033            }
1034            else
1035               chan->u[i] = buf[pos];
1036         }
1037      }
1038      break;
1039
1040   case TGSI_FILE_INPUT:
1041      for (i = 0; i < QUAD_SIZE; i++) {
1042         /*
1043         if (TGSI_PROCESSOR_GEOMETRY == mach->Processor) {
1044            debug_printf("Fetching Input[%d] (2d=%d, 1d=%d)\n",
1045                         index2D->i[i] * TGSI_EXEC_MAX_INPUT_ATTRIBS + index->i[i],
1046                         index2D->i[i], index->i[i]);
1047                         }*/
1048         int pos = index2D->i[i] * TGSI_EXEC_MAX_INPUT_ATTRIBS + index->i[i];
1049         assert(pos >= 0);
1050         assert(pos < Elements(mach->Inputs));
1051         chan->u[i] = mach->Inputs[pos].xyzw[swizzle].u[i];
1052      }
1053      break;
1054
1055   case TGSI_FILE_SYSTEM_VALUE:
1056      /* XXX no swizzling at this point.  Will be needed if we put
1057       * gl_FragCoord, for example, in a sys value register.
1058       */
1059      for (i = 0; i < QUAD_SIZE; i++) {
1060         chan->f[i] = mach->SystemValue[index->i[i]][0];
1061      }
1062      break;
1063
1064   case TGSI_FILE_TEMPORARY:
1065      for (i = 0; i < QUAD_SIZE; i++) {
1066         assert(index->i[i] < TGSI_EXEC_NUM_TEMPS);
1067         assert(index2D->i[i] == 0);
1068
1069         chan->u[i] = mach->Temps[index->i[i]].xyzw[swizzle].u[i];
1070      }
1071      break;
1072
1073   case TGSI_FILE_TEMPORARY_ARRAY:
1074      for (i = 0; i < QUAD_SIZE; i++) {
1075         assert(index->i[i] < TGSI_EXEC_NUM_TEMPS);
1076         assert(index2D->i[i] < TGSI_EXEC_NUM_TEMP_ARRAYS);
1077
1078         chan->u[i] =
1079            mach->TempArray[index2D->i[i]][index->i[i]].xyzw[swizzle].u[i];
1080      }
1081      break;
1082
1083   case TGSI_FILE_IMMEDIATE:
1084      for (i = 0; i < QUAD_SIZE; i++) {
1085         assert(index->i[i] >= 0 && index->i[i] < (int)mach->ImmLimit);
1086         assert(index2D->i[i] == 0);
1087
1088         chan->f[i] = mach->Imms[index->i[i]][swizzle];
1089      }
1090      break;
1091
1092   case TGSI_FILE_IMMEDIATE_ARRAY:
1093      for (i = 0; i < QUAD_SIZE; i++) {
1094         assert(index2D->i[i] == 0);
1095
1096         chan->f[i] = mach->ImmArray[index->i[i]][swizzle];
1097      }
1098      break;
1099
1100   case TGSI_FILE_ADDRESS:
1101      for (i = 0; i < QUAD_SIZE; i++) {
1102         assert(index->i[i] >= 0);
1103         assert(index2D->i[i] == 0);
1104
1105         chan->u[i] = mach->Addrs[index->i[i]].xyzw[swizzle].u[i];
1106      }
1107      break;
1108
1109   case TGSI_FILE_PREDICATE:
1110      for (i = 0; i < QUAD_SIZE; i++) {
1111         assert(index->i[i] >= 0 && index->i[i] < TGSI_EXEC_NUM_PREDS);
1112         assert(index2D->i[i] == 0);
1113
1114         chan->u[i] = mach->Predicates[0].xyzw[swizzle].u[i];
1115      }
1116      break;
1117
1118   case TGSI_FILE_OUTPUT:
1119      /* vertex/fragment output vars can be read too */
1120      for (i = 0; i < QUAD_SIZE; i++) {
1121         assert(index->i[i] >= 0);
1122         assert(index2D->i[i] == 0);
1123
1124         chan->u[i] = mach->Outputs[index->i[i]].xyzw[swizzle].u[i];
1125      }
1126      break;
1127
1128   default:
1129      assert(0);
1130      for (i = 0; i < QUAD_SIZE; i++) {
1131         chan->u[i] = 0;
1132      }
1133   }
1134}
1135
1136static void
1137fetch_source(const struct tgsi_exec_machine *mach,
1138             union tgsi_exec_channel *chan,
1139             const struct tgsi_full_src_register *reg,
1140             const uint chan_index,
1141             enum tgsi_exec_datatype src_datatype)
1142{
1143   union tgsi_exec_channel index;
1144   union tgsi_exec_channel index2D;
1145   uint swizzle;
1146
1147   /* We start with a direct index into a register file.
1148    *
1149    *    file[1],
1150    *    where:
1151    *       file = Register.File
1152    *       [1] = Register.Index
1153    */
1154   index.i[0] =
1155   index.i[1] =
1156   index.i[2] =
1157   index.i[3] = reg->Register.Index;
1158
1159   /* There is an extra source register that indirectly subscripts
1160    * a register file. The direct index now becomes an offset
1161    * that is being added to the indirect register.
1162    *
1163    *    file[ind[2].x+1],
1164    *    where:
1165    *       ind = Indirect.File
1166    *       [2] = Indirect.Index
1167    *       .x = Indirect.SwizzleX
1168    */
1169   if (reg->Register.Indirect) {
1170      union tgsi_exec_channel index2;
1171      union tgsi_exec_channel indir_index;
1172      const uint execmask = mach->ExecMask;
1173      uint i;
1174
1175      /* which address register (always zero now) */
1176      index2.i[0] =
1177      index2.i[1] =
1178      index2.i[2] =
1179      index2.i[3] = reg->Indirect.Index;
1180      assert(reg->Indirect.File == TGSI_FILE_ADDRESS);
1181      /* get current value of address register[swizzle] */
1182      swizzle = tgsi_util_get_src_register_swizzle( &reg->Indirect, CHAN_X );
1183      fetch_src_file_channel(mach,
1184                             reg->Indirect.File,
1185                             swizzle,
1186                             &index2,
1187                             &ZeroVec,
1188                             &indir_index);
1189
1190      /* add value of address register to the offset */
1191      index.i[0] += indir_index.i[0];
1192      index.i[1] += indir_index.i[1];
1193      index.i[2] += indir_index.i[2];
1194      index.i[3] += indir_index.i[3];
1195
1196      /* for disabled execution channels, zero-out the index to
1197       * avoid using a potential garbage value.
1198       */
1199      for (i = 0; i < QUAD_SIZE; i++) {
1200         if ((execmask & (1 << i)) == 0)
1201            index.i[i] = 0;
1202      }
1203   }
1204
1205   /* There is an extra source register that is a second
1206    * subscript to a register file. Effectively it means that
1207    * the register file is actually a 2D array of registers.
1208    *
1209    *    file[3][1],
1210    *    where:
1211    *       [3] = Dimension.Index
1212    */
1213   if (reg->Register.Dimension) {
1214      index2D.i[0] =
1215      index2D.i[1] =
1216      index2D.i[2] =
1217      index2D.i[3] = reg->Dimension.Index;
1218
1219      /* Again, the second subscript index can be addressed indirectly
1220       * identically to the first one.
1221       * Nothing stops us from indirectly addressing the indirect register,
1222       * but there is no need for that, so we won't exercise it.
1223       *
1224       *    file[ind[4].y+3][1],
1225       *    where:
1226       *       ind = DimIndirect.File
1227       *       [4] = DimIndirect.Index
1228       *       .y = DimIndirect.SwizzleX
1229       */
1230      if (reg->Dimension.Indirect) {
1231         union tgsi_exec_channel index2;
1232         union tgsi_exec_channel indir_index;
1233         const uint execmask = mach->ExecMask;
1234         uint i;
1235
1236         index2.i[0] =
1237         index2.i[1] =
1238         index2.i[2] =
1239         index2.i[3] = reg->DimIndirect.Index;
1240
1241         swizzle = tgsi_util_get_src_register_swizzle( &reg->DimIndirect, CHAN_X );
1242         fetch_src_file_channel(mach,
1243                                reg->DimIndirect.File,
1244                                swizzle,
1245                                &index2,
1246                                &ZeroVec,
1247                                &indir_index);
1248
1249         index2D.i[0] += indir_index.i[0];
1250         index2D.i[1] += indir_index.i[1];
1251         index2D.i[2] += indir_index.i[2];
1252         index2D.i[3] += indir_index.i[3];
1253
1254         /* for disabled execution channels, zero-out the index to
1255          * avoid using a potential garbage value.
1256          */
1257         for (i = 0; i < QUAD_SIZE; i++) {
1258            if ((execmask & (1 << i)) == 0) {
1259               index2D.i[i] = 0;
1260            }
1261         }
1262      }
1263
1264      /* If by any chance there was a need for a 3D array of register
1265       * files, we would have to check whether Dimension is followed
1266       * by a dimension register and continue the saga.
1267       */
1268   } else {
1269      index2D.i[0] =
1270      index2D.i[1] =
1271      index2D.i[2] =
1272      index2D.i[3] = 0;
1273   }
1274
1275   swizzle = tgsi_util_get_full_src_register_swizzle( reg, chan_index );
1276   fetch_src_file_channel(mach,
1277                          reg->Register.File,
1278                          swizzle,
1279                          &index,
1280                          &index2D,
1281                          chan);
1282
1283   if (reg->Register.Absolute) {
1284      if (src_datatype == TGSI_EXEC_DATA_FLOAT) {
1285         micro_abs(chan, chan);
1286      } else {
1287         micro_iabs(chan, chan);
1288      }
1289   }
1290
1291   if (reg->Register.Negate) {
1292      if (src_datatype == TGSI_EXEC_DATA_FLOAT) {
1293         micro_neg(chan, chan);
1294      } else {
1295         micro_ineg(chan, chan);
1296      }
1297   }
1298}
1299
1300static void
1301store_dest(struct tgsi_exec_machine *mach,
1302           const union tgsi_exec_channel *chan,
1303           const struct tgsi_full_dst_register *reg,
1304           const struct tgsi_full_instruction *inst,
1305           uint chan_index,
1306           enum tgsi_exec_datatype dst_datatype)
1307{
1308   uint i;
1309   union tgsi_exec_channel null;
1310   union tgsi_exec_channel *dst;
1311   union tgsi_exec_channel index2D;
1312   uint execmask = mach->ExecMask;
1313   int offset = 0;  /* indirection offset */
1314   int index;
1315
1316   /* for debugging */
1317   if (0 && dst_datatype == TGSI_EXEC_DATA_FLOAT) {
1318      check_inf_or_nan(chan);
1319   }
1320
1321   /* There is an extra source register that indirectly subscripts
1322    * a register file. The direct index now becomes an offset
1323    * that is being added to the indirect register.
1324    *
1325    *    file[ind[2].x+1],
1326    *    where:
1327    *       ind = Indirect.File
1328    *       [2] = Indirect.Index
1329    *       .x = Indirect.SwizzleX
1330    */
1331   if (reg->Register.Indirect) {
1332      union tgsi_exec_channel index;
1333      union tgsi_exec_channel indir_index;
1334      uint swizzle;
1335
1336      /* which address register (always zero for now) */
1337      index.i[0] =
1338      index.i[1] =
1339      index.i[2] =
1340      index.i[3] = reg->Indirect.Index;
1341
1342      /* get current value of address register[swizzle] */
1343      swizzle = tgsi_util_get_src_register_swizzle( &reg->Indirect, CHAN_X );
1344
1345      /* fetch values from the address/indirection register */
1346      fetch_src_file_channel(mach,
1347                             reg->Indirect.File,
1348                             swizzle,
1349                             &index,
1350                             &ZeroVec,
1351                             &indir_index);
1352
1353      /* save indirection offset */
1354      offset = indir_index.i[0];
1355   }
1356
1357   /* There is an extra source register that is a second
1358    * subscript to a register file. Effectively it means that
1359    * the register file is actually a 2D array of registers.
1360    *
1361    *    file[3][1],
1362    *    where:
1363    *       [3] = Dimension.Index
1364    */
1365   if (reg->Register.Dimension) {
1366      index2D.i[0] =
1367      index2D.i[1] =
1368      index2D.i[2] =
1369      index2D.i[3] = reg->Dimension.Index;
1370
1371      /* Again, the second subscript index can be addressed indirectly
1372       * identically to the first one.
1373       * Nothing stops us from indirectly addressing the indirect register,
1374       * but there is no need for that, so we won't exercise it.
1375       *
1376       *    file[ind[4].y+3][1],
1377       *    where:
1378       *       ind = DimIndirect.File
1379       *       [4] = DimIndirect.Index
1380       *       .y = DimIndirect.SwizzleX
1381       */
1382      if (reg->Dimension.Indirect) {
1383         union tgsi_exec_channel index2;
1384         union tgsi_exec_channel indir_index;
1385         const uint execmask = mach->ExecMask;
1386         unsigned swizzle;
1387         uint i;
1388
1389         index2.i[0] =
1390         index2.i[1] =
1391         index2.i[2] =
1392         index2.i[3] = reg->DimIndirect.Index;
1393
1394         swizzle = tgsi_util_get_src_register_swizzle( &reg->DimIndirect, CHAN_X );
1395         fetch_src_file_channel(mach,
1396                                reg->DimIndirect.File,
1397                                swizzle,
1398                                &index2,
1399                                &ZeroVec,
1400                                &indir_index);
1401
1402         index2D.i[0] += indir_index.i[0];
1403         index2D.i[1] += indir_index.i[1];
1404         index2D.i[2] += indir_index.i[2];
1405         index2D.i[3] += indir_index.i[3];
1406
1407         /* for disabled execution channels, zero-out the index to
1408          * avoid using a potential garbage value.
1409          */
1410         for (i = 0; i < QUAD_SIZE; i++) {
1411            if ((execmask & (1 << i)) == 0) {
1412               index2D.i[i] = 0;
1413            }
1414         }
1415      }
1416
1417      /* If by any chance there was a need for a 3D array of register
1418       * files, we would have to check whether Dimension is followed
1419       * by a dimension register and continue the saga.
1420       */
1421   } else {
1422      index2D.i[0] =
1423      index2D.i[1] =
1424      index2D.i[2] =
1425      index2D.i[3] = 0;
1426   }
1427
1428   switch (reg->Register.File) {
1429   case TGSI_FILE_NULL:
1430      dst = &null;
1431      break;
1432
1433   case TGSI_FILE_OUTPUT:
1434      index = mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0]
1435         + reg->Register.Index;
1436      dst = &mach->Outputs[offset + index].xyzw[chan_index];
1437#if 0
1438      if (TGSI_PROCESSOR_GEOMETRY == mach->Processor) {
1439         fprintf(stderr, "STORING OUT[%d] mask(%d), = (", offset + index, execmask);
1440         for (i = 0; i < QUAD_SIZE; i++)
1441            if (execmask & (1 << i))
1442               fprintf(stderr, "%f, ", chan->f[i]);
1443         fprintf(stderr, ")\n");
1444      }
1445#endif
1446      break;
1447
1448   case TGSI_FILE_TEMPORARY:
1449      index = reg->Register.Index;
1450      assert( index < TGSI_EXEC_NUM_TEMPS );
1451      dst = &mach->Temps[offset + index].xyzw[chan_index];
1452      break;
1453
1454   case TGSI_FILE_TEMPORARY_ARRAY:
1455      index = reg->Register.Index;
1456      assert( index < TGSI_EXEC_NUM_TEMPS );
1457      assert( index2D.i[0] < TGSI_EXEC_NUM_TEMP_ARRAYS );
1458      /* XXX we use index2D.i[0] here but somehow we might
1459       * end up with someone trying to store indirectly in
1460       * different buffers */
1461      dst = &mach->TempArray[index2D.i[0]][offset + index].xyzw[chan_index];
1462      break;
1463
1464   case TGSI_FILE_ADDRESS:
1465      index = reg->Register.Index;
1466      dst = &mach->Addrs[index].xyzw[chan_index];
1467      break;
1468
1469   case TGSI_FILE_PREDICATE:
1470      index = reg->Register.Index;
1471      assert(index < TGSI_EXEC_NUM_PREDS);
1472      dst = &mach->Predicates[index].xyzw[chan_index];
1473      break;
1474
1475   default:
1476      assert( 0 );
1477      return;
1478   }
1479
1480   if (inst->Instruction.Predicate) {
1481      uint swizzle;
1482      union tgsi_exec_channel *pred;
1483
1484      switch (chan_index) {
1485      case CHAN_X:
1486         swizzle = inst->Predicate.SwizzleX;
1487         break;
1488      case CHAN_Y:
1489         swizzle = inst->Predicate.SwizzleY;
1490         break;
1491      case CHAN_Z:
1492         swizzle = inst->Predicate.SwizzleZ;
1493         break;
1494      case CHAN_W:
1495         swizzle = inst->Predicate.SwizzleW;
1496         break;
1497      default:
1498         assert(0);
1499         return;
1500      }
1501
1502      assert(inst->Predicate.Index == 0);
1503
1504      pred = &mach->Predicates[inst->Predicate.Index].xyzw[swizzle];
1505
1506      if (inst->Predicate.Negate) {
1507         for (i = 0; i < QUAD_SIZE; i++) {
1508            if (pred->u[i]) {
1509               execmask &= ~(1 << i);
1510            }
1511         }
1512      } else {
1513         for (i = 0; i < QUAD_SIZE; i++) {
1514            if (!pred->u[i]) {
1515               execmask &= ~(1 << i);
1516            }
1517         }
1518      }
1519   }
1520
1521   switch (inst->Instruction.Saturate) {
1522   case TGSI_SAT_NONE:
1523      for (i = 0; i < QUAD_SIZE; i++)
1524         if (execmask & (1 << i))
1525            dst->i[i] = chan->i[i];
1526      break;
1527
1528   case TGSI_SAT_ZERO_ONE:
1529      for (i = 0; i < QUAD_SIZE; i++)
1530         if (execmask & (1 << i)) {
1531            if (chan->f[i] < 0.0f)
1532               dst->f[i] = 0.0f;
1533            else if (chan->f[i] > 1.0f)
1534               dst->f[i] = 1.0f;
1535            else
1536               dst->i[i] = chan->i[i];
1537         }
1538      break;
1539
1540   case TGSI_SAT_MINUS_PLUS_ONE:
1541      for (i = 0; i < QUAD_SIZE; i++)
1542         if (execmask & (1 << i)) {
1543            if (chan->f[i] < -1.0f)
1544               dst->f[i] = -1.0f;
1545            else if (chan->f[i] > 1.0f)
1546               dst->f[i] = 1.0f;
1547            else
1548               dst->i[i] = chan->i[i];
1549         }
1550      break;
1551
1552   default:
1553      assert( 0 );
1554   }
1555}
1556
/*
 * Shorthand for fetching channel CHAN of source operand INDEX of the
 * current instruction into VAL, treating the data as float.
 * Expands to a fetch_source() call and relies on variables named 'mach'
 * and 'inst' being in scope where the macro is used.
 */
#define FETCH(VAL,INDEX,CHAN)\
    fetch_source(mach, VAL, &inst->Src[INDEX], CHAN, TGSI_EXEC_DATA_FLOAT)
1559
1560
1561/**
1562 * Execute ARB-style KIL which is predicated by a src register.
1563 * Kill fragment if any of the four values is less than zero.
1564 */
1565static void
1566exec_kil(struct tgsi_exec_machine *mach,
1567         const struct tgsi_full_instruction *inst)
1568{
1569   uint uniquemask;
1570   uint chan_index;
1571   uint kilmask = 0; /* bit 0 = pixel 0, bit 1 = pixel 1, etc */
1572   union tgsi_exec_channel r[1];
1573
1574   /* This mask stores component bits that were already tested. */
1575   uniquemask = 0;
1576
1577   for (chan_index = 0; chan_index < 4; chan_index++)
1578   {
1579      uint swizzle;
1580      uint i;
1581
1582      /* unswizzle channel */
1583      swizzle = tgsi_util_get_full_src_register_swizzle (
1584                        &inst->Src[0],
1585                        chan_index);
1586
1587      /* check if the component has not been already tested */
1588      if (uniquemask & (1 << swizzle))
1589         continue;
1590      uniquemask |= 1 << swizzle;
1591
1592      FETCH(&r[0], 0, chan_index);
1593      for (i = 0; i < 4; i++)
1594         if (r[0].f[i] < 0.0f)
1595            kilmask |= 1 << i;
1596   }
1597
1598   mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] |= kilmask;
1599}
1600
1601/**
1602 * Execute NVIDIA-style KIL which is predicated by a condition code.
1603 * Kill fragment if the condition code is TRUE.
1604 */
1605static void
1606exec_kilp(struct tgsi_exec_machine *mach,
1607          const struct tgsi_full_instruction *inst)
1608{
1609   uint kilmask; /* bit 0 = pixel 0, bit 1 = pixel 1, etc */
1610
1611   /* "unconditional" kil */
1612   kilmask = mach->ExecMask;
1613   mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] |= kilmask;
1614}
1615
1616static void
1617emit_vertex(struct tgsi_exec_machine *mach)
1618{
1619   /* FIXME: check for exec mask correctly
1620   unsigned i;
1621   for (i = 0; i < QUAD_SIZE; ++i) {
1622         if ((mach->ExecMask & (1 << i)))
1623   */
1624   if (mach->ExecMask) {
1625      mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] += mach->NumOutputs;
1626      mach->Primitives[mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]]++;
1627   }
1628}
1629
1630static void
1631emit_primitive(struct tgsi_exec_machine *mach)
1632{
1633   unsigned *prim_count = &mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0];
1634   /* FIXME: check for exec mask correctly
1635   unsigned i;
1636   for (i = 0; i < QUAD_SIZE; ++i) {
1637         if ((mach->ExecMask & (1 << i)))
1638   */
1639   if (mach->ExecMask) {
1640      ++(*prim_count);
1641      debug_assert((*prim_count * mach->NumOutputs) < mach->MaxGeometryShaderOutputs);
1642      mach->Primitives[*prim_count] = 0;
1643   }
1644}
1645
1646static void
1647conditional_emit_primitive(struct tgsi_exec_machine *mach)
1648{
1649   if (TGSI_PROCESSOR_GEOMETRY == mach->Processor) {
1650      int emitted_verts =
1651         mach->Primitives[mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]];
1652      if (emitted_verts) {
1653         emit_primitive(mach);
1654      }
1655   }
1656}
1657
1658
1659/*
1660 * Fetch four texture samples using STR texture coordinates.
1661 */
1662static void
1663fetch_texel( struct tgsi_sampler *sampler,
1664             const union tgsi_exec_channel *s,
1665             const union tgsi_exec_channel *t,
1666             const union tgsi_exec_channel *p,
1667             const union tgsi_exec_channel *c0,
1668             enum tgsi_sampler_control control,
1669             union tgsi_exec_channel *r,
1670             union tgsi_exec_channel *g,
1671             union tgsi_exec_channel *b,
1672             union tgsi_exec_channel *a )
1673{
1674   uint j;
1675   float rgba[NUM_CHANNELS][QUAD_SIZE];
1676
1677   sampler->get_samples(sampler, s->f, t->f, p->f, c0->f, control, rgba);
1678
1679   for (j = 0; j < 4; j++) {
1680      r->f[j] = rgba[0][j];
1681      g->f[j] = rgba[1][j];
1682      b->f[j] = rgba[2][j];
1683      a->f[j] = rgba[3][j];
1684   }
1685}
1686
1687
/*
 * Variants of the texture instructions, passed as the 'modifier'
 * argument of exec_tex() and exec_sample().
 */
#define TEX_MODIFIER_NONE           0  /* plain lookup */
#define TEX_MODIFIER_PROJECTED      1  /* coordinates are divided before the lookup */
#define TEX_MODIFIER_LOD_BIAS       2  /* extra value sampled with tgsi_sampler_lod_bias */
#define TEX_MODIFIER_EXPLICIT_LOD   3  /* extra value sampled with tgsi_sampler_lod_explicit */
1692
1693
1694static void
1695exec_tex(struct tgsi_exec_machine *mach,
1696         const struct tgsi_full_instruction *inst,
1697         uint modifier)
1698{
1699   const uint unit = inst->Src[1].Register.Index;
1700   union tgsi_exec_channel r[4];
1701   const union tgsi_exec_channel *lod = &ZeroVec;
1702   enum tgsi_sampler_control control;
1703   uint chan;
1704
1705   if (modifier != TEX_MODIFIER_NONE) {
1706      FETCH(&r[3], 0, CHAN_W);
1707      if (modifier != TEX_MODIFIER_PROJECTED) {
1708         lod = &r[3];
1709      }
1710   }
1711
1712   if (modifier == TEX_MODIFIER_EXPLICIT_LOD) {
1713      control = tgsi_sampler_lod_explicit;
1714   } else {
1715      control = tgsi_sampler_lod_bias;
1716   }
1717
1718   switch (inst->Texture.Texture) {
1719   case TGSI_TEXTURE_1D:
1720   case TGSI_TEXTURE_SHADOW1D:
1721      FETCH(&r[0], 0, CHAN_X);
1722
1723      if (modifier == TEX_MODIFIER_PROJECTED) {
1724         micro_div(&r[0], &r[0], &r[3]);
1725      }
1726
1727      fetch_texel(mach->Samplers[unit],
1728                  &r[0], &ZeroVec, &ZeroVec, lod,  /* S, T, P, LOD */
1729                  control,
1730                  &r[0], &r[1], &r[2], &r[3]);     /* R, G, B, A */
1731      break;
1732
1733   case TGSI_TEXTURE_2D:
1734   case TGSI_TEXTURE_RECT:
1735   case TGSI_TEXTURE_SHADOW2D:
1736   case TGSI_TEXTURE_SHADOWRECT:
1737      FETCH(&r[0], 0, CHAN_X);
1738      FETCH(&r[1], 0, CHAN_Y);
1739      FETCH(&r[2], 0, CHAN_Z);
1740
1741      if (modifier == TEX_MODIFIER_PROJECTED) {
1742         micro_div(&r[0], &r[0], &r[3]);
1743         micro_div(&r[1], &r[1], &r[3]);
1744         micro_div(&r[2], &r[2], &r[3]);
1745      }
1746
1747      fetch_texel(mach->Samplers[unit],
1748                  &r[0], &r[1], &r[2], lod,     /* S, T, P, LOD */
1749                  control,
1750                  &r[0], &r[1], &r[2], &r[3]);  /* outputs */
1751      break;
1752
1753   case TGSI_TEXTURE_1D_ARRAY:
1754      FETCH(&r[0], 0, CHAN_X);
1755      FETCH(&r[1], 0, CHAN_Y);
1756
1757      if (modifier == TEX_MODIFIER_PROJECTED) {
1758         micro_div(&r[0], &r[0], &r[3]);
1759      }
1760
1761      fetch_texel(mach->Samplers[unit],
1762                  &r[0], &r[1], &r[2], lod,     /* S, T, P, LOD */
1763                  control,
1764                  &r[0], &r[1], &r[2], &r[3]);  /* outputs */
1765      break;
1766
1767   case TGSI_TEXTURE_2D_ARRAY:
1768      FETCH(&r[0], 0, CHAN_X);
1769      FETCH(&r[1], 0, CHAN_Y);
1770      FETCH(&r[2], 0, CHAN_Z);
1771
1772      if (modifier == TEX_MODIFIER_PROJECTED) {
1773         micro_div(&r[0], &r[0], &r[3]);
1774         micro_div(&r[1], &r[1], &r[3]);
1775      }
1776
1777      fetch_texel(mach->Samplers[unit],
1778                  &r[0], &r[1], &r[2], lod,     /* S, T, P, LOD */
1779                  control,
1780                  &r[0], &r[1], &r[2], &r[3]);  /* outputs */
1781      break;
1782
1783   case TGSI_TEXTURE_3D:
1784   case TGSI_TEXTURE_CUBE:
1785      FETCH(&r[0], 0, CHAN_X);
1786      FETCH(&r[1], 0, CHAN_Y);
1787      FETCH(&r[2], 0, CHAN_Z);
1788
1789      if (modifier == TEX_MODIFIER_PROJECTED) {
1790         micro_div(&r[0], &r[0], &r[3]);
1791         micro_div(&r[1], &r[1], &r[3]);
1792         micro_div(&r[2], &r[2], &r[3]);
1793      }
1794
1795      fetch_texel(mach->Samplers[unit],
1796                  &r[0], &r[1], &r[2], lod,
1797                  control,
1798                  &r[0], &r[1], &r[2], &r[3]);
1799      break;
1800
1801   default:
1802      assert(0);
1803   }
1804
1805   for (chan = 0; chan < NUM_CHANNELS; chan++) {
1806      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
1807         store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
1808      }
1809   }
1810}
1811
1812static void
1813exec_txd(struct tgsi_exec_machine *mach,
1814         const struct tgsi_full_instruction *inst)
1815{
1816   const uint unit = inst->Src[3].Register.Index;
1817   union tgsi_exec_channel r[4];
1818   uint chan;
1819
1820   /*
1821    * XXX: This is fake TXD -- the derivatives are not taken into account, yet.
1822    */
1823
1824   switch (inst->Texture.Texture) {
1825   case TGSI_TEXTURE_1D:
1826   case TGSI_TEXTURE_SHADOW1D:
1827
1828      FETCH(&r[0], 0, CHAN_X);
1829
1830      fetch_texel(mach->Samplers[unit],
1831                  &r[0], &ZeroVec, &ZeroVec, &ZeroVec,   /* S, T, P, BIAS */
1832                  tgsi_sampler_lod_bias,
1833                  &r[0], &r[1], &r[2], &r[3]);           /* R, G, B, A */
1834      break;
1835
1836   case TGSI_TEXTURE_2D:
1837   case TGSI_TEXTURE_RECT:
1838   case TGSI_TEXTURE_SHADOW2D:
1839   case TGSI_TEXTURE_SHADOWRECT:
1840
1841      FETCH(&r[0], 0, CHAN_X);
1842      FETCH(&r[1], 0, CHAN_Y);
1843      FETCH(&r[2], 0, CHAN_Z);
1844
1845      fetch_texel(mach->Samplers[unit],
1846                  &r[0], &r[1], &r[2], &ZeroVec,   /* inputs */
1847                  tgsi_sampler_lod_bias,
1848                  &r[0], &r[1], &r[2], &r[3]);     /* outputs */
1849      break;
1850
1851   case TGSI_TEXTURE_3D:
1852   case TGSI_TEXTURE_CUBE:
1853
1854      FETCH(&r[0], 0, CHAN_X);
1855      FETCH(&r[1], 0, CHAN_Y);
1856      FETCH(&r[2], 0, CHAN_Z);
1857
1858      fetch_texel(mach->Samplers[unit],
1859                  &r[0], &r[1], &r[2], &ZeroVec,
1860                  tgsi_sampler_lod_bias,
1861                  &r[0], &r[1], &r[2], &r[3]);
1862      break;
1863
1864   default:
1865      assert(0);
1866   }
1867
1868   for (chan = 0; chan < NUM_CHANNELS; chan++) {
1869      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
1870         store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
1871      }
1872   }
1873}
1874
1875
1876
1877static void
1878exec_sample(struct tgsi_exec_machine *mach,
1879            const struct tgsi_full_instruction *inst,
1880            uint modifier)
1881{
1882   const uint resource_unit = inst->Src[1].Register.Index;
1883   const uint sampler_unit = inst->Src[2].Register.Index;
1884   union tgsi_exec_channel r[4];
1885   const union tgsi_exec_channel *lod = &ZeroVec;
1886   enum tgsi_sampler_control control;
1887   uint chan;
1888
1889   if (modifier != TEX_MODIFIER_NONE) {
1890      if (modifier == TEX_MODIFIER_LOD_BIAS)
1891         FETCH(&r[3], 3, CHAN_X);
1892      else /*TEX_MODIFIER_LOD*/
1893         FETCH(&r[3], 0, CHAN_W);
1894
1895      if (modifier != TEX_MODIFIER_PROJECTED) {
1896         lod = &r[3];
1897      }
1898   }
1899
1900   if (modifier == TEX_MODIFIER_EXPLICIT_LOD) {
1901      control = tgsi_sampler_lod_explicit;
1902   } else {
1903      control = tgsi_sampler_lod_bias;
1904   }
1905
1906   switch (mach->Resources[resource_unit].Resource) {
1907   case TGSI_TEXTURE_1D:
1908   case TGSI_TEXTURE_SHADOW1D:
1909      FETCH(&r[0], 0, CHAN_X);
1910
1911      if (modifier == TEX_MODIFIER_PROJECTED) {
1912         micro_div(&r[0], &r[0], &r[3]);
1913      }
1914
1915      fetch_texel(mach->Samplers[sampler_unit],
1916                  &r[0], &ZeroVec, &ZeroVec, lod,  /* S, T, P, LOD */
1917                  control,
1918                  &r[0], &r[1], &r[2], &r[3]);     /* R, G, B, A */
1919      break;
1920
1921   case TGSI_TEXTURE_2D:
1922   case TGSI_TEXTURE_RECT:
1923   case TGSI_TEXTURE_SHADOW2D:
1924   case TGSI_TEXTURE_SHADOWRECT:
1925      FETCH(&r[0], 0, CHAN_X);
1926      FETCH(&r[1], 0, CHAN_Y);
1927      FETCH(&r[2], 0, CHAN_Z);
1928
1929      if (modifier == TEX_MODIFIER_PROJECTED) {
1930         micro_div(&r[0], &r[0], &r[3]);
1931         micro_div(&r[1], &r[1], &r[3]);
1932         micro_div(&r[2], &r[2], &r[3]);
1933      }
1934
1935      fetch_texel(mach->Samplers[sampler_unit],
1936                  &r[0], &r[1], &r[2], lod,     /* S, T, P, LOD */
1937                  control,
1938                  &r[0], &r[1], &r[2], &r[3]);  /* outputs */
1939      break;
1940
1941   case TGSI_TEXTURE_3D:
1942   case TGSI_TEXTURE_CUBE:
1943      FETCH(&r[0], 0, CHAN_X);
1944      FETCH(&r[1], 0, CHAN_Y);
1945      FETCH(&r[2], 0, CHAN_Z);
1946
1947      if (modifier == TEX_MODIFIER_PROJECTED) {
1948         micro_div(&r[0], &r[0], &r[3]);
1949         micro_div(&r[1], &r[1], &r[3]);
1950         micro_div(&r[2], &r[2], &r[3]);
1951      }
1952
1953      fetch_texel(mach->Samplers[sampler_unit],
1954                  &r[0], &r[1], &r[2], lod,
1955                  control,
1956                  &r[0], &r[1], &r[2], &r[3]);
1957      break;
1958
1959   default:
1960      assert(0);
1961   }
1962
1963   for (chan = 0; chan < NUM_CHANNELS; chan++) {
1964      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
1965         store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
1966      }
1967   }
1968}
1969
1970static void
1971exec_sample_d(struct tgsi_exec_machine *mach,
1972              const struct tgsi_full_instruction *inst)
1973{
1974   const uint resource_unit = inst->Src[1].Register.Index;
1975   const uint sampler_unit = inst->Src[2].Register.Index;
1976   union tgsi_exec_channel r[4];
1977   uint chan;
1978   /*
1979    * XXX: This is fake SAMPLE_D -- the derivatives are not taken into account, yet.
1980    */
1981
1982   switch (mach->Resources[resource_unit].Resource) {
1983   case TGSI_TEXTURE_1D:
1984   case TGSI_TEXTURE_SHADOW1D:
1985
1986      FETCH(&r[0], 0, CHAN_X);
1987
1988      fetch_texel(mach->Samplers[sampler_unit],
1989                  &r[0], &ZeroVec, &ZeroVec, &ZeroVec,   /* S, T, P, BIAS */
1990                  tgsi_sampler_lod_bias,
1991                  &r[0], &r[1], &r[2], &r[3]);           /* R, G, B, A */
1992      break;
1993
1994   case TGSI_TEXTURE_2D:
1995   case TGSI_TEXTURE_RECT:
1996   case TGSI_TEXTURE_SHADOW2D:
1997   case TGSI_TEXTURE_SHADOWRECT:
1998
1999      FETCH(&r[0], 0, CHAN_X);
2000      FETCH(&r[1], 0, CHAN_Y);
2001      FETCH(&r[2], 0, CHAN_Z);
2002
2003      fetch_texel(mach->Samplers[sampler_unit],
2004                  &r[0], &r[1], &r[2], &ZeroVec,   /* inputs */
2005                  tgsi_sampler_lod_bias,
2006                  &r[0], &r[1], &r[2], &r[3]);     /* outputs */
2007      break;
2008
2009   case TGSI_TEXTURE_3D:
2010   case TGSI_TEXTURE_CUBE:
2011
2012      FETCH(&r[0], 0, CHAN_X);
2013      FETCH(&r[1], 0, CHAN_Y);
2014      FETCH(&r[2], 0, CHAN_Z);
2015
2016      fetch_texel(mach->Samplers[sampler_unit],
2017                  &r[0], &r[1], &r[2], &ZeroVec,
2018                  tgsi_sampler_lod_bias,
2019                  &r[0], &r[1], &r[2], &r[3]);
2020      break;
2021
2022   default:
2023      assert(0);
2024   }
2025
2026   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2027      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2028         store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2029      }
2030   }
2031}
2032
2033
2034/**
2035 * Evaluate a constant-valued coefficient at the position of the
2036 * current quad.
2037 */
2038static void
2039eval_constant_coef(
2040   struct tgsi_exec_machine *mach,
2041   unsigned attrib,
2042   unsigned chan )
2043{
2044   unsigned i;
2045
2046   for( i = 0; i < QUAD_SIZE; i++ ) {
2047      mach->Inputs[attrib].xyzw[chan].f[i] = mach->InterpCoefs[attrib].a0[chan];
2048   }
2049}
2050
2051/**
2052 * Evaluate a linear-valued coefficient at the position of the
2053 * current quad.
2054 */
2055static void
2056eval_linear_coef(
2057   struct tgsi_exec_machine *mach,
2058   unsigned attrib,
2059   unsigned chan )
2060{
2061   const float x = mach->QuadPos.xyzw[0].f[0];
2062   const float y = mach->QuadPos.xyzw[1].f[0];
2063   const float dadx = mach->InterpCoefs[attrib].dadx[chan];
2064   const float dady = mach->InterpCoefs[attrib].dady[chan];
2065   const float a0 = mach->InterpCoefs[attrib].a0[chan] + dadx * x + dady * y;
2066   mach->Inputs[attrib].xyzw[chan].f[0] = a0;
2067   mach->Inputs[attrib].xyzw[chan].f[1] = a0 + dadx;
2068   mach->Inputs[attrib].xyzw[chan].f[2] = a0 + dady;
2069   mach->Inputs[attrib].xyzw[chan].f[3] = a0 + dadx + dady;
2070}
2071
2072/**
2073 * Evaluate a perspective-valued coefficient at the position of the
2074 * current quad.
2075 */
2076static void
2077eval_perspective_coef(
2078   struct tgsi_exec_machine *mach,
2079   unsigned attrib,
2080   unsigned chan )
2081{
2082   const float x = mach->QuadPos.xyzw[0].f[0];
2083   const float y = mach->QuadPos.xyzw[1].f[0];
2084   const float dadx = mach->InterpCoefs[attrib].dadx[chan];
2085   const float dady = mach->InterpCoefs[attrib].dady[chan];
2086   const float a0 = mach->InterpCoefs[attrib].a0[chan] + dadx * x + dady * y;
2087   const float *w = mach->QuadPos.xyzw[3].f;
2088   /* divide by W here */
2089   mach->Inputs[attrib].xyzw[chan].f[0] = a0 / w[0];
2090   mach->Inputs[attrib].xyzw[chan].f[1] = (a0 + dadx) / w[1];
2091   mach->Inputs[attrib].xyzw[chan].f[2] = (a0 + dady) / w[2];
2092   mach->Inputs[attrib].xyzw[chan].f[3] = (a0 + dadx + dady) / w[3];
2093}
2094
2095
2096typedef void (* eval_coef_func)(
2097   struct tgsi_exec_machine *mach,
2098   unsigned attrib,
2099   unsigned chan );
2100
2101static void
2102exec_declaration(struct tgsi_exec_machine *mach,
2103                 const struct tgsi_full_declaration *decl)
2104{
2105   if (decl->Declaration.File == TGSI_FILE_RESOURCE) {
2106      mach->Resources[decl->Range.First] = decl->Resource;
2107      return;
2108   }
2109
2110   if (mach->Processor == TGSI_PROCESSOR_FRAGMENT) {
2111      if (decl->Declaration.File == TGSI_FILE_INPUT) {
2112         uint first, last, mask;
2113
2114         first = decl->Range.First;
2115         last = decl->Range.Last;
2116         mask = decl->Declaration.UsageMask;
2117
2118         /* XXX we could remove this special-case code since
2119          * mach->InterpCoefs[first].a0 should already have the
2120          * front/back-face value.  But we should first update the
2121          * ureg code to emit the right UsageMask value (WRITEMASK_X).
2122          * Then, we could remove the tgsi_exec_machine::Face field.
2123          */
2124         /* XXX make FACE a system value */
2125         if (decl->Semantic.Name == TGSI_SEMANTIC_FACE) {
2126            uint i;
2127
2128            assert(decl->Semantic.Index == 0);
2129            assert(first == last);
2130
2131            for (i = 0; i < QUAD_SIZE; i++) {
2132               mach->Inputs[first].xyzw[0].f[i] = mach->Face;
2133            }
2134         } else {
2135            eval_coef_func eval;
2136            uint i, j;
2137
2138            switch (decl->Declaration.Interpolate) {
2139            case TGSI_INTERPOLATE_CONSTANT:
2140               eval = eval_constant_coef;
2141               break;
2142
2143            case TGSI_INTERPOLATE_LINEAR:
2144               eval = eval_linear_coef;
2145               break;
2146
2147            case TGSI_INTERPOLATE_PERSPECTIVE:
2148               eval = eval_perspective_coef;
2149               break;
2150
2151            default:
2152               assert(0);
2153               return;
2154            }
2155
2156            for (j = 0; j < NUM_CHANNELS; j++) {
2157               if (mask & (1 << j)) {
2158                  for (i = first; i <= last; i++) {
2159                     eval(mach, i, j);
2160                  }
2161               }
2162            }
2163         }
2164      }
2165   }
2166
2167   if (decl->Declaration.File == TGSI_FILE_SYSTEM_VALUE) {
2168      mach->SysSemanticToIndex[decl->Declaration.Semantic] = decl->Range.First;
2169   }
2170}
2171
2172
2173typedef void (* micro_op)(union tgsi_exec_channel *dst);
2174
2175static void
2176exec_vector(struct tgsi_exec_machine *mach,
2177            const struct tgsi_full_instruction *inst,
2178            micro_op op,
2179            enum tgsi_exec_datatype dst_datatype)
2180{
2181   unsigned int chan;
2182
2183   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2184      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2185         union tgsi_exec_channel dst;
2186
2187         op(&dst);
2188         store_dest(mach, &dst, &inst->Dst[0], inst, chan, dst_datatype);
2189      }
2190   }
2191}
2192
2193typedef void (* micro_unary_op)(union tgsi_exec_channel *dst,
2194                                const union tgsi_exec_channel *src);
2195
2196static void
2197exec_scalar_unary(struct tgsi_exec_machine *mach,
2198                  const struct tgsi_full_instruction *inst,
2199                  micro_unary_op op,
2200                  enum tgsi_exec_datatype dst_datatype,
2201                  enum tgsi_exec_datatype src_datatype)
2202{
2203   unsigned int chan;
2204   union tgsi_exec_channel src;
2205   union tgsi_exec_channel dst;
2206
2207   fetch_source(mach, &src, &inst->Src[0], CHAN_X, src_datatype);
2208   op(&dst, &src);
2209   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2210      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2211         store_dest(mach, &dst, &inst->Dst[0], inst, chan, dst_datatype);
2212      }
2213   }
2214}
2215
2216static void
2217exec_vector_unary(struct tgsi_exec_machine *mach,
2218                  const struct tgsi_full_instruction *inst,
2219                  micro_unary_op op,
2220                  enum tgsi_exec_datatype dst_datatype,
2221                  enum tgsi_exec_datatype src_datatype)
2222{
2223   unsigned int chan;
2224   struct tgsi_exec_vector dst;
2225
2226   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2227      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2228         union tgsi_exec_channel src;
2229
2230         fetch_source(mach, &src, &inst->Src[0], chan, src_datatype);
2231         op(&dst.xyzw[chan], &src);
2232      }
2233   }
2234   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2235      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2236         store_dest(mach, &dst.xyzw[chan], &inst->Dst[0], inst, chan, dst_datatype);
2237      }
2238   }
2239}
2240
2241typedef void (* micro_binary_op)(union tgsi_exec_channel *dst,
2242                                 const union tgsi_exec_channel *src0,
2243                                 const union tgsi_exec_channel *src1);
2244
2245static void
2246exec_scalar_binary(struct tgsi_exec_machine *mach,
2247                   const struct tgsi_full_instruction *inst,
2248                   micro_binary_op op,
2249                   enum tgsi_exec_datatype dst_datatype,
2250                   enum tgsi_exec_datatype src_datatype)
2251{
2252   unsigned int chan;
2253   union tgsi_exec_channel src[2];
2254   union tgsi_exec_channel dst;
2255
2256   fetch_source(mach, &src[0], &inst->Src[0], CHAN_X, src_datatype);
2257   fetch_source(mach, &src[1], &inst->Src[1], CHAN_Y, src_datatype);
2258   op(&dst, &src[0], &src[1]);
2259   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2260      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2261         store_dest(mach, &dst, &inst->Dst[0], inst, chan, dst_datatype);
2262      }
2263   }
2264}
2265
2266static void
2267exec_vector_binary(struct tgsi_exec_machine *mach,
2268                   const struct tgsi_full_instruction *inst,
2269                   micro_binary_op op,
2270                   enum tgsi_exec_datatype dst_datatype,
2271                   enum tgsi_exec_datatype src_datatype)
2272{
2273   unsigned int chan;
2274   struct tgsi_exec_vector dst;
2275
2276   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2277      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2278         union tgsi_exec_channel src[2];
2279
2280         fetch_source(mach, &src[0], &inst->Src[0], chan, src_datatype);
2281         fetch_source(mach, &src[1], &inst->Src[1], chan, src_datatype);
2282         op(&dst.xyzw[chan], &src[0], &src[1]);
2283      }
2284   }
2285   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2286      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2287         store_dest(mach, &dst.xyzw[chan], &inst->Dst[0], inst, chan, dst_datatype);
2288      }
2289   }
2290}
2291
2292typedef void (* micro_trinary_op)(union tgsi_exec_channel *dst,
2293                                  const union tgsi_exec_channel *src0,
2294                                  const union tgsi_exec_channel *src1,
2295                                  const union tgsi_exec_channel *src2);
2296
2297static void
2298exec_vector_trinary(struct tgsi_exec_machine *mach,
2299                    const struct tgsi_full_instruction *inst,
2300                    micro_trinary_op op,
2301                    enum tgsi_exec_datatype dst_datatype,
2302                    enum tgsi_exec_datatype src_datatype)
2303{
2304   unsigned int chan;
2305   struct tgsi_exec_vector dst;
2306
2307   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2308      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2309         union tgsi_exec_channel src[3];
2310
2311         fetch_source(mach, &src[0], &inst->Src[0], chan, src_datatype);
2312         fetch_source(mach, &src[1], &inst->Src[1], chan, src_datatype);
2313         fetch_source(mach, &src[2], &inst->Src[2], chan, src_datatype);
2314         op(&dst.xyzw[chan], &src[0], &src[1], &src[2]);
2315      }
2316   }
2317   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2318      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2319         store_dest(mach, &dst.xyzw[chan], &inst->Dst[0], inst, chan, dst_datatype);
2320      }
2321   }
2322}
2323
2324static void
2325exec_dp3(struct tgsi_exec_machine *mach,
2326         const struct tgsi_full_instruction *inst)
2327{
2328   unsigned int chan;
2329   union tgsi_exec_channel arg[3];
2330
2331   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2332   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2333   micro_mul(&arg[2], &arg[0], &arg[1]);
2334
2335   for (chan = CHAN_Y; chan <= CHAN_Z; chan++) {
2336      fetch_source(mach, &arg[0], &inst->Src[0], chan, TGSI_EXEC_DATA_FLOAT);
2337      fetch_source(mach, &arg[1], &inst->Src[1], chan, TGSI_EXEC_DATA_FLOAT);
2338      micro_mad(&arg[2], &arg[0], &arg[1], &arg[2]);
2339   }
2340
2341   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2342      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2343         store_dest(mach, &arg[2], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2344      }
2345   }
2346}
2347
2348static void
2349exec_dp4(struct tgsi_exec_machine *mach,
2350         const struct tgsi_full_instruction *inst)
2351{
2352   unsigned int chan;
2353   union tgsi_exec_channel arg[3];
2354
2355   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2356   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2357   micro_mul(&arg[2], &arg[0], &arg[1]);
2358
2359   for (chan = CHAN_Y; chan <= CHAN_W; chan++) {
2360      fetch_source(mach, &arg[0], &inst->Src[0], chan, TGSI_EXEC_DATA_FLOAT);
2361      fetch_source(mach, &arg[1], &inst->Src[1], chan, TGSI_EXEC_DATA_FLOAT);
2362      micro_mad(&arg[2], &arg[0], &arg[1], &arg[2]);
2363   }
2364
2365   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2366      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2367         store_dest(mach, &arg[2], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2368      }
2369   }
2370}
2371
2372static void
2373exec_dp2a(struct tgsi_exec_machine *mach,
2374          const struct tgsi_full_instruction *inst)
2375{
2376   unsigned int chan;
2377   union tgsi_exec_channel arg[3];
2378
2379   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2380   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2381   micro_mul(&arg[2], &arg[0], &arg[1]);
2382
2383   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2384   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2385   micro_mad(&arg[0], &arg[0], &arg[1], &arg[2]);
2386
2387   fetch_source(mach, &arg[1], &inst->Src[2], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2388   micro_add(&arg[0], &arg[0], &arg[1]);
2389
2390   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2391      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2392         store_dest(mach, &arg[0], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2393      }
2394   }
2395}
2396
2397static void
2398exec_dph(struct tgsi_exec_machine *mach,
2399         const struct tgsi_full_instruction *inst)
2400{
2401   unsigned int chan;
2402   union tgsi_exec_channel arg[3];
2403
2404   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2405   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2406   micro_mul(&arg[2], &arg[0], &arg[1]);
2407
2408   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2409   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2410   micro_mad(&arg[2], &arg[0], &arg[1], &arg[2]);
2411
2412   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2413   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2414   micro_mad(&arg[0], &arg[0], &arg[1], &arg[2]);
2415
2416   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_W, TGSI_EXEC_DATA_FLOAT);
2417   micro_add(&arg[0], &arg[0], &arg[1]);
2418
2419   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2420      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2421         store_dest(mach, &arg[0], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2422      }
2423   }
2424}
2425
2426static void
2427exec_dp2(struct tgsi_exec_machine *mach,
2428         const struct tgsi_full_instruction *inst)
2429{
2430   unsigned int chan;
2431   union tgsi_exec_channel arg[3];
2432
2433   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2434   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2435   micro_mul(&arg[2], &arg[0], &arg[1]);
2436
2437   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2438   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2439   micro_mad(&arg[2], &arg[0], &arg[1], &arg[2]);
2440
2441   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2442      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2443         store_dest(mach, &arg[2], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2444      }
2445   }
2446}
2447
2448static void
2449exec_nrm4(struct tgsi_exec_machine *mach,
2450          const struct tgsi_full_instruction *inst)
2451{
2452   unsigned int chan;
2453   union tgsi_exec_channel arg[4];
2454   union tgsi_exec_channel scale;
2455
2456   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2457   micro_mul(&scale, &arg[0], &arg[0]);
2458
2459   for (chan = CHAN_Y; chan <= CHAN_W; chan++) {
2460      union tgsi_exec_channel product;
2461
2462      fetch_source(mach, &arg[chan], &inst->Src[0], chan, TGSI_EXEC_DATA_FLOAT);
2463      micro_mul(&product, &arg[chan], &arg[chan]);
2464      micro_add(&scale, &scale, &product);
2465   }
2466
2467   micro_rsq(&scale, &scale);
2468
2469   for (chan = CHAN_X; chan <= CHAN_W; chan++) {
2470      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2471         micro_mul(&arg[chan], &arg[chan], &scale);
2472         store_dest(mach, &arg[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2473      }
2474   }
2475}
2476
2477static void
2478exec_nrm3(struct tgsi_exec_machine *mach,
2479          const struct tgsi_full_instruction *inst)
2480{
2481   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_XYZ) {
2482      unsigned int chan;
2483      union tgsi_exec_channel arg[3];
2484      union tgsi_exec_channel scale;
2485
2486      fetch_source(mach, &arg[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2487      micro_mul(&scale, &arg[0], &arg[0]);
2488
2489      for (chan = CHAN_Y; chan <= CHAN_Z; chan++) {
2490         union tgsi_exec_channel product;
2491
2492         fetch_source(mach, &arg[chan], &inst->Src[0], chan, TGSI_EXEC_DATA_FLOAT);
2493         micro_mul(&product, &arg[chan], &arg[chan]);
2494         micro_add(&scale, &scale, &product);
2495      }
2496
2497      micro_rsq(&scale, &scale);
2498
2499      for (chan = CHAN_X; chan <= CHAN_Z; chan++) {
2500         if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2501            micro_mul(&arg[chan], &arg[chan], &scale);
2502            store_dest(mach, &arg[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2503         }
2504      }
2505   }
2506
2507   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
2508      store_dest(mach, &OneVec, &inst->Dst[0], inst, CHAN_W, TGSI_EXEC_DATA_FLOAT);
2509   }
2510}
2511
2512static void
2513exec_scs(struct tgsi_exec_machine *mach,
2514         const struct tgsi_full_instruction *inst)
2515{
2516   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_XY) {
2517      union tgsi_exec_channel arg;
2518      union tgsi_exec_channel result;
2519
2520      fetch_source(mach, &arg, &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2521
2522      if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
2523         micro_cos(&result, &arg);
2524         store_dest(mach, &result, &inst->Dst[0], inst, CHAN_X, TGSI_EXEC_DATA_FLOAT);
2525      }
2526      if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
2527         micro_sin(&result, &arg);
2528         store_dest(mach, &result, &inst->Dst[0], inst, CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2529      }
2530   }
2531   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
2532      store_dest(mach, &ZeroVec, &inst->Dst[0], inst, CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2533   }
2534   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
2535      store_dest(mach, &OneVec, &inst->Dst[0], inst, CHAN_W, TGSI_EXEC_DATA_FLOAT);
2536   }
2537}
2538
2539static void
2540exec_x2d(struct tgsi_exec_machine *mach,
2541         const struct tgsi_full_instruction *inst)
2542{
2543   union tgsi_exec_channel r[4];
2544   union tgsi_exec_channel d[2];
2545
2546   fetch_source(mach, &r[0], &inst->Src[1], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2547   fetch_source(mach, &r[1], &inst->Src[1], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2548   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_XZ) {
2549      fetch_source(mach, &r[2], &inst->Src[2], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2550      micro_mul(&r[2], &r[2], &r[0]);
2551      fetch_source(mach, &r[3], &inst->Src[2], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2552      micro_mul(&r[3], &r[3], &r[1]);
2553      micro_add(&r[2], &r[2], &r[3]);
2554      fetch_source(mach, &r[3], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2555      micro_add(&d[0], &r[2], &r[3]);
2556   }
2557   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_YW) {
2558      fetch_source(mach, &r[2], &inst->Src[2], CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2559      micro_mul(&r[2], &r[2], &r[0]);
2560      fetch_source(mach, &r[3], &inst->Src[2], CHAN_W, TGSI_EXEC_DATA_FLOAT);
2561      micro_mul(&r[3], &r[3], &r[1]);
2562      micro_add(&r[2], &r[2], &r[3]);
2563      fetch_source(mach, &r[3], &inst->Src[0], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2564      micro_add(&d[1], &r[2], &r[3]);
2565   }
2566   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
2567      store_dest(mach, &d[0], &inst->Dst[0], inst, CHAN_X, TGSI_EXEC_DATA_FLOAT);
2568   }
2569   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
2570      store_dest(mach, &d[1], &inst->Dst[0], inst, CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2571   }
2572   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
2573      store_dest(mach, &d[0], &inst->Dst[0], inst, CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2574   }
2575   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
2576      store_dest(mach, &d[1], &inst->Dst[0], inst, CHAN_W, TGSI_EXEC_DATA_FLOAT);
2577   }
2578}
2579
2580static void
2581exec_rfl(struct tgsi_exec_machine *mach,
2582         const struct tgsi_full_instruction *inst)
2583{
2584   union tgsi_exec_channel r[9];
2585
2586   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_XYZ) {
2587      /* r0 = dp3(src0, src0) */
2588      fetch_source(mach, &r[2], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2589      micro_mul(&r[0], &r[2], &r[2]);
2590      fetch_source(mach, &r[4], &inst->Src[0], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2591      micro_mul(&r[8], &r[4], &r[4]);
2592      micro_add(&r[0], &r[0], &r[8]);
2593      fetch_source(mach, &r[6], &inst->Src[0], CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2594      micro_mul(&r[8], &r[6], &r[6]);
2595      micro_add(&r[0], &r[0], &r[8]);
2596
2597      /* r1 = dp3(src0, src1) */
2598      fetch_source(mach, &r[3], &inst->Src[1], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2599      micro_mul(&r[1], &r[2], &r[3]);
2600      fetch_source(mach, &r[5], &inst->Src[1], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2601      micro_mul(&r[8], &r[4], &r[5]);
2602      micro_add(&r[1], &r[1], &r[8]);
2603      fetch_source(mach, &r[7], &inst->Src[1], CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2604      micro_mul(&r[8], &r[6], &r[7]);
2605      micro_add(&r[1], &r[1], &r[8]);
2606
2607      /* r1 = 2 * r1 / r0 */
2608      micro_add(&r[1], &r[1], &r[1]);
2609      micro_div(&r[1], &r[1], &r[0]);
2610
2611      if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
2612         micro_mul(&r[2], &r[2], &r[1]);
2613         micro_sub(&r[2], &r[2], &r[3]);
2614         store_dest(mach, &r[2], &inst->Dst[0], inst, CHAN_X, TGSI_EXEC_DATA_FLOAT);
2615      }
2616      if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
2617         micro_mul(&r[4], &r[4], &r[1]);
2618         micro_sub(&r[4], &r[4], &r[5]);
2619         store_dest(mach, &r[4], &inst->Dst[0], inst, CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2620      }
2621      if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
2622         micro_mul(&r[6], &r[6], &r[1]);
2623         micro_sub(&r[6], &r[6], &r[7]);
2624         store_dest(mach, &r[6], &inst->Dst[0], inst, CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2625      }
2626   }
2627   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
2628      store_dest(mach, &OneVec, &inst->Dst[0], inst, CHAN_W, TGSI_EXEC_DATA_FLOAT);
2629   }
2630}
2631
2632static void
2633exec_xpd(struct tgsi_exec_machine *mach,
2634         const struct tgsi_full_instruction *inst)
2635{
2636   union tgsi_exec_channel r[6];
2637   union tgsi_exec_channel d[3];
2638
2639   fetch_source(mach, &r[0], &inst->Src[0], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2640   fetch_source(mach, &r[1], &inst->Src[1], CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2641
2642   micro_mul(&r[2], &r[0], &r[1]);
2643
2644   fetch_source(mach, &r[3], &inst->Src[0], CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2645   fetch_source(mach, &r[4], &inst->Src[1], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2646
2647   micro_mul(&r[5], &r[3], &r[4] );
2648   micro_sub(&d[CHAN_X], &r[2], &r[5]);
2649
2650   fetch_source(mach, &r[2], &inst->Src[1], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2651
2652   micro_mul(&r[3], &r[3], &r[2]);
2653
2654   fetch_source(mach, &r[5], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2655
2656   micro_mul(&r[1], &r[1], &r[5]);
2657   micro_sub(&d[CHAN_Y], &r[3], &r[1]);
2658
2659   micro_mul(&r[5], &r[5], &r[4]);
2660   micro_mul(&r[0], &r[0], &r[2]);
2661   micro_sub(&d[CHAN_Z], &r[5], &r[0]);
2662
2663   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
2664      store_dest(mach, &d[CHAN_X], &inst->Dst[0], inst, CHAN_X, TGSI_EXEC_DATA_FLOAT);
2665   }
2666   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
2667      store_dest(mach, &d[CHAN_Y], &inst->Dst[0], inst, CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2668   }
2669   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
2670      store_dest(mach, &d[CHAN_Z], &inst->Dst[0], inst, CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2671   }
2672   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
2673      store_dest(mach, &OneVec, &inst->Dst[0], inst, CHAN_W, TGSI_EXEC_DATA_FLOAT);
2674   }
2675}
2676
2677static void
2678exec_dst(struct tgsi_exec_machine *mach,
2679         const struct tgsi_full_instruction *inst)
2680{
2681   union tgsi_exec_channel r[2];
2682   union tgsi_exec_channel d[4];
2683
2684   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
2685      fetch_source(mach, &r[0], &inst->Src[0], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2686      fetch_source(mach, &r[1], &inst->Src[1], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2687      micro_mul(&d[CHAN_Y], &r[0], &r[1]);
2688   }
2689   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
2690      fetch_source(mach, &d[CHAN_Z], &inst->Src[0], CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2691   }
2692   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
2693      fetch_source(mach, &d[CHAN_W], &inst->Src[1], CHAN_W, TGSI_EXEC_DATA_FLOAT);
2694   }
2695
2696   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
2697      store_dest(mach, &OneVec, &inst->Dst[0], inst, CHAN_X, TGSI_EXEC_DATA_FLOAT);
2698   }
2699   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
2700      store_dest(mach, &d[CHAN_Y], &inst->Dst[0], inst, CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2701   }
2702   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
2703      store_dest(mach, &d[CHAN_Z], &inst->Dst[0], inst, CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2704   }
2705   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
2706      store_dest(mach, &d[CHAN_W], &inst->Dst[0], inst, CHAN_W, TGSI_EXEC_DATA_FLOAT);
2707   }
2708}
2709
2710static void
2711exec_log(struct tgsi_exec_machine *mach,
2712         const struct tgsi_full_instruction *inst)
2713{
2714   union tgsi_exec_channel r[3];
2715
2716   fetch_source(mach, &r[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2717   micro_abs(&r[2], &r[0]);  /* r2 = abs(r0) */
2718   micro_lg2(&r[1], &r[2]);  /* r1 = lg2(r2) */
2719   micro_flr(&r[0], &r[1]);  /* r0 = floor(r1) */
2720   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
2721      store_dest(mach, &r[0], &inst->Dst[0], inst, CHAN_X, TGSI_EXEC_DATA_FLOAT);
2722   }
2723   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
2724      micro_exp2(&r[0], &r[0]);       /* r0 = 2 ^ r0 */
2725      micro_div(&r[0], &r[2], &r[0]); /* r0 = r2 / r0 */
2726      store_dest(mach, &r[0], &inst->Dst[0], inst, CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2727   }
2728   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
2729      store_dest(mach, &r[1], &inst->Dst[0], inst, CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2730   }
2731   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
2732      store_dest(mach, &OneVec, &inst->Dst[0], inst, CHAN_W, TGSI_EXEC_DATA_FLOAT);
2733   }
2734}
2735
2736static void
2737exec_exp(struct tgsi_exec_machine *mach,
2738         const struct tgsi_full_instruction *inst)
2739{
2740   union tgsi_exec_channel r[3];
2741
2742   fetch_source(mach, &r[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2743   micro_flr(&r[1], &r[0]);  /* r1 = floor(r0) */
2744   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
2745      micro_exp2(&r[2], &r[1]);       /* r2 = 2 ^ r1 */
2746      store_dest(mach, &r[2], &inst->Dst[0], inst, CHAN_X, TGSI_EXEC_DATA_FLOAT);
2747   }
2748   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
2749      micro_sub(&r[2], &r[0], &r[1]); /* r2 = r0 - r1 */
2750      store_dest(mach, &r[2], &inst->Dst[0], inst, CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2751   }
2752   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
2753      micro_exp2(&r[2], &r[0]);       /* r2 = 2 ^ r0 */
2754      store_dest(mach, &r[2], &inst->Dst[0], inst, CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2755   }
2756   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
2757      store_dest(mach, &OneVec, &inst->Dst[0], inst, CHAN_W, TGSI_EXEC_DATA_FLOAT);
2758   }
2759}
2760
2761static void
2762exec_lit(struct tgsi_exec_machine *mach,
2763         const struct tgsi_full_instruction *inst)
2764{
2765   union tgsi_exec_channel r[3];
2766   union tgsi_exec_channel d[3];
2767
2768   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
2769      store_dest(mach, &OneVec, &inst->Dst[0], inst, CHAN_X, TGSI_EXEC_DATA_FLOAT);
2770   }
2771   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_YZ) {
2772      fetch_source(mach, &r[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2773      if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
2774         micro_max(&d[CHAN_Y], &r[0], &ZeroVec);
2775         store_dest(mach, &d[CHAN_Y], &inst->Dst[0], inst, CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2776      }
2777
2778      if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
2779         fetch_source(mach, &r[1], &inst->Src[0], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2780         micro_max(&r[1], &r[1], &ZeroVec);
2781
2782         fetch_source(mach, &r[2], &inst->Src[0], CHAN_W, TGSI_EXEC_DATA_FLOAT);
2783         micro_min(&r[2], &r[2], &P128Vec);
2784         micro_max(&r[2], &r[2], &M128Vec);
2785         micro_pow(&r[1], &r[1], &r[2]);
2786         micro_lt(&d[CHAN_Z], &ZeroVec, &r[0], &r[1], &ZeroVec);
2787         store_dest(mach, &d[CHAN_Z], &inst->Dst[0], inst, CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2788      }
2789   }
2790   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
2791      store_dest(mach, &OneVec, &inst->Dst[0], inst, CHAN_W, TGSI_EXEC_DATA_FLOAT);
2792   }
2793}
2794
2795static void
2796exec_break(struct tgsi_exec_machine *mach)
2797{
2798   if (mach->BreakType == TGSI_EXEC_BREAK_INSIDE_LOOP) {
2799      /* turn off loop channels for each enabled exec channel */
2800      mach->LoopMask &= ~mach->ExecMask;
2801      /* Todo: if mach->LoopMask == 0, jump to end of loop */
2802      UPDATE_EXEC_MASK(mach);
2803   } else {
2804      assert(mach->BreakType == TGSI_EXEC_BREAK_INSIDE_SWITCH);
2805
2806      mach->Switch.mask = 0x0;
2807
2808      UPDATE_EXEC_MASK(mach);
2809   }
2810}
2811
2812static void
2813exec_switch(struct tgsi_exec_machine *mach,
2814            const struct tgsi_full_instruction *inst)
2815{
2816   assert(mach->SwitchStackTop < TGSI_EXEC_MAX_SWITCH_NESTING);
2817   assert(mach->BreakStackTop < TGSI_EXEC_MAX_BREAK_STACK);
2818
2819   mach->SwitchStack[mach->SwitchStackTop++] = mach->Switch;
2820   fetch_source(mach, &mach->Switch.selector, &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_UINT);
2821   mach->Switch.mask = 0x0;
2822   mach->Switch.defaultMask = 0x0;
2823
2824   mach->BreakStack[mach->BreakStackTop++] = mach->BreakType;
2825   mach->BreakType = TGSI_EXEC_BREAK_INSIDE_SWITCH;
2826
2827   UPDATE_EXEC_MASK(mach);
2828}
2829
2830static void
2831exec_case(struct tgsi_exec_machine *mach,
2832          const struct tgsi_full_instruction *inst)
2833{
2834   uint prevMask = mach->SwitchStack[mach->SwitchStackTop - 1].mask;
2835   union tgsi_exec_channel src;
2836   uint mask = 0;
2837
2838   fetch_source(mach, &src, &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_UINT);
2839
2840   if (mach->Switch.selector.u[0] == src.u[0]) {
2841      mask |= 0x1;
2842   }
2843   if (mach->Switch.selector.u[1] == src.u[1]) {
2844      mask |= 0x2;
2845   }
2846   if (mach->Switch.selector.u[2] == src.u[2]) {
2847      mask |= 0x4;
2848   }
2849   if (mach->Switch.selector.u[3] == src.u[3]) {
2850      mask |= 0x8;
2851   }
2852
2853   mach->Switch.defaultMask |= mask;
2854
2855   mach->Switch.mask |= mask & prevMask;
2856
2857   UPDATE_EXEC_MASK(mach);
2858}
2859
2860static void
2861exec_default(struct tgsi_exec_machine *mach)
2862{
2863   uint prevMask = mach->SwitchStack[mach->SwitchStackTop - 1].mask;
2864
2865   mach->Switch.mask |= ~mach->Switch.defaultMask & prevMask;
2866
2867   UPDATE_EXEC_MASK(mach);
2868}
2869
2870static void
2871exec_endswitch(struct tgsi_exec_machine *mach)
2872{
2873   mach->Switch = mach->SwitchStack[--mach->SwitchStackTop];
2874   mach->BreakType = mach->BreakStack[--mach->BreakStackTop];
2875
2876   UPDATE_EXEC_MASK(mach);
2877}
2878
2879static void
2880micro_i2f(union tgsi_exec_channel *dst,
2881          const union tgsi_exec_channel *src)
2882{
2883   dst->f[0] = (float)src->i[0];
2884   dst->f[1] = (float)src->i[1];
2885   dst->f[2] = (float)src->i[2];
2886   dst->f[3] = (float)src->i[3];
2887}
2888
2889static void
2890micro_not(union tgsi_exec_channel *dst,
2891          const union tgsi_exec_channel *src)
2892{
2893   dst->u[0] = ~src->u[0];
2894   dst->u[1] = ~src->u[1];
2895   dst->u[2] = ~src->u[2];
2896   dst->u[3] = ~src->u[3];
2897}
2898
2899static void
2900micro_shl(union tgsi_exec_channel *dst,
2901          const union tgsi_exec_channel *src0,
2902          const union tgsi_exec_channel *src1)
2903{
2904   dst->u[0] = src0->u[0] << src1->u[0];
2905   dst->u[1] = src0->u[1] << src1->u[1];
2906   dst->u[2] = src0->u[2] << src1->u[2];
2907   dst->u[3] = src0->u[3] << src1->u[3];
2908}
2909
2910static void
2911micro_and(union tgsi_exec_channel *dst,
2912          const union tgsi_exec_channel *src0,
2913          const union tgsi_exec_channel *src1)
2914{
2915   dst->u[0] = src0->u[0] & src1->u[0];
2916   dst->u[1] = src0->u[1] & src1->u[1];
2917   dst->u[2] = src0->u[2] & src1->u[2];
2918   dst->u[3] = src0->u[3] & src1->u[3];
2919}
2920
2921static void
2922micro_or(union tgsi_exec_channel *dst,
2923         const union tgsi_exec_channel *src0,
2924         const union tgsi_exec_channel *src1)
2925{
2926   dst->u[0] = src0->u[0] | src1->u[0];
2927   dst->u[1] = src0->u[1] | src1->u[1];
2928   dst->u[2] = src0->u[2] | src1->u[2];
2929   dst->u[3] = src0->u[3] | src1->u[3];
2930}
2931
2932static void
2933micro_xor(union tgsi_exec_channel *dst,
2934          const union tgsi_exec_channel *src0,
2935          const union tgsi_exec_channel *src1)
2936{
2937   dst->u[0] = src0->u[0] ^ src1->u[0];
2938   dst->u[1] = src0->u[1] ^ src1->u[1];
2939   dst->u[2] = src0->u[2] ^ src1->u[2];
2940   dst->u[3] = src0->u[3] ^ src1->u[3];
2941}
2942
2943static void
2944micro_f2i(union tgsi_exec_channel *dst,
2945          const union tgsi_exec_channel *src)
2946{
2947   dst->i[0] = (int)src->f[0];
2948   dst->i[1] = (int)src->f[1];
2949   dst->i[2] = (int)src->f[2];
2950   dst->i[3] = (int)src->f[3];
2951}
2952
2953static void
2954micro_idiv(union tgsi_exec_channel *dst,
2955           const union tgsi_exec_channel *src0,
2956           const union tgsi_exec_channel *src1)
2957{
2958   dst->i[0] = src0->i[0] / src1->i[0];
2959   dst->i[1] = src0->i[1] / src1->i[1];
2960   dst->i[2] = src0->i[2] / src1->i[2];
2961   dst->i[3] = src0->i[3] / src1->i[3];
2962}
2963
2964static void
2965micro_imax(union tgsi_exec_channel *dst,
2966           const union tgsi_exec_channel *src0,
2967           const union tgsi_exec_channel *src1)
2968{
2969   dst->i[0] = src0->i[0] > src1->i[0] ? src0->i[0] : src1->i[0];
2970   dst->i[1] = src0->i[1] > src1->i[1] ? src0->i[1] : src1->i[1];
2971   dst->i[2] = src0->i[2] > src1->i[2] ? src0->i[2] : src1->i[2];
2972   dst->i[3] = src0->i[3] > src1->i[3] ? src0->i[3] : src1->i[3];
2973}
2974
2975static void
2976micro_imin(union tgsi_exec_channel *dst,
2977           const union tgsi_exec_channel *src0,
2978           const union tgsi_exec_channel *src1)
2979{
2980   dst->i[0] = src0->i[0] < src1->i[0] ? src0->i[0] : src1->i[0];
2981   dst->i[1] = src0->i[1] < src1->i[1] ? src0->i[1] : src1->i[1];
2982   dst->i[2] = src0->i[2] < src1->i[2] ? src0->i[2] : src1->i[2];
2983   dst->i[3] = src0->i[3] < src1->i[3] ? src0->i[3] : src1->i[3];
2984}
2985
2986static void
2987micro_isge(union tgsi_exec_channel *dst,
2988           const union tgsi_exec_channel *src0,
2989           const union tgsi_exec_channel *src1)
2990{
2991   dst->i[0] = src0->i[0] >= src1->i[0] ? -1 : 0;
2992   dst->i[1] = src0->i[1] >= src1->i[1] ? -1 : 0;
2993   dst->i[2] = src0->i[2] >= src1->i[2] ? -1 : 0;
2994   dst->i[3] = src0->i[3] >= src1->i[3] ? -1 : 0;
2995}
2996
2997static void
2998micro_ishr(union tgsi_exec_channel *dst,
2999           const union tgsi_exec_channel *src0,
3000           const union tgsi_exec_channel *src1)
3001{
3002   dst->i[0] = src0->i[0] >> src1->i[0];
3003   dst->i[1] = src0->i[1] >> src1->i[1];
3004   dst->i[2] = src0->i[2] >> src1->i[2];
3005   dst->i[3] = src0->i[3] >> src1->i[3];
3006}
3007
3008static void
3009micro_islt(union tgsi_exec_channel *dst,
3010           const union tgsi_exec_channel *src0,
3011           const union tgsi_exec_channel *src1)
3012{
3013   dst->i[0] = src0->i[0] < src1->i[0] ? -1 : 0;
3014   dst->i[1] = src0->i[1] < src1->i[1] ? -1 : 0;
3015   dst->i[2] = src0->i[2] < src1->i[2] ? -1 : 0;
3016   dst->i[3] = src0->i[3] < src1->i[3] ? -1 : 0;
3017}
3018
3019static void
3020micro_f2u(union tgsi_exec_channel *dst,
3021          const union tgsi_exec_channel *src)
3022{
3023   dst->u[0] = (uint)src->f[0];
3024   dst->u[1] = (uint)src->f[1];
3025   dst->u[2] = (uint)src->f[2];
3026   dst->u[3] = (uint)src->f[3];
3027}
3028
3029static void
3030micro_u2f(union tgsi_exec_channel *dst,
3031          const union tgsi_exec_channel *src)
3032{
3033   dst->f[0] = (float)src->u[0];
3034   dst->f[1] = (float)src->u[1];
3035   dst->f[2] = (float)src->u[2];
3036   dst->f[3] = (float)src->u[3];
3037}
3038
3039static void
3040micro_uadd(union tgsi_exec_channel *dst,
3041           const union tgsi_exec_channel *src0,
3042           const union tgsi_exec_channel *src1)
3043{
3044   dst->u[0] = src0->u[0] + src1->u[0];
3045   dst->u[1] = src0->u[1] + src1->u[1];
3046   dst->u[2] = src0->u[2] + src1->u[2];
3047   dst->u[3] = src0->u[3] + src1->u[3];
3048}
3049
3050static void
3051micro_udiv(union tgsi_exec_channel *dst,
3052           const union tgsi_exec_channel *src0,
3053           const union tgsi_exec_channel *src1)
3054{
3055   dst->u[0] = src0->u[0] / src1->u[0];
3056   dst->u[1] = src0->u[1] / src1->u[1];
3057   dst->u[2] = src0->u[2] / src1->u[2];
3058   dst->u[3] = src0->u[3] / src1->u[3];
3059}
3060
3061static void
3062micro_umad(union tgsi_exec_channel *dst,
3063           const union tgsi_exec_channel *src0,
3064           const union tgsi_exec_channel *src1,
3065           const union tgsi_exec_channel *src2)
3066{
3067   dst->u[0] = src0->u[0] * src1->u[0] + src2->u[0];
3068   dst->u[1] = src0->u[1] * src1->u[1] + src2->u[1];
3069   dst->u[2] = src0->u[2] * src1->u[2] + src2->u[2];
3070   dst->u[3] = src0->u[3] * src1->u[3] + src2->u[3];
3071}
3072
3073static void
3074micro_umax(union tgsi_exec_channel *dst,
3075           const union tgsi_exec_channel *src0,
3076           const union tgsi_exec_channel *src1)
3077{
3078   dst->u[0] = src0->u[0] > src1->u[0] ? src0->u[0] : src1->u[0];
3079   dst->u[1] = src0->u[1] > src1->u[1] ? src0->u[1] : src1->u[1];
3080   dst->u[2] = src0->u[2] > src1->u[2] ? src0->u[2] : src1->u[2];
3081   dst->u[3] = src0->u[3] > src1->u[3] ? src0->u[3] : src1->u[3];
3082}
3083
3084static void
3085micro_umin(union tgsi_exec_channel *dst,
3086           const union tgsi_exec_channel *src0,
3087           const union tgsi_exec_channel *src1)
3088{
3089   dst->u[0] = src0->u[0] < src1->u[0] ? src0->u[0] : src1->u[0];
3090   dst->u[1] = src0->u[1] < src1->u[1] ? src0->u[1] : src1->u[1];
3091   dst->u[2] = src0->u[2] < src1->u[2] ? src0->u[2] : src1->u[2];
3092   dst->u[3] = src0->u[3] < src1->u[3] ? src0->u[3] : src1->u[3];
3093}
3094
3095static void
3096micro_umod(union tgsi_exec_channel *dst,
3097           const union tgsi_exec_channel *src0,
3098           const union tgsi_exec_channel *src1)
3099{
3100   dst->u[0] = src0->u[0] % src1->u[0];
3101   dst->u[1] = src0->u[1] % src1->u[1];
3102   dst->u[2] = src0->u[2] % src1->u[2];
3103   dst->u[3] = src0->u[3] % src1->u[3];
3104}
3105
3106static void
3107micro_umul(union tgsi_exec_channel *dst,
3108           const union tgsi_exec_channel *src0,
3109           const union tgsi_exec_channel *src1)
3110{
3111   dst->u[0] = src0->u[0] * src1->u[0];
3112   dst->u[1] = src0->u[1] * src1->u[1];
3113   dst->u[2] = src0->u[2] * src1->u[2];
3114   dst->u[3] = src0->u[3] * src1->u[3];
3115}
3116
3117static void
3118micro_useq(union tgsi_exec_channel *dst,
3119           const union tgsi_exec_channel *src0,
3120           const union tgsi_exec_channel *src1)
3121{
3122   dst->u[0] = src0->u[0] == src1->u[0] ? ~0 : 0;
3123   dst->u[1] = src0->u[1] == src1->u[1] ? ~0 : 0;
3124   dst->u[2] = src0->u[2] == src1->u[2] ? ~0 : 0;
3125   dst->u[3] = src0->u[3] == src1->u[3] ? ~0 : 0;
3126}
3127
3128static void
3129micro_usge(union tgsi_exec_channel *dst,
3130           const union tgsi_exec_channel *src0,
3131           const union tgsi_exec_channel *src1)
3132{
3133   dst->u[0] = src0->u[0] >= src1->u[0] ? ~0 : 0;
3134   dst->u[1] = src0->u[1] >= src1->u[1] ? ~0 : 0;
3135   dst->u[2] = src0->u[2] >= src1->u[2] ? ~0 : 0;
3136   dst->u[3] = src0->u[3] >= src1->u[3] ? ~0 : 0;
3137}
3138
3139static void
3140micro_ushr(union tgsi_exec_channel *dst,
3141           const union tgsi_exec_channel *src0,
3142           const union tgsi_exec_channel *src1)
3143{
3144   dst->u[0] = src0->u[0] >> src1->u[0];
3145   dst->u[1] = src0->u[1] >> src1->u[1];
3146   dst->u[2] = src0->u[2] >> src1->u[2];
3147   dst->u[3] = src0->u[3] >> src1->u[3];
3148}
3149
3150static void
3151micro_uslt(union tgsi_exec_channel *dst,
3152           const union tgsi_exec_channel *src0,
3153           const union tgsi_exec_channel *src1)
3154{
3155   dst->u[0] = src0->u[0] < src1->u[0] ? ~0 : 0;
3156   dst->u[1] = src0->u[1] < src1->u[1] ? ~0 : 0;
3157   dst->u[2] = src0->u[2] < src1->u[2] ? ~0 : 0;
3158   dst->u[3] = src0->u[3] < src1->u[3] ? ~0 : 0;
3159}
3160
3161static void
3162micro_usne(union tgsi_exec_channel *dst,
3163           const union tgsi_exec_channel *src0,
3164           const union tgsi_exec_channel *src1)
3165{
3166   dst->u[0] = src0->u[0] != src1->u[0] ? ~0 : 0;
3167   dst->u[1] = src0->u[1] != src1->u[1] ? ~0 : 0;
3168   dst->u[2] = src0->u[2] != src1->u[2] ? ~0 : 0;
3169   dst->u[3] = src0->u[3] != src1->u[3] ? ~0 : 0;
3170}
3171
3172static void
3173exec_instruction(
3174   struct tgsi_exec_machine *mach,
3175   const struct tgsi_full_instruction *inst,
3176   int *pc )
3177{
3178   union tgsi_exec_channel r[10];
3179
3180   (*pc)++;
3181
3182   switch (inst->Instruction.Opcode) {
3183   case TGSI_OPCODE_ARL:
3184      exec_vector_unary(mach, inst, micro_arl, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_FLOAT);
3185      break;
3186
3187   case TGSI_OPCODE_MOV:
3188      exec_vector_unary(mach, inst, micro_mov, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_FLOAT);
3189      break;
3190
3191   case TGSI_OPCODE_LIT:
3192      exec_lit(mach, inst);
3193      break;
3194
3195   case TGSI_OPCODE_RCP:
3196      exec_scalar_unary(mach, inst, micro_rcp, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3197      break;
3198
3199   case TGSI_OPCODE_RSQ:
3200      exec_scalar_unary(mach, inst, micro_rsq, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3201      break;
3202
3203   case TGSI_OPCODE_EXP:
3204      exec_exp(mach, inst);
3205      break;
3206
3207   case TGSI_OPCODE_LOG:
3208      exec_log(mach, inst);
3209      break;
3210
3211   case TGSI_OPCODE_MUL:
3212      exec_vector_binary(mach, inst, micro_mul, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3213      break;
3214
3215   case TGSI_OPCODE_ADD:
3216      exec_vector_binary(mach, inst, micro_add, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3217      break;
3218
3219   case TGSI_OPCODE_DP3:
3220      exec_dp3(mach, inst);
3221      break;
3222
3223   case TGSI_OPCODE_DP4:
3224      exec_dp4(mach, inst);
3225      break;
3226
3227   case TGSI_OPCODE_DST:
3228      exec_dst(mach, inst);
3229      break;
3230
3231   case TGSI_OPCODE_MIN:
3232      exec_vector_binary(mach, inst, micro_min, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3233      break;
3234
3235   case TGSI_OPCODE_MAX:
3236      exec_vector_binary(mach, inst, micro_max, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3237      break;
3238
3239   case TGSI_OPCODE_SLT:
3240      exec_vector_binary(mach, inst, micro_slt, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3241      break;
3242
3243   case TGSI_OPCODE_SGE:
3244      exec_vector_binary(mach, inst, micro_sge, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3245      break;
3246
3247   case TGSI_OPCODE_MAD:
3248      exec_vector_trinary(mach, inst, micro_mad, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3249      break;
3250
3251   case TGSI_OPCODE_SUB:
3252      exec_vector_binary(mach, inst, micro_sub, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3253      break;
3254
3255   case TGSI_OPCODE_LRP:
3256      exec_vector_trinary(mach, inst, micro_lrp, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3257      break;
3258
3259   case TGSI_OPCODE_CND:
3260      exec_vector_trinary(mach, inst, micro_cnd, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3261      break;
3262
3263   case TGSI_OPCODE_DP2A:
3264      exec_dp2a(mach, inst);
3265      break;
3266
3267   case TGSI_OPCODE_FRC:
3268      exec_vector_unary(mach, inst, micro_frc, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3269      break;
3270
3271   case TGSI_OPCODE_CLAMP:
3272      exec_vector_trinary(mach, inst, micro_clamp, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3273      break;
3274
3275   case TGSI_OPCODE_FLR:
3276      exec_vector_unary(mach, inst, micro_flr, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3277      break;
3278
3279   case TGSI_OPCODE_ROUND:
3280      exec_vector_unary(mach, inst, micro_rnd, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3281      break;
3282
3283   case TGSI_OPCODE_EX2:
3284      exec_scalar_unary(mach, inst, micro_exp2, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3285      break;
3286
3287   case TGSI_OPCODE_LG2:
3288      exec_scalar_unary(mach, inst, micro_lg2, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3289      break;
3290
3291   case TGSI_OPCODE_POW:
3292      exec_scalar_binary(mach, inst, micro_pow, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3293      break;
3294
3295   case TGSI_OPCODE_XPD:
3296      exec_xpd(mach, inst);
3297      break;
3298
3299   case TGSI_OPCODE_ABS:
3300      exec_vector_unary(mach, inst, micro_abs, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3301      break;
3302
3303   case TGSI_OPCODE_RCC:
3304      exec_scalar_unary(mach, inst, micro_rcc, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3305      break;
3306
3307   case TGSI_OPCODE_DPH:
3308      exec_dph(mach, inst);
3309      break;
3310
3311   case TGSI_OPCODE_COS:
3312      exec_scalar_unary(mach, inst, micro_cos, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3313      break;
3314
3315   case TGSI_OPCODE_DDX:
3316      exec_vector_unary(mach, inst, micro_ddx, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3317      break;
3318
3319   case TGSI_OPCODE_DDY:
3320      exec_vector_unary(mach, inst, micro_ddy, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3321      break;
3322
3323   case TGSI_OPCODE_KILP:
3324      exec_kilp (mach, inst);
3325      break;
3326
3327   case TGSI_OPCODE_KIL:
3328      exec_kil (mach, inst);
3329      break;
3330
3331   case TGSI_OPCODE_PK2H:
3332      assert (0);
3333      break;
3334
3335   case TGSI_OPCODE_PK2US:
3336      assert (0);
3337      break;
3338
3339   case TGSI_OPCODE_PK4B:
3340      assert (0);
3341      break;
3342
3343   case TGSI_OPCODE_PK4UB:
3344      assert (0);
3345      break;
3346
3347   case TGSI_OPCODE_RFL:
3348      exec_rfl(mach, inst);
3349      break;
3350
3351   case TGSI_OPCODE_SEQ:
3352      exec_vector_binary(mach, inst, micro_seq, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3353      break;
3354
3355   case TGSI_OPCODE_SFL:
3356      exec_vector(mach, inst, micro_sfl, TGSI_EXEC_DATA_FLOAT);
3357      break;
3358
3359   case TGSI_OPCODE_SGT:
3360      exec_vector_binary(mach, inst, micro_sgt, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3361      break;
3362
3363   case TGSI_OPCODE_SIN:
3364      exec_scalar_unary(mach, inst, micro_sin, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3365      break;
3366
3367   case TGSI_OPCODE_SLE:
3368      exec_vector_binary(mach, inst, micro_sle, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3369      break;
3370
3371   case TGSI_OPCODE_SNE:
3372      exec_vector_binary(mach, inst, micro_sne, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3373      break;
3374
3375   case TGSI_OPCODE_STR:
3376      exec_vector(mach, inst, micro_str, TGSI_EXEC_DATA_FLOAT);
3377      break;
3378
3379   case TGSI_OPCODE_TEX:
3380      /* simple texture lookup */
3381      /* src[0] = texcoord */
3382      /* src[1] = sampler unit */
3383      exec_tex(mach, inst, TEX_MODIFIER_NONE);
3384      break;
3385
3386   case TGSI_OPCODE_TXB:
3387      /* Texture lookup with lod bias */
3388      /* src[0] = texcoord (src[0].w = LOD bias) */
3389      /* src[1] = sampler unit */
3390      exec_tex(mach, inst, TEX_MODIFIER_LOD_BIAS);
3391      break;
3392
3393   case TGSI_OPCODE_TXD:
3394      /* Texture lookup with explicit partial derivatives */
3395      /* src[0] = texcoord */
3396      /* src[1] = d[strq]/dx */
3397      /* src[2] = d[strq]/dy */
3398      /* src[3] = sampler unit */
3399      exec_txd(mach, inst);
3400      break;
3401
3402   case TGSI_OPCODE_TXL:
3403      /* Texture lookup with explicit LOD */
3404      /* src[0] = texcoord (src[0].w = LOD) */
3405      /* src[1] = sampler unit */
3406      exec_tex(mach, inst, TEX_MODIFIER_EXPLICIT_LOD);
3407      break;
3408
3409   case TGSI_OPCODE_TXP:
3410      /* Texture lookup with projection */
3411      /* src[0] = texcoord (src[0].w = projection) */
3412      /* src[1] = sampler unit */
3413      exec_tex(mach, inst, TEX_MODIFIER_PROJECTED);
3414      break;
3415
3416   case TGSI_OPCODE_UP2H:
3417      assert (0);
3418      break;
3419
3420   case TGSI_OPCODE_UP2US:
3421      assert (0);
3422      break;
3423
3424   case TGSI_OPCODE_UP4B:
3425      assert (0);
3426      break;
3427
3428   case TGSI_OPCODE_UP4UB:
3429      assert (0);
3430      break;
3431
3432   case TGSI_OPCODE_X2D:
3433      exec_x2d(mach, inst);
3434      break;
3435
3436   case TGSI_OPCODE_ARA:
3437      assert (0);
3438      break;
3439
3440   case TGSI_OPCODE_ARR:
3441      exec_vector_unary(mach, inst, micro_arr, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_FLOAT);
3442      break;
3443
3444   case TGSI_OPCODE_BRA:
3445      assert (0);
3446      break;
3447
3448   case TGSI_OPCODE_CAL:
3449      /* skip the call if no execution channels are enabled */
3450      if (mach->ExecMask) {
3451         /* do the call */
3452
3453         /* First, record the depths of the execution stacks.
3454          * This is important for deeply nested/looped return statements.
3455          * We have to unwind the stacks by the correct amount.  For a
3456          * real code generator, we could determine the number of entries
3457          * to pop off each stack with simple static analysis and avoid
3458          * implementing this data structure at run time.
3459          */
3460         mach->CallStack[mach->CallStackTop].CondStackTop = mach->CondStackTop;
3461         mach->CallStack[mach->CallStackTop].LoopStackTop = mach->LoopStackTop;
3462         mach->CallStack[mach->CallStackTop].ContStackTop = mach->ContStackTop;
3463         mach->CallStack[mach->CallStackTop].SwitchStackTop = mach->SwitchStackTop;
3464         mach->CallStack[mach->CallStackTop].BreakStackTop = mach->BreakStackTop;
3465         /* note that PC was already incremented above */
3466         mach->CallStack[mach->CallStackTop].ReturnAddr = *pc;
3467
3468         mach->CallStackTop++;
3469
3470         /* Second, push the Cond, Loop, Cont, Func stacks */
3471         assert(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING);
3472         assert(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
3473         assert(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
3474         assert(mach->SwitchStackTop < TGSI_EXEC_MAX_SWITCH_NESTING);
3475         assert(mach->BreakStackTop < TGSI_EXEC_MAX_BREAK_STACK);
3476         assert(mach->FuncStackTop < TGSI_EXEC_MAX_CALL_NESTING);
3477
3478         mach->CondStack[mach->CondStackTop++] = mach->CondMask;
3479         mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask;
3480         mach->ContStack[mach->ContStackTop++] = mach->ContMask;
3481         mach->SwitchStack[mach->SwitchStackTop++] = mach->Switch;
3482         mach->BreakStack[mach->BreakStackTop++] = mach->BreakType;
3483         mach->FuncStack[mach->FuncStackTop++] = mach->FuncMask;
3484
3485         /* Finally, jump to the subroutine */
3486         *pc = inst->Label.Label;
3487      }
3488      break;
3489
3490   case TGSI_OPCODE_RET:
3491      mach->FuncMask &= ~mach->ExecMask;
3492      UPDATE_EXEC_MASK(mach);
3493
3494      if (mach->FuncMask == 0x0) {
3495         /* really return now (otherwise, keep executing) */
3496
3497         if (mach->CallStackTop == 0) {
3498            /* returning from main() */
3499            mach->CondStackTop = 0;
3500            mach->LoopStackTop = 0;
3501            *pc = -1;
3502            return;
3503         }
3504
3505         assert(mach->CallStackTop > 0);
3506         mach->CallStackTop--;
3507
3508         mach->CondStackTop = mach->CallStack[mach->CallStackTop].CondStackTop;
3509         mach->CondMask = mach->CondStack[mach->CondStackTop];
3510
3511         mach->LoopStackTop = mach->CallStack[mach->CallStackTop].LoopStackTop;
3512         mach->LoopMask = mach->LoopStack[mach->LoopStackTop];
3513
3514         mach->ContStackTop = mach->CallStack[mach->CallStackTop].ContStackTop;
3515         mach->ContMask = mach->ContStack[mach->ContStackTop];
3516
3517         mach->SwitchStackTop = mach->CallStack[mach->CallStackTop].SwitchStackTop;
3518         mach->Switch = mach->SwitchStack[mach->SwitchStackTop];
3519
3520         mach->BreakStackTop = mach->CallStack[mach->CallStackTop].BreakStackTop;
3521         mach->BreakType = mach->BreakStack[mach->BreakStackTop];
3522
3523         assert(mach->FuncStackTop > 0);
3524         mach->FuncMask = mach->FuncStack[--mach->FuncStackTop];
3525
3526         *pc = mach->CallStack[mach->CallStackTop].ReturnAddr;
3527
3528         UPDATE_EXEC_MASK(mach);
3529      }
3530      break;
3531
3532   case TGSI_OPCODE_SSG:
3533      exec_vector_unary(mach, inst, micro_sgn, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3534      break;
3535
3536   case TGSI_OPCODE_CMP:
3537      exec_vector_trinary(mach, inst, micro_cmp, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3538      break;
3539
3540   case TGSI_OPCODE_SCS:
3541      exec_scs(mach, inst);
3542      break;
3543
3544   case TGSI_OPCODE_NRM:
3545      exec_nrm3(mach, inst);
3546      break;
3547
3548   case TGSI_OPCODE_NRM4:
3549      exec_nrm4(mach, inst);
3550      break;
3551
3552   case TGSI_OPCODE_DIV:
3553      exec_vector_binary(mach, inst, micro_div, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3554      break;
3555
3556   case TGSI_OPCODE_DP2:
3557      exec_dp2(mach, inst);
3558      break;
3559
3560   case TGSI_OPCODE_IF:
3561      /* push CondMask */
3562      assert(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING);
3563      mach->CondStack[mach->CondStackTop++] = mach->CondMask;
3564      FETCH( &r[0], 0, CHAN_X );
3565      /* update CondMask */
3566      if( ! r[0].u[0] ) {
3567         mach->CondMask &= ~0x1;
3568      }
3569      if( ! r[0].u[1] ) {
3570         mach->CondMask &= ~0x2;
3571      }
3572      if( ! r[0].u[2] ) {
3573         mach->CondMask &= ~0x4;
3574      }
3575      if( ! r[0].u[3] ) {
3576         mach->CondMask &= ~0x8;
3577      }
3578      UPDATE_EXEC_MASK(mach);
3579      /* Todo: If CondMask==0, jump to ELSE */
3580      break;
3581
3582   case TGSI_OPCODE_ELSE:
3583      /* invert CondMask wrt previous mask */
3584      {
3585         uint prevMask;
3586         assert(mach->CondStackTop > 0);
3587         prevMask = mach->CondStack[mach->CondStackTop - 1];
3588         mach->CondMask = ~mach->CondMask & prevMask;
3589         UPDATE_EXEC_MASK(mach);
3590         /* Todo: If CondMask==0, jump to ENDIF */
3591      }
3592      break;
3593
3594   case TGSI_OPCODE_ENDIF:
3595      /* pop CondMask */
3596      assert(mach->CondStackTop > 0);
3597      mach->CondMask = mach->CondStack[--mach->CondStackTop];
3598      UPDATE_EXEC_MASK(mach);
3599      break;
3600
3601   case TGSI_OPCODE_END:
3602      /* make sure we end primitives which haven't
3603       * been explicitly emitted */
3604      conditional_emit_primitive(mach);
3605      /* halt execution */
3606      *pc = -1;
3607      break;
3608
3609   case TGSI_OPCODE_PUSHA:
3610      assert (0);
3611      break;
3612
3613   case TGSI_OPCODE_POPA:
3614      assert (0);
3615      break;
3616
3617   case TGSI_OPCODE_CEIL:
3618      exec_vector_unary(mach, inst, micro_ceil, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3619      break;
3620
3621   case TGSI_OPCODE_I2F:
3622      exec_vector_unary(mach, inst, micro_i2f, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_INT);
3623      break;
3624
3625   case TGSI_OPCODE_NOT:
3626      exec_vector_unary(mach, inst, micro_not, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3627      break;
3628
3629   case TGSI_OPCODE_TRUNC:
3630      exec_vector_unary(mach, inst, micro_trunc, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3631      break;
3632
3633   case TGSI_OPCODE_SHL:
3634      exec_vector_binary(mach, inst, micro_shl, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3635      break;
3636
3637   case TGSI_OPCODE_AND:
3638      exec_vector_binary(mach, inst, micro_and, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3639      break;
3640
3641   case TGSI_OPCODE_OR:
3642      exec_vector_binary(mach, inst, micro_or, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3643      break;
3644
3645   case TGSI_OPCODE_MOD:
3646      assert (0);
3647      break;
3648
3649   case TGSI_OPCODE_XOR:
3650      exec_vector_binary(mach, inst, micro_xor, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3651      break;
3652
3653   case TGSI_OPCODE_SAD:
3654      assert (0);
3655      break;
3656
3657   case TGSI_OPCODE_TXF:
3658      assert (0);
3659      break;
3660
3661   case TGSI_OPCODE_TXQ:
3662      assert (0);
3663      break;
3664
3665   case TGSI_OPCODE_EMIT:
3666      emit_vertex(mach);
3667      break;
3668
3669   case TGSI_OPCODE_ENDPRIM:
3670      emit_primitive(mach);
3671      break;
3672
3673   case TGSI_OPCODE_BGNLOOP:
3674      /* push LoopMask and ContMasks */
3675      assert(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
3676      assert(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
3677      assert(mach->LoopLabelStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
3678      assert(mach->BreakStackTop < TGSI_EXEC_MAX_BREAK_STACK);
3679
3680      mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask;
3681      mach->ContStack[mach->ContStackTop++] = mach->ContMask;
3682      mach->LoopLabelStack[mach->LoopLabelStackTop++] = *pc - 1;
3683      mach->BreakStack[mach->BreakStackTop++] = mach->BreakType;
3684      mach->BreakType = TGSI_EXEC_BREAK_INSIDE_LOOP;
3685      break;
3686
3687   case TGSI_OPCODE_ENDLOOP:
3688      /* Restore ContMask, but don't pop */
3689      assert(mach->ContStackTop > 0);
3690      mach->ContMask = mach->ContStack[mach->ContStackTop - 1];
3691      UPDATE_EXEC_MASK(mach);
3692      if (mach->ExecMask) {
3693         /* repeat loop: jump to instruction just past BGNLOOP */
3694         assert(mach->LoopLabelStackTop > 0);
3695         *pc = mach->LoopLabelStack[mach->LoopLabelStackTop - 1] + 1;
3696      }
3697      else {
3698         /* exit loop: pop LoopMask */
3699         assert(mach->LoopStackTop > 0);
3700         mach->LoopMask = mach->LoopStack[--mach->LoopStackTop];
3701         /* pop ContMask */
3702         assert(mach->ContStackTop > 0);
3703         mach->ContMask = mach->ContStack[--mach->ContStackTop];
3704         assert(mach->LoopLabelStackTop > 0);
3705         --mach->LoopLabelStackTop;
3706
3707         mach->BreakType = mach->BreakStack[--mach->BreakStackTop];
3708      }
3709      UPDATE_EXEC_MASK(mach);
3710      break;
3711
3712   case TGSI_OPCODE_BRK:
3713      exec_break(mach);
3714      break;
3715
3716   case TGSI_OPCODE_CONT:
3717      /* turn off cont channels for each enabled exec channel */
3718      mach->ContMask &= ~mach->ExecMask;
3719      /* Todo: if mach->LoopMask == 0, jump to end of loop */
3720      UPDATE_EXEC_MASK(mach);
3721      break;
3722
3723   case TGSI_OPCODE_BGNSUB:
3724      /* no-op */
3725      break;
3726
3727   case TGSI_OPCODE_ENDSUB:
3728      /*
3729       * XXX: This really should be a no-op. We should never reach this opcode.
3730       */
3731
3732      assert(mach->CallStackTop > 0);
3733      mach->CallStackTop--;
3734
3735      mach->CondStackTop = mach->CallStack[mach->CallStackTop].CondStackTop;
3736      mach->CondMask = mach->CondStack[mach->CondStackTop];
3737
3738      mach->LoopStackTop = mach->CallStack[mach->CallStackTop].LoopStackTop;
3739      mach->LoopMask = mach->LoopStack[mach->LoopStackTop];
3740
3741      mach->ContStackTop = mach->CallStack[mach->CallStackTop].ContStackTop;
3742      mach->ContMask = mach->ContStack[mach->ContStackTop];
3743
3744      mach->SwitchStackTop = mach->CallStack[mach->CallStackTop].SwitchStackTop;
3745      mach->Switch = mach->SwitchStack[mach->SwitchStackTop];
3746
3747      mach->BreakStackTop = mach->CallStack[mach->CallStackTop].BreakStackTop;
3748      mach->BreakType = mach->BreakStack[mach->BreakStackTop];
3749
3750      assert(mach->FuncStackTop > 0);
3751      mach->FuncMask = mach->FuncStack[--mach->FuncStackTop];
3752
3753      *pc = mach->CallStack[mach->CallStackTop].ReturnAddr;
3754
3755      UPDATE_EXEC_MASK(mach);
3756      break;
3757
3758   case TGSI_OPCODE_NOP:
3759      break;
3760
3761   case TGSI_OPCODE_BREAKC:
3762      FETCH(&r[0], 0, CHAN_X);
      /* update LoopMask */
3764      if (r[0].u[0] && (mach->ExecMask & 0x1)) {
3765         mach->LoopMask &= ~0x1;
3766      }
3767      if (r[0].u[1] && (mach->ExecMask & 0x2)) {
3768         mach->LoopMask &= ~0x2;
3769      }
3770      if (r[0].u[2] && (mach->ExecMask & 0x4)) {
3771         mach->LoopMask &= ~0x4;
3772      }
3773      if (r[0].u[3] && (mach->ExecMask & 0x8)) {
3774         mach->LoopMask &= ~0x8;
3775      }
3776      /* Todo: if mach->LoopMask == 0, jump to end of loop */
3777      UPDATE_EXEC_MASK(mach);
3778      break;
3779
3780   case TGSI_OPCODE_F2I:
3781      exec_vector_unary(mach, inst, micro_f2i, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_FLOAT);
3782      break;
3783
3784   case TGSI_OPCODE_IDIV:
3785      exec_vector_binary(mach, inst, micro_idiv, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
3786      break;
3787
3788   case TGSI_OPCODE_IMAX:
3789      exec_vector_binary(mach, inst, micro_imax, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
3790      break;
3791
3792   case TGSI_OPCODE_IMIN:
3793      exec_vector_binary(mach, inst, micro_imin, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
3794      break;
3795
3796   case TGSI_OPCODE_INEG:
3797      exec_vector_unary(mach, inst, micro_ineg, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
3798      break;
3799
3800   case TGSI_OPCODE_ISGE:
3801      exec_vector_binary(mach, inst, micro_isge, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
3802      break;
3803
3804   case TGSI_OPCODE_ISHR:
3805      exec_vector_binary(mach, inst, micro_ishr, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
3806      break;
3807
3808   case TGSI_OPCODE_ISLT:
3809      exec_vector_binary(mach, inst, micro_islt, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
3810      break;
3811
3812   case TGSI_OPCODE_F2U:
3813      exec_vector_unary(mach, inst, micro_f2u, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_FLOAT);
3814      break;
3815
3816   case TGSI_OPCODE_U2F:
3817      exec_vector_unary(mach, inst, micro_u2f, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_UINT);
3818      break;
3819
3820   case TGSI_OPCODE_UADD:
3821      exec_vector_binary(mach, inst, micro_uadd, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3822      break;
3823
3824   case TGSI_OPCODE_UDIV:
3825      exec_vector_binary(mach, inst, micro_udiv, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3826      break;
3827
3828   case TGSI_OPCODE_UMAD:
3829      exec_vector_trinary(mach, inst, micro_umad, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3830      break;
3831
3832   case TGSI_OPCODE_UMAX:
3833      exec_vector_binary(mach, inst, micro_umax, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3834      break;
3835
3836   case TGSI_OPCODE_UMIN:
3837      exec_vector_binary(mach, inst, micro_umin, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3838      break;
3839
3840   case TGSI_OPCODE_UMOD:
3841      exec_vector_binary(mach, inst, micro_umod, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3842      break;
3843
3844   case TGSI_OPCODE_UMUL:
3845      exec_vector_binary(mach, inst, micro_umul, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3846      break;
3847
3848   case TGSI_OPCODE_USEQ:
3849      exec_vector_binary(mach, inst, micro_useq, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3850      break;
3851
3852   case TGSI_OPCODE_USGE:
3853      exec_vector_binary(mach, inst, micro_usge, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3854      break;
3855
3856   case TGSI_OPCODE_USHR:
3857      exec_vector_binary(mach, inst, micro_ushr, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3858      break;
3859
3860   case TGSI_OPCODE_USLT:
3861      exec_vector_binary(mach, inst, micro_uslt, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3862      break;
3863
3864   case TGSI_OPCODE_USNE:
3865      exec_vector_binary(mach, inst, micro_usne, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3866      break;
3867
3868   case TGSI_OPCODE_SWITCH:
3869      exec_switch(mach, inst);
3870      break;
3871
3872   case TGSI_OPCODE_CASE:
3873      exec_case(mach, inst);
3874      break;
3875
3876   case TGSI_OPCODE_DEFAULT:
3877      exec_default(mach);
3878      break;
3879
3880   case TGSI_OPCODE_ENDSWITCH:
3881      exec_endswitch(mach);
3882      break;
3883
3884   case TGSI_OPCODE_LOAD:
3885      assert(0);
3886      break;
3887
3888   case TGSI_OPCODE_LOAD_MS:
3889      assert(0);
3890      break;
3891
3892   case TGSI_OPCODE_SAMPLE:
3893      exec_sample(mach, inst, TEX_MODIFIER_NONE);
3894      break;
3895
3896   case TGSI_OPCODE_SAMPLE_B:
3897      exec_sample(mach, inst, TEX_MODIFIER_LOD_BIAS);
3898      break;
3899
3900   case TGSI_OPCODE_SAMPLE_C:
3901      exec_sample(mach, inst, TEX_MODIFIER_NONE);
3902      break;
3903
3904   case TGSI_OPCODE_SAMPLE_C_LZ:
3905      exec_sample(mach, inst, TEX_MODIFIER_LOD_BIAS);
3906      break;
3907
3908   case TGSI_OPCODE_SAMPLE_D:
3909      exec_sample_d(mach, inst);
3910      break;
3911
3912   case TGSI_OPCODE_SAMPLE_L:
3913      exec_sample(mach, inst, TEX_MODIFIER_EXPLICIT_LOD);
3914      break;
3915
3916   case TGSI_OPCODE_GATHER4:
3917      assert(0);
3918      break;
3919
3920   case TGSI_OPCODE_RESINFO:
3921      assert(0);
3922      break;
3923
3924   case TGSI_OPCODE_SAMPLE_POS:
3925      assert(0);
3926      break;
3927
3928   case TGSI_OPCODE_SAMPLE_INFO:
3929      assert(0);
3930      break;
3931
3932   default:
3933      assert( 0 );
3934   }
3935}
3936
3937
3938#define DEBUG_EXECUTION 0
3939
3940
3941/**
3942 * Run TGSI interpreter.
3943 * \return bitmask of "alive" quad components
3944 */
3945uint
3946tgsi_exec_machine_run( struct tgsi_exec_machine *mach )
3947{
3948   uint i;
3949   int pc = 0;
3950
3951   mach->CondMask = 0xf;
3952   mach->LoopMask = 0xf;
3953   mach->ContMask = 0xf;
3954   mach->FuncMask = 0xf;
3955   mach->ExecMask = 0xf;
3956
3957   mach->Switch.mask = 0xf;
3958
3959   assert(mach->CondStackTop == 0);
3960   assert(mach->LoopStackTop == 0);
3961   assert(mach->ContStackTop == 0);
3962   assert(mach->SwitchStackTop == 0);
3963   assert(mach->BreakStackTop == 0);
3964   assert(mach->CallStackTop == 0);
3965
3966   mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] = 0;
3967   mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] = 0;
3968
3969   if( mach->Processor == TGSI_PROCESSOR_GEOMETRY ) {
3970      mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0] = 0;
3971      mach->Primitives[0] = 0;
3972   }
3973
3974   /* execute declarations (interpolants) */
3975   for (i = 0; i < mach->NumDeclarations; i++) {
3976      exec_declaration( mach, mach->Declarations+i );
3977   }
3978
3979   {
3980#if DEBUG_EXECUTION
3981      struct tgsi_exec_vector temps[TGSI_EXEC_NUM_TEMPS + TGSI_EXEC_NUM_TEMP_EXTRAS];
3982      struct tgsi_exec_vector outputs[PIPE_MAX_ATTRIBS];
3983      uint inst = 1;
3984
3985      memcpy(temps, mach->Temps, sizeof(temps));
3986      memcpy(outputs, mach->Outputs, sizeof(outputs));
3987#endif
3988
3989      /* execute instructions, until pc is set to -1 */
3990      while (pc != -1) {
3991
3992#if DEBUG_EXECUTION
3993         uint i;
3994
3995         tgsi_dump_instruction(&mach->Instructions[pc], inst++);
3996#endif
3997
3998         assert(pc < (int) mach->NumInstructions);
3999         exec_instruction(mach, mach->Instructions + pc, &pc);
4000
4001#if DEBUG_EXECUTION
4002         for (i = 0; i < TGSI_EXEC_NUM_TEMPS + TGSI_EXEC_NUM_TEMP_EXTRAS; i++) {
4003            if (memcmp(&temps[i], &mach->Temps[i], sizeof(temps[i]))) {
4004               uint j;
4005
4006               memcpy(&temps[i], &mach->Temps[i], sizeof(temps[i]));
4007               debug_printf("TEMP[%2u] = ", i);
4008               for (j = 0; j < 4; j++) {
4009                  if (j > 0) {
4010                     debug_printf("           ");
4011                  }
4012                  debug_printf("(%6f %u, %6f %u, %6f %u, %6f %u)\n",
4013                               temps[i].xyzw[0].f[j], temps[i].xyzw[0].u[j],
4014                               temps[i].xyzw[1].f[j], temps[i].xyzw[1].u[j],
4015                               temps[i].xyzw[2].f[j], temps[i].xyzw[2].u[j],
4016                               temps[i].xyzw[3].f[j], temps[i].xyzw[3].u[j]);
4017               }
4018            }
4019         }
4020         for (i = 0; i < PIPE_MAX_ATTRIBS; i++) {
4021            if (memcmp(&outputs[i], &mach->Outputs[i], sizeof(outputs[i]))) {
4022               uint j;
4023
4024               memcpy(&outputs[i], &mach->Outputs[i], sizeof(outputs[i]));
4025               debug_printf("OUT[%2u] =  ", i);
4026               for (j = 0; j < 4; j++) {
4027                  if (j > 0) {
4028                     debug_printf("           ");
4029                  }
4030                  debug_printf("(%6f %u, %6f %u, %6f %u, %6f %u)\n",
4031                               outputs[i].xyzw[0].f[j], outputs[i].xyzw[0].u[j],
4032                               outputs[i].xyzw[1].f[j], outputs[i].xyzw[1].u[j],
4033                               outputs[i].xyzw[2].f[j], outputs[i].xyzw[2].u[j],
4034                               outputs[i].xyzw[3].f[j], outputs[i].xyzw[3].u[j]);
4035               }
4036            }
4037         }
4038#endif
4039      }
4040   }
4041
4042#if 0
4043   /* we scale from floats in [0,1] to Zbuffer ints in sp_quad_depth_test.c */
4044   if (mach->Processor == TGSI_PROCESSOR_FRAGMENT) {
4045      /*
4046       * Scale back depth component.
4047       */
4048      for (i = 0; i < 4; i++)
4049         mach->Outputs[0].xyzw[2].f[i] *= ctx->DrawBuffer->_DepthMaxF;
4050   }
4051#endif
4052
4053   /* Strictly speaking, these assertions aren't really needed but they
4054    * can potentially catch some bugs in the control flow code.
4055    */
4056   assert(mach->CondStackTop == 0);
4057   assert(mach->LoopStackTop == 0);
4058   assert(mach->ContStackTop == 0);
4059   assert(mach->SwitchStackTop == 0);
4060   assert(mach->BreakStackTop == 0);
4061   assert(mach->CallStackTop == 0);
4062
4063   return ~mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
4064}
4065