tgsi_exec.c revision 7f1b9ddd12e97ac57c4818646c17521bb0c2c358
1/**************************************************************************
2 *
3 * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas.
4 * All Rights Reserved.
5 * Copyright 2009-2010 VMware, Inc.  All rights Reserved.
6 *
7 * Permission is hereby granted, free of charge, to any person obtaining a
8 * copy of this software and associated documentation files (the
9 * "Software"), to deal in the Software without restriction, including
10 * without limitation the rights to use, copy, modify, merge, publish,
11 * distribute, sub license, and/or sell copies of the Software, and to
12 * permit persons to whom the Software is furnished to do so, subject to
13 * the following conditions:
14 *
15 * The above copyright notice and this permission notice (including the
16 * next paragraph) shall be included in all copies or substantial portions
17 * of the Software.
18 *
19 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
20 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
21 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
22 * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
23 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
24 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
25 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 *
27 **************************************************************************/
28
29/**
30 * TGSI interpreter/executor.
31 *
32 * Flow control information:
33 *
34 * Since we operate on 'quads' (4 pixels or 4 vertices in parallel)
35 * flow control statements (IF/ELSE/ENDIF, LOOP/ENDLOOP) require special
36 * care since a condition may be true for some quad components but false
37 * for other components.
38 *
39 * We basically execute all statements (even if they're in the part of
40 * an IF/ELSE clause that's "not taken") and use a special mask to
41 * control writing to destination registers.  This is the ExecMask.
42 * See store_dest().
43 *
 * The ExecMask is computed from several other masks (CondMask, LoopMask,
 * ContMask, Switch.mask and FuncMask -- see UPDATE_EXEC_MASK()) which are
 * controlled by the flow control instructions (e.g. IF/ELSE/ENDIF,
 * LOOP/ENDLOOP and CONT).
47 *
48 *
49 * Authors:
50 *   Michal Krol
51 *   Brian Paul
52 */
53
54#include "pipe/p_compiler.h"
55#include "pipe/p_state.h"
56#include "pipe/p_shader_tokens.h"
57#include "tgsi/tgsi_dump.h"
58#include "tgsi/tgsi_parse.h"
59#include "tgsi/tgsi_util.h"
60#include "tgsi_exec.h"
61#include "util/u_memory.h"
62#include "util/u_math.h"
63
64
/*
 * When non-zero, micro_exp2(), micro_lg2() and micro_pow() call the
 * util_fast_*() helpers instead of powf()/logf().
 */
#define FAST_MATH 1

/*
 * Lane indices within a four-element channel, named after the position of
 * the element in the 2x2 quad.  Used by micro_ddx()/micro_ddy().
 */
#define TILE_TOP_LEFT     0
#define TILE_TOP_RIGHT    1
#define TILE_BOTTOM_LEFT  2
#define TILE_BOTTOM_RIGHT 3
71
72static void
73micro_abs(union tgsi_exec_channel *dst,
74          const union tgsi_exec_channel *src)
75{
76   dst->f[0] = fabsf(src->f[0]);
77   dst->f[1] = fabsf(src->f[1]);
78   dst->f[2] = fabsf(src->f[2]);
79   dst->f[3] = fabsf(src->f[3]);
80}
81
82static void
83micro_arl(union tgsi_exec_channel *dst,
84          const union tgsi_exec_channel *src)
85{
86   dst->i[0] = (int)floorf(src->f[0]);
87   dst->i[1] = (int)floorf(src->f[1]);
88   dst->i[2] = (int)floorf(src->f[2]);
89   dst->i[3] = (int)floorf(src->f[3]);
90}
91
92static void
93micro_arr(union tgsi_exec_channel *dst,
94          const union tgsi_exec_channel *src)
95{
96   dst->i[0] = (int)floorf(src->f[0] + 0.5f);
97   dst->i[1] = (int)floorf(src->f[1] + 0.5f);
98   dst->i[2] = (int)floorf(src->f[2] + 0.5f);
99   dst->i[3] = (int)floorf(src->f[3] + 0.5f);
100}
101
102static void
103micro_ceil(union tgsi_exec_channel *dst,
104           const union tgsi_exec_channel *src)
105{
106   dst->f[0] = ceilf(src->f[0]);
107   dst->f[1] = ceilf(src->f[1]);
108   dst->f[2] = ceilf(src->f[2]);
109   dst->f[3] = ceilf(src->f[3]);
110}
111
112static void
113micro_clamp(union tgsi_exec_channel *dst,
114            const union tgsi_exec_channel *src0,
115            const union tgsi_exec_channel *src1,
116            const union tgsi_exec_channel *src2)
117{
118   dst->f[0] = src0->f[0] < src1->f[0] ? src1->f[0] : src0->f[0] > src2->f[0] ? src2->f[0] : src0->f[0];
119   dst->f[1] = src0->f[1] < src1->f[1] ? src1->f[1] : src0->f[1] > src2->f[1] ? src2->f[1] : src0->f[1];
120   dst->f[2] = src0->f[2] < src1->f[2] ? src1->f[2] : src0->f[2] > src2->f[2] ? src2->f[2] : src0->f[2];
121   dst->f[3] = src0->f[3] < src1->f[3] ? src1->f[3] : src0->f[3] > src2->f[3] ? src2->f[3] : src0->f[3];
122}
123
124static void
125micro_cmp(union tgsi_exec_channel *dst,
126          const union tgsi_exec_channel *src0,
127          const union tgsi_exec_channel *src1,
128          const union tgsi_exec_channel *src2)
129{
130   dst->f[0] = src0->f[0] < 0.0f ? src1->f[0] : src2->f[0];
131   dst->f[1] = src0->f[1] < 0.0f ? src1->f[1] : src2->f[1];
132   dst->f[2] = src0->f[2] < 0.0f ? src1->f[2] : src2->f[2];
133   dst->f[3] = src0->f[3] < 0.0f ? src1->f[3] : src2->f[3];
134}
135
136static void
137micro_cnd(union tgsi_exec_channel *dst,
138          const union tgsi_exec_channel *src0,
139          const union tgsi_exec_channel *src1,
140          const union tgsi_exec_channel *src2)
141{
142   dst->f[0] = src2->f[0] > 0.5f ? src0->f[0] : src1->f[0];
143   dst->f[1] = src2->f[1] > 0.5f ? src0->f[1] : src1->f[1];
144   dst->f[2] = src2->f[2] > 0.5f ? src0->f[2] : src1->f[2];
145   dst->f[3] = src2->f[3] > 0.5f ? src0->f[3] : src1->f[3];
146}
147
148static void
149micro_cos(union tgsi_exec_channel *dst,
150          const union tgsi_exec_channel *src)
151{
152   dst->f[0] = cosf(src->f[0]);
153   dst->f[1] = cosf(src->f[1]);
154   dst->f[2] = cosf(src->f[2]);
155   dst->f[3] = cosf(src->f[3]);
156}
157
158static void
159micro_ddx(union tgsi_exec_channel *dst,
160          const union tgsi_exec_channel *src)
161{
162   dst->f[0] =
163   dst->f[1] =
164   dst->f[2] =
165   dst->f[3] = src->f[TILE_BOTTOM_RIGHT] - src->f[TILE_BOTTOM_LEFT];
166}
167
168static void
169micro_ddy(union tgsi_exec_channel *dst,
170          const union tgsi_exec_channel *src)
171{
172   dst->f[0] =
173   dst->f[1] =
174   dst->f[2] =
175   dst->f[3] = src->f[TILE_BOTTOM_LEFT] - src->f[TILE_TOP_LEFT];
176}
177
178static void
179micro_exp2(union tgsi_exec_channel *dst,
180           const union tgsi_exec_channel *src)
181{
182#if FAST_MATH
183   dst->f[0] = util_fast_exp2(src->f[0]);
184   dst->f[1] = util_fast_exp2(src->f[1]);
185   dst->f[2] = util_fast_exp2(src->f[2]);
186   dst->f[3] = util_fast_exp2(src->f[3]);
187#else
188#if DEBUG
189   /* Inf is okay for this instruction, so clamp it to silence assertions. */
190   uint i;
191   union tgsi_exec_channel clamped;
192
193   for (i = 0; i < 4; i++) {
194      if (src->f[i] > 127.99999f) {
195         clamped.f[i] = 127.99999f;
196      } else if (src->f[i] < -126.99999f) {
197         clamped.f[i] = -126.99999f;
198      } else {
199         clamped.f[i] = src->f[i];
200      }
201   }
202   src = &clamped;
203#endif /* DEBUG */
204
205   dst->f[0] = powf(2.0f, src->f[0]);
206   dst->f[1] = powf(2.0f, src->f[1]);
207   dst->f[2] = powf(2.0f, src->f[2]);
208   dst->f[3] = powf(2.0f, src->f[3]);
209#endif /* FAST_MATH */
210}
211
212static void
213micro_flr(union tgsi_exec_channel *dst,
214          const union tgsi_exec_channel *src)
215{
216   dst->f[0] = floorf(src->f[0]);
217   dst->f[1] = floorf(src->f[1]);
218   dst->f[2] = floorf(src->f[2]);
219   dst->f[3] = floorf(src->f[3]);
220}
221
222static void
223micro_frc(union tgsi_exec_channel *dst,
224          const union tgsi_exec_channel *src)
225{
226   dst->f[0] = src->f[0] - floorf(src->f[0]);
227   dst->f[1] = src->f[1] - floorf(src->f[1]);
228   dst->f[2] = src->f[2] - floorf(src->f[2]);
229   dst->f[3] = src->f[3] - floorf(src->f[3]);
230}
231
232static void
233micro_iabs(union tgsi_exec_channel *dst,
234           const union tgsi_exec_channel *src)
235{
236   dst->i[0] = src->i[0] >= 0 ? src->i[0] : -src->i[0];
237   dst->i[1] = src->i[1] >= 0 ? src->i[1] : -src->i[1];
238   dst->i[2] = src->i[2] >= 0 ? src->i[2] : -src->i[2];
239   dst->i[3] = src->i[3] >= 0 ? src->i[3] : -src->i[3];
240}
241
242static void
243micro_ineg(union tgsi_exec_channel *dst,
244           const union tgsi_exec_channel *src)
245{
246   dst->i[0] = -src->i[0];
247   dst->i[1] = -src->i[1];
248   dst->i[2] = -src->i[2];
249   dst->i[3] = -src->i[3];
250}
251
252static void
253micro_lg2(union tgsi_exec_channel *dst,
254          const union tgsi_exec_channel *src)
255{
256#if FAST_MATH
257   dst->f[0] = util_fast_log2(src->f[0]);
258   dst->f[1] = util_fast_log2(src->f[1]);
259   dst->f[2] = util_fast_log2(src->f[2]);
260   dst->f[3] = util_fast_log2(src->f[3]);
261#else
262   dst->f[0] = logf(src->f[0]) * 1.442695f;
263   dst->f[1] = logf(src->f[1]) * 1.442695f;
264   dst->f[2] = logf(src->f[2]) * 1.442695f;
265   dst->f[3] = logf(src->f[3]) * 1.442695f;
266#endif
267}
268
269static void
270micro_lrp(union tgsi_exec_channel *dst,
271          const union tgsi_exec_channel *src0,
272          const union tgsi_exec_channel *src1,
273          const union tgsi_exec_channel *src2)
274{
275   dst->f[0] = src0->f[0] * (src1->f[0] - src2->f[0]) + src2->f[0];
276   dst->f[1] = src0->f[1] * (src1->f[1] - src2->f[1]) + src2->f[1];
277   dst->f[2] = src0->f[2] * (src1->f[2] - src2->f[2]) + src2->f[2];
278   dst->f[3] = src0->f[3] * (src1->f[3] - src2->f[3]) + src2->f[3];
279}
280
281static void
282micro_mad(union tgsi_exec_channel *dst,
283          const union tgsi_exec_channel *src0,
284          const union tgsi_exec_channel *src1,
285          const union tgsi_exec_channel *src2)
286{
287   dst->f[0] = src0->f[0] * src1->f[0] + src2->f[0];
288   dst->f[1] = src0->f[1] * src1->f[1] + src2->f[1];
289   dst->f[2] = src0->f[2] * src1->f[2] + src2->f[2];
290   dst->f[3] = src0->f[3] * src1->f[3] + src2->f[3];
291}
292
293static void
294micro_mov(union tgsi_exec_channel *dst,
295          const union tgsi_exec_channel *src)
296{
297   dst->u[0] = src->u[0];
298   dst->u[1] = src->u[1];
299   dst->u[2] = src->u[2];
300   dst->u[3] = src->u[3];
301}
302
303static void
304micro_rcp(union tgsi_exec_channel *dst,
305          const union tgsi_exec_channel *src)
306{
307#if 0 /* for debugging */
308   assert(src->f[0] != 0.0f);
309   assert(src->f[1] != 0.0f);
310   assert(src->f[2] != 0.0f);
311   assert(src->f[3] != 0.0f);
312#endif
313   dst->f[0] = 1.0f / src->f[0];
314   dst->f[1] = 1.0f / src->f[1];
315   dst->f[2] = 1.0f / src->f[2];
316   dst->f[3] = 1.0f / src->f[3];
317}
318
319static void
320micro_rnd(union tgsi_exec_channel *dst,
321          const union tgsi_exec_channel *src)
322{
323   dst->f[0] = floorf(src->f[0] + 0.5f);
324   dst->f[1] = floorf(src->f[1] + 0.5f);
325   dst->f[2] = floorf(src->f[2] + 0.5f);
326   dst->f[3] = floorf(src->f[3] + 0.5f);
327}
328
329static void
330micro_rsq(union tgsi_exec_channel *dst,
331          const union tgsi_exec_channel *src)
332{
333#if 0 /* for debugging */
334   assert(src->f[0] != 0.0f);
335   assert(src->f[1] != 0.0f);
336   assert(src->f[2] != 0.0f);
337   assert(src->f[3] != 0.0f);
338#endif
339   dst->f[0] = 1.0f / sqrtf(fabsf(src->f[0]));
340   dst->f[1] = 1.0f / sqrtf(fabsf(src->f[1]));
341   dst->f[2] = 1.0f / sqrtf(fabsf(src->f[2]));
342   dst->f[3] = 1.0f / sqrtf(fabsf(src->f[3]));
343}
344
345static void
346micro_seq(union tgsi_exec_channel *dst,
347          const union tgsi_exec_channel *src0,
348          const union tgsi_exec_channel *src1)
349{
350   dst->f[0] = src0->f[0] == src1->f[0] ? 1.0f : 0.0f;
351   dst->f[1] = src0->f[1] == src1->f[1] ? 1.0f : 0.0f;
352   dst->f[2] = src0->f[2] == src1->f[2] ? 1.0f : 0.0f;
353   dst->f[3] = src0->f[3] == src1->f[3] ? 1.0f : 0.0f;
354}
355
356static void
357micro_sge(union tgsi_exec_channel *dst,
358          const union tgsi_exec_channel *src0,
359          const union tgsi_exec_channel *src1)
360{
361   dst->f[0] = src0->f[0] >= src1->f[0] ? 1.0f : 0.0f;
362   dst->f[1] = src0->f[1] >= src1->f[1] ? 1.0f : 0.0f;
363   dst->f[2] = src0->f[2] >= src1->f[2] ? 1.0f : 0.0f;
364   dst->f[3] = src0->f[3] >= src1->f[3] ? 1.0f : 0.0f;
365}
366
367static void
368micro_sgn(union tgsi_exec_channel *dst,
369          const union tgsi_exec_channel *src)
370{
371   dst->f[0] = src->f[0] < 0.0f ? -1.0f : src->f[0] > 0.0f ? 1.0f : 0.0f;
372   dst->f[1] = src->f[1] < 0.0f ? -1.0f : src->f[1] > 0.0f ? 1.0f : 0.0f;
373   dst->f[2] = src->f[2] < 0.0f ? -1.0f : src->f[2] > 0.0f ? 1.0f : 0.0f;
374   dst->f[3] = src->f[3] < 0.0f ? -1.0f : src->f[3] > 0.0f ? 1.0f : 0.0f;
375}
376
377static void
378micro_sgt(union tgsi_exec_channel *dst,
379          const union tgsi_exec_channel *src0,
380          const union tgsi_exec_channel *src1)
381{
382   dst->f[0] = src0->f[0] > src1->f[0] ? 1.0f : 0.0f;
383   dst->f[1] = src0->f[1] > src1->f[1] ? 1.0f : 0.0f;
384   dst->f[2] = src0->f[2] > src1->f[2] ? 1.0f : 0.0f;
385   dst->f[3] = src0->f[3] > src1->f[3] ? 1.0f : 0.0f;
386}
387
388static void
389micro_sin(union tgsi_exec_channel *dst,
390          const union tgsi_exec_channel *src)
391{
392   dst->f[0] = sinf(src->f[0]);
393   dst->f[1] = sinf(src->f[1]);
394   dst->f[2] = sinf(src->f[2]);
395   dst->f[3] = sinf(src->f[3]);
396}
397
398static void
399micro_sle(union tgsi_exec_channel *dst,
400          const union tgsi_exec_channel *src0,
401          const union tgsi_exec_channel *src1)
402{
403   dst->f[0] = src0->f[0] <= src1->f[0] ? 1.0f : 0.0f;
404   dst->f[1] = src0->f[1] <= src1->f[1] ? 1.0f : 0.0f;
405   dst->f[2] = src0->f[2] <= src1->f[2] ? 1.0f : 0.0f;
406   dst->f[3] = src0->f[3] <= src1->f[3] ? 1.0f : 0.0f;
407}
408
409static void
410micro_slt(union tgsi_exec_channel *dst,
411          const union tgsi_exec_channel *src0,
412          const union tgsi_exec_channel *src1)
413{
414   dst->f[0] = src0->f[0] < src1->f[0] ? 1.0f : 0.0f;
415   dst->f[1] = src0->f[1] < src1->f[1] ? 1.0f : 0.0f;
416   dst->f[2] = src0->f[2] < src1->f[2] ? 1.0f : 0.0f;
417   dst->f[3] = src0->f[3] < src1->f[3] ? 1.0f : 0.0f;
418}
419
420static void
421micro_sne(union tgsi_exec_channel *dst,
422          const union tgsi_exec_channel *src0,
423          const union tgsi_exec_channel *src1)
424{
425   dst->f[0] = src0->f[0] != src1->f[0] ? 1.0f : 0.0f;
426   dst->f[1] = src0->f[1] != src1->f[1] ? 1.0f : 0.0f;
427   dst->f[2] = src0->f[2] != src1->f[2] ? 1.0f : 0.0f;
428   dst->f[3] = src0->f[3] != src1->f[3] ? 1.0f : 0.0f;
429}
430
431static void
432micro_sfl(union tgsi_exec_channel *dst)
433{
434   dst->f[0] = 0.0f;
435   dst->f[1] = 0.0f;
436   dst->f[2] = 0.0f;
437   dst->f[3] = 0.0f;
438}
439
440static void
441micro_str(union tgsi_exec_channel *dst)
442{
443   dst->f[0] = 1.0f;
444   dst->f[1] = 1.0f;
445   dst->f[2] = 1.0f;
446   dst->f[3] = 1.0f;
447}
448
449static void
450micro_trunc(union tgsi_exec_channel *dst,
451            const union tgsi_exec_channel *src)
452{
453   dst->f[0] = (float)(int)src->f[0];
454   dst->f[1] = (float)(int)src->f[1];
455   dst->f[2] = (float)(int)src->f[2];
456   dst->f[3] = (float)(int)src->f[3];
457}
458
459
/*
 * Component indices in X, Y, Z, W order.
 * NOTE(review): presumably used to index the xyzw[] array of a
 * tgsi_exec_vector; the users are outside this part of the file.
 */
#define CHAN_X  0
#define CHAN_Y  1
#define CHAN_Z  2
#define CHAN_W  3
464
/*
 * Data type tag for channel contents.
 * NOTE(review): presumably selects between the f[] (float), i[] (int) and
 * u[] (unsigned) views of union tgsi_exec_channel; the users are outside
 * this part of the file -- confirm.
 */
enum tgsi_exec_datatype {
   TGSI_EXEC_DATA_FLOAT,
   TGSI_EXEC_DATA_INT,
   TGSI_EXEC_DATA_UINT
};
470
/*
 * Shorthand locations of various utility registers (_I = Index, _C = Channel).
 * Each name is a plain alias for the corresponding TGSI_EXEC_TEMP_* constant.
 */
#define TEMP_KILMASK_I     TGSI_EXEC_TEMP_KILMASK_I
#define TEMP_KILMASK_C     TGSI_EXEC_TEMP_KILMASK_C
#define TEMP_OUTPUT_I      TGSI_EXEC_TEMP_OUTPUT_I
#define TEMP_OUTPUT_C      TGSI_EXEC_TEMP_OUTPUT_C
#define TEMP_PRIMITIVE_I   TGSI_EXEC_TEMP_PRIMITIVE_I
#define TEMP_PRIMITIVE_C   TGSI_EXEC_TEMP_PRIMITIVE_C
480
481
/**
 * Recompute the execution mask as the bitwise AND of the conditional,
 * loop, continue, switch and function masks.
 *
 * MACH is a pointer to the machine.  It is parenthesized so that any
 * pointer expression may be passed, but it is evaluated several times and
 * therefore must not have side effects.
 */
#define UPDATE_EXEC_MASK(MACH) \
      ((MACH)->ExecMask = (MACH)->CondMask & (MACH)->LoopMask & \
                          (MACH)->ContMask & (MACH)->Switch.mask & \
                          (MACH)->FuncMask)
485
486
/*
 * Constant channels holding the same value (0, 1, 128 and -128) in all
 * four lanes.
 * NOTE(review): the brace initializers set the first member of the union,
 * presumably the float array f[] -- confirm against tgsi_exec.h.
 */
static const union tgsi_exec_channel ZeroVec =
   { { 0.0, 0.0, 0.0, 0.0 } };

static const union tgsi_exec_channel OneVec = {
   {1.0f, 1.0f, 1.0f, 1.0f}
};

static const union tgsi_exec_channel P128Vec = {
   {128.0f, 128.0f, 128.0f, 128.0f}
};

static const union tgsi_exec_channel M128Vec = {
   {-128.0f, -128.0f, -128.0f, -128.0f}
};
501
502
/**
 * Assert that none of the float values in 'chan' are infinite or NaN.
 * NaN and Inf may occur normally during program execution and should
 * not lead to crashes, etc.  But when debugging, it's helpful to catch
 * them.
 *
 * \param chan  channel whose four float lanes (f[0]..f[3]) are checked
 *
 * The check is made only through assert(), so it has no effect in builds
 * where assert() expands to nothing.
 */
static INLINE void
check_inf_or_nan(const union tgsi_exec_channel *chan)
{
   assert(!util_is_inf_or_nan((chan)->f[0]));
   assert(!util_is_inf_or_nan((chan)->f[1]));
   assert(!util_is_inf_or_nan((chan)->f[2]));
   assert(!util_is_inf_or_nan((chan)->f[3]));
}
517
518
#ifdef DEBUG
/**
 * Debug helper: print the four float lanes of 'chan' on one line,
 * prefixed with the label 'msg'.
 */
static void
print_chan(const char *msg, const union tgsi_exec_channel *chan)
{
   const float *lanes = chan->f;

   debug_printf("%s = {%f, %f, %f, %f}\n",
                msg, lanes[0], lanes[1], lanes[2], lanes[3]);
}
#endif
527
528
#ifdef DEBUG
/**
 * Debug helper: print temporary register 'index' of 'mach', one line per
 * X/Y/Z/W component with its four float lanes.
 */
static void
print_temp(const struct tgsi_exec_machine *mach, uint index)
{
   const struct tgsi_exec_vector *vec = &mach->Temps[index];
   int comp = 0;

   debug_printf("Temp[%u] =\n", index);
   while (comp < 4) {
      const float *lanes = vec->xyzw[comp].f;

      debug_printf("  %c: { %f, %f, %f, %f }\n",
                   "XYZW"[comp],
                   lanes[0],
                   lanes[1],
                   lanes[2],
                   lanes[3]);
      comp++;
   }
}
#endif
546
547
548void
549tgsi_exec_set_constant_buffers(struct tgsi_exec_machine *mach,
550                               unsigned num_bufs,
551                               const void **bufs,
552                               const unsigned *buf_sizes)
553{
554   unsigned i;
555
556   for (i = 0; i < num_bufs; i++) {
557      mach->Consts[i] = bufs[i];
558      mach->ConstsSize[i] = buf_sizes[i];
559   }
560}
561
562
563/**
564 * Check if there's a potential src/dst register data dependency when
565 * using SOA execution.
566 * Example:
567 *   MOV T, T.yxwz;
568 * This would expand into:
569 *   MOV t0, t1;
570 *   MOV t1, t0;
571 *   MOV t2, t3;
572 *   MOV t3, t2;
573 * The second instruction will have the wrong value for t0 if executed as-is.
574 */
575boolean
576tgsi_check_soa_dependencies(const struct tgsi_full_instruction *inst)
577{
578   uint i, chan;
579
580   uint writemask = inst->Dst[0].Register.WriteMask;
581   if (writemask == TGSI_WRITEMASK_X ||
582       writemask == TGSI_WRITEMASK_Y ||
583       writemask == TGSI_WRITEMASK_Z ||
584       writemask == TGSI_WRITEMASK_W ||
585       writemask == TGSI_WRITEMASK_NONE) {
586      /* no chance of data dependency */
587      return FALSE;
588   }
589
590   /* loop over src regs */
591   for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
592      if ((inst->Src[i].Register.File ==
593           inst->Dst[0].Register.File) &&
594          ((inst->Src[i].Register.Index ==
595            inst->Dst[0].Register.Index) ||
596           inst->Src[i].Register.Indirect ||
597           inst->Dst[0].Register.Indirect)) {
598         /* loop over dest channels */
599         uint channelsWritten = 0x0;
600         for (chan = 0; chan < NUM_CHANNELS; chan++) {
601            if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
602               /* check if we're reading a channel that's been written */
603               uint swizzle = tgsi_util_get_full_src_register_swizzle(&inst->Src[i], chan);
604               if (channelsWritten & (1 << swizzle)) {
605                  return TRUE;
606               }
607
608               channelsWritten |= (1 << chan);
609            }
610         }
611      }
612   }
613   return FALSE;
614}
615
616
617/**
618 * Initialize machine state by expanding tokens to full instructions,
619 * allocating temporary storage, setting up constants, etc.
620 * After this, we can call tgsi_exec_machine_run() many times.
621 */
622void
623tgsi_exec_machine_bind_shader(
624   struct tgsi_exec_machine *mach,
625   const struct tgsi_token *tokens,
626   uint numSamplers,
627   struct tgsi_sampler **samplers)
628{
629   uint k;
630   struct tgsi_parse_context parse;
631   struct tgsi_full_instruction *instructions;
632   struct tgsi_full_declaration *declarations;
633   uint maxInstructions = 10, numInstructions = 0;
634   uint maxDeclarations = 10, numDeclarations = 0;
635
636#if 0
637   tgsi_dump(tokens, 0);
638#endif
639
640   util_init_math();
641
642   if (numSamplers) {
643      assert(samplers);
644   }
645
646   mach->Tokens = tokens;
647   mach->Samplers = samplers;
648
649   if (!tokens) {
650      /* unbind and free all */
651      if (mach->Declarations) {
652         FREE( mach->Declarations );
653      }
654      mach->Declarations = NULL;
655      mach->NumDeclarations = 0;
656
657      if (mach->Instructions) {
658         FREE( mach->Instructions );
659      }
660      mach->Instructions = NULL;
661      mach->NumInstructions = 0;
662
663      return;
664   }
665
666   k = tgsi_parse_init (&parse, mach->Tokens);
667   if (k != TGSI_PARSE_OK) {
668      debug_printf( "Problem parsing!\n" );
669      return;
670   }
671
672   mach->Processor = parse.FullHeader.Processor.Processor;
673   mach->ImmLimit = 0;
674
675   if (mach->Processor == TGSI_PROCESSOR_GEOMETRY &&
676       !mach->UsedGeometryShader) {
677      struct tgsi_exec_vector *inputs =
678         align_malloc(sizeof(struct tgsi_exec_vector) *
679                      TGSI_MAX_PRIM_VERTICES * PIPE_MAX_ATTRIBS,
680                      16);
681      struct tgsi_exec_vector *outputs =
682         align_malloc(sizeof(struct tgsi_exec_vector) *
683                      TGSI_MAX_TOTAL_VERTICES, 16);
684
685      if (!inputs)
686         return;
687      if (!outputs) {
688         align_free(inputs);
689         return;
690      }
691
692      align_free(mach->Inputs);
693      align_free(mach->Outputs);
694
695      mach->Inputs = inputs;
696      mach->Outputs = outputs;
697      mach->UsedGeometryShader = TRUE;
698   }
699
700   declarations = (struct tgsi_full_declaration *)
701      MALLOC( maxDeclarations * sizeof(struct tgsi_full_declaration) );
702
703   if (!declarations) {
704      return;
705   }
706
707   instructions = (struct tgsi_full_instruction *)
708      MALLOC( maxInstructions * sizeof(struct tgsi_full_instruction) );
709
710   if (!instructions) {
711      FREE( declarations );
712      return;
713   }
714
715   while( !tgsi_parse_end_of_tokens( &parse ) ) {
716      uint i;
717
718      tgsi_parse_token( &parse );
719      switch( parse.FullToken.Token.Type ) {
720      case TGSI_TOKEN_TYPE_DECLARATION:
721         /* save expanded declaration */
722         if (numDeclarations == maxDeclarations) {
723            declarations = REALLOC(declarations,
724                                   maxDeclarations
725                                   * sizeof(struct tgsi_full_declaration),
726                                   (maxDeclarations + 10)
727                                   * sizeof(struct tgsi_full_declaration));
728            maxDeclarations += 10;
729         }
730         if (parse.FullToken.FullDeclaration.Declaration.File == TGSI_FILE_OUTPUT) {
731            unsigned reg;
732            for (reg = parse.FullToken.FullDeclaration.Range.First;
733                 reg <= parse.FullToken.FullDeclaration.Range.Last;
734                 ++reg) {
735               ++mach->NumOutputs;
736            }
737         }
738         if (parse.FullToken.FullDeclaration.Declaration.File ==
739             TGSI_FILE_IMMEDIATE_ARRAY) {
740            unsigned reg;
741            struct tgsi_full_declaration *decl =
742               &parse.FullToken.FullDeclaration;
743            debug_assert(decl->Range.Last < TGSI_EXEC_NUM_IMMEDIATES);
744            for (reg = decl->Range.First; reg <= decl->Range.Last; ++reg) {
745               for( i = 0; i < 4; i++ ) {
746                  int idx = reg * 4 + i;
747                  mach->ImmArray[reg][i] = decl->ImmediateData.u[idx].Float;
748               }
749            }
750         }
751         memcpy(declarations + numDeclarations,
752                &parse.FullToken.FullDeclaration,
753                sizeof(declarations[0]));
754         numDeclarations++;
755         break;
756
757      case TGSI_TOKEN_TYPE_IMMEDIATE:
758         {
759            uint size = parse.FullToken.FullImmediate.Immediate.NrTokens - 1;
760            assert( size <= 4 );
761            assert( mach->ImmLimit + 1 <= TGSI_EXEC_NUM_IMMEDIATES );
762
763            for( i = 0; i < size; i++ ) {
764               mach->Imms[mach->ImmLimit][i] =
765		  parse.FullToken.FullImmediate.u[i].Float;
766            }
767            mach->ImmLimit += 1;
768         }
769         break;
770
771      case TGSI_TOKEN_TYPE_INSTRUCTION:
772
773         /* save expanded instruction */
774         if (numInstructions == maxInstructions) {
775            instructions = REALLOC(instructions,
776                                   maxInstructions
777                                   * sizeof(struct tgsi_full_instruction),
778                                   (maxInstructions + 10)
779                                   * sizeof(struct tgsi_full_instruction));
780            maxInstructions += 10;
781         }
782
783         memcpy(instructions + numInstructions,
784                &parse.FullToken.FullInstruction,
785                sizeof(instructions[0]));
786
787         numInstructions++;
788         break;
789
790      case TGSI_TOKEN_TYPE_PROPERTY:
791         break;
792
793      default:
794         assert( 0 );
795      }
796   }
797   tgsi_parse_free (&parse);
798
799   if (mach->Declarations) {
800      FREE( mach->Declarations );
801   }
802   mach->Declarations = declarations;
803   mach->NumDeclarations = numDeclarations;
804
805   if (mach->Instructions) {
806      FREE( mach->Instructions );
807   }
808   mach->Instructions = instructions;
809   mach->NumInstructions = numInstructions;
810}
811
812
813struct tgsi_exec_machine *
814tgsi_exec_machine_create( void )
815{
816   struct tgsi_exec_machine *mach;
817   uint i;
818
819   mach = align_malloc( sizeof *mach, 16 );
820   if (!mach)
821      goto fail;
822
823   memset(mach, 0, sizeof(*mach));
824
825   mach->Addrs = &mach->Temps[TGSI_EXEC_TEMP_ADDR];
826   mach->MaxGeometryShaderOutputs = TGSI_MAX_TOTAL_VERTICES;
827   mach->Predicates = &mach->Temps[TGSI_EXEC_TEMP_P0];
828
829   mach->Inputs = align_malloc(sizeof(struct tgsi_exec_vector) * PIPE_MAX_ATTRIBS, 16);
830   mach->Outputs = align_malloc(sizeof(struct tgsi_exec_vector) * PIPE_MAX_ATTRIBS, 16);
831   if (!mach->Inputs || !mach->Outputs)
832      goto fail;
833
834   /* Setup constants needed by the SSE2 executor. */
835   for( i = 0; i < 4; i++ ) {
836      mach->Temps[TGSI_EXEC_TEMP_00000000_I].xyzw[TGSI_EXEC_TEMP_00000000_C].u[i] = 0x00000000;
837      mach->Temps[TGSI_EXEC_TEMP_7FFFFFFF_I].xyzw[TGSI_EXEC_TEMP_7FFFFFFF_C].u[i] = 0x7FFFFFFF;
838      mach->Temps[TGSI_EXEC_TEMP_80000000_I].xyzw[TGSI_EXEC_TEMP_80000000_C].u[i] = 0x80000000;
839      mach->Temps[TGSI_EXEC_TEMP_FFFFFFFF_I].xyzw[TGSI_EXEC_TEMP_FFFFFFFF_C].u[i] = 0xFFFFFFFF;    /* not used */
840      mach->Temps[TGSI_EXEC_TEMP_ONE_I].xyzw[TGSI_EXEC_TEMP_ONE_C].f[i] = 1.0f;
841      mach->Temps[TGSI_EXEC_TEMP_TWO_I].xyzw[TGSI_EXEC_TEMP_TWO_C].f[i] = 2.0f;    /* not used */
842      mach->Temps[TGSI_EXEC_TEMP_128_I].xyzw[TGSI_EXEC_TEMP_128_C].f[i] = 128.0f;
843      mach->Temps[TGSI_EXEC_TEMP_MINUS_128_I].xyzw[TGSI_EXEC_TEMP_MINUS_128_C].f[i] = -128.0f;
844      mach->Temps[TGSI_EXEC_TEMP_THREE_I].xyzw[TGSI_EXEC_TEMP_THREE_C].f[i] = 3.0f;
845      mach->Temps[TGSI_EXEC_TEMP_HALF_I].xyzw[TGSI_EXEC_TEMP_HALF_C].f[i] = 0.5f;
846   }
847
848#ifdef DEBUG
849   /* silence warnings */
850   (void) print_chan;
851   (void) print_temp;
852#endif
853
854   return mach;
855
856fail:
857   if (mach) {
858      align_free(mach->Inputs);
859      align_free(mach->Outputs);
860      align_free(mach);
861   }
862   return NULL;
863}
864
865
866void
867tgsi_exec_machine_destroy(struct tgsi_exec_machine *mach)
868{
869   if (mach) {
870      if (mach->Instructions)
871         FREE(mach->Instructions);
872      if (mach->Declarations)
873         FREE(mach->Declarations);
874
875      align_free(mach->Inputs);
876      align_free(mach->Outputs);
877
878      align_free(mach);
879   }
880}
881
882static void
883micro_add(union tgsi_exec_channel *dst,
884          const union tgsi_exec_channel *src0,
885          const union tgsi_exec_channel *src1)
886{
887   dst->f[0] = src0->f[0] + src1->f[0];
888   dst->f[1] = src0->f[1] + src1->f[1];
889   dst->f[2] = src0->f[2] + src1->f[2];
890   dst->f[3] = src0->f[3] + src1->f[3];
891}
892
893static void
894micro_div(
895   union tgsi_exec_channel *dst,
896   const union tgsi_exec_channel *src0,
897   const union tgsi_exec_channel *src1 )
898{
899   if (src1->f[0] != 0) {
900      dst->f[0] = src0->f[0] / src1->f[0];
901   }
902   if (src1->f[1] != 0) {
903      dst->f[1] = src0->f[1] / src1->f[1];
904   }
905   if (src1->f[2] != 0) {
906      dst->f[2] = src0->f[2] / src1->f[2];
907   }
908   if (src1->f[3] != 0) {
909      dst->f[3] = src0->f[3] / src1->f[3];
910   }
911}
912
913static void
914micro_rcc(union tgsi_exec_channel *dst,
915          const union tgsi_exec_channel *src)
916{
917   uint i;
918
919   for (i = 0; i < 4; i++) {
920      float recip = 1.0f / src->f[i];
921
922      if (recip > 0.0f) {
923         if (recip > 1.884467e+019f) {
924            dst->f[i] = 1.884467e+019f;
925         }
926         else if (recip < 5.42101e-020f) {
927            dst->f[i] = 5.42101e-020f;
928         }
929         else {
930            dst->f[i] = recip;
931         }
932      }
933      else {
934         if (recip < -1.884467e+019f) {
935            dst->f[i] = -1.884467e+019f;
936         }
937         else if (recip > -5.42101e-020f) {
938            dst->f[i] = -5.42101e-020f;
939         }
940         else {
941            dst->f[i] = recip;
942         }
943      }
944   }
945}
946
947static void
948micro_lt(
949   union tgsi_exec_channel *dst,
950   const union tgsi_exec_channel *src0,
951   const union tgsi_exec_channel *src1,
952   const union tgsi_exec_channel *src2,
953   const union tgsi_exec_channel *src3 )
954{
955   dst->f[0] = src0->f[0] < src1->f[0] ? src2->f[0] : src3->f[0];
956   dst->f[1] = src0->f[1] < src1->f[1] ? src2->f[1] : src3->f[1];
957   dst->f[2] = src0->f[2] < src1->f[2] ? src2->f[2] : src3->f[2];
958   dst->f[3] = src0->f[3] < src1->f[3] ? src2->f[3] : src3->f[3];
959}
960
961static void
962micro_max(union tgsi_exec_channel *dst,
963          const union tgsi_exec_channel *src0,
964          const union tgsi_exec_channel *src1)
965{
966   dst->f[0] = src0->f[0] > src1->f[0] ? src0->f[0] : src1->f[0];
967   dst->f[1] = src0->f[1] > src1->f[1] ? src0->f[1] : src1->f[1];
968   dst->f[2] = src0->f[2] > src1->f[2] ? src0->f[2] : src1->f[2];
969   dst->f[3] = src0->f[3] > src1->f[3] ? src0->f[3] : src1->f[3];
970}
971
972static void
973micro_min(union tgsi_exec_channel *dst,
974          const union tgsi_exec_channel *src0,
975          const union tgsi_exec_channel *src1)
976{
977   dst->f[0] = src0->f[0] < src1->f[0] ? src0->f[0] : src1->f[0];
978   dst->f[1] = src0->f[1] < src1->f[1] ? src0->f[1] : src1->f[1];
979   dst->f[2] = src0->f[2] < src1->f[2] ? src0->f[2] : src1->f[2];
980   dst->f[3] = src0->f[3] < src1->f[3] ? src0->f[3] : src1->f[3];
981}
982
983static void
984micro_mul(union tgsi_exec_channel *dst,
985          const union tgsi_exec_channel *src0,
986          const union tgsi_exec_channel *src1)
987{
988   dst->f[0] = src0->f[0] * src1->f[0];
989   dst->f[1] = src0->f[1] * src1->f[1];
990   dst->f[2] = src0->f[2] * src1->f[2];
991   dst->f[3] = src0->f[3] * src1->f[3];
992}
993
994static void
995micro_neg(
996   union tgsi_exec_channel *dst,
997   const union tgsi_exec_channel *src )
998{
999   dst->f[0] = -src->f[0];
1000   dst->f[1] = -src->f[1];
1001   dst->f[2] = -src->f[2];
1002   dst->f[3] = -src->f[3];
1003}
1004
1005static void
1006micro_pow(
1007   union tgsi_exec_channel *dst,
1008   const union tgsi_exec_channel *src0,
1009   const union tgsi_exec_channel *src1 )
1010{
1011#if FAST_MATH
1012   dst->f[0] = util_fast_pow( src0->f[0], src1->f[0] );
1013   dst->f[1] = util_fast_pow( src0->f[1], src1->f[1] );
1014   dst->f[2] = util_fast_pow( src0->f[2], src1->f[2] );
1015   dst->f[3] = util_fast_pow( src0->f[3], src1->f[3] );
1016#else
1017   dst->f[0] = powf( src0->f[0], src1->f[0] );
1018   dst->f[1] = powf( src0->f[1], src1->f[1] );
1019   dst->f[2] = powf( src0->f[2], src1->f[2] );
1020   dst->f[3] = powf( src0->f[3], src1->f[3] );
1021#endif
1022}
1023
1024static void
1025micro_sub(union tgsi_exec_channel *dst,
1026          const union tgsi_exec_channel *src0,
1027          const union tgsi_exec_channel *src1)
1028{
1029   dst->f[0] = src0->f[0] - src1->f[0];
1030   dst->f[1] = src0->f[1] - src1->f[1];
1031   dst->f[2] = src0->f[2] - src1->f[2];
1032   dst->f[3] = src0->f[3] - src1->f[3];
1033}
1034
1035static void
1036fetch_src_file_channel(const struct tgsi_exec_machine *mach,
1037                       const uint file,
1038                       const uint swizzle,
1039                       const union tgsi_exec_channel *index,
1040                       const union tgsi_exec_channel *index2D,
1041                       union tgsi_exec_channel *chan)
1042{
1043   uint i;
1044
1045   assert(swizzle < 4);
1046
1047   switch (file) {
1048   case TGSI_FILE_CONSTANT:
1049      for (i = 0; i < QUAD_SIZE; i++) {
1050         assert(index2D->i[i] >= 0 && index2D->i[i] < PIPE_MAX_CONSTANT_BUFFERS);
1051         assert(mach->Consts[index2D->i[i]]);
1052
1053         if (index->i[i] < 0) {
1054            chan->u[i] = 0;
1055         } else {
1056            /* NOTE: copying the const value as a uint instead of float */
1057            const uint constbuf = index2D->i[i];
1058            const uint *buf = (const uint *)mach->Consts[constbuf];
1059            const int pos = index->i[i] * 4 + swizzle;
1060            /* const buffer bounds check */
1061            if (pos < 0 || pos >= mach->ConstsSize[constbuf]) {
1062               if (0) {
1063                  /* Debug: print warning */
1064                  static int count = 0;
1065                  if (count++ < 100)
1066                     debug_printf("TGSI Exec: const buffer index %d"
1067                                  " out of bounds\n", pos);
1068               }
1069               chan->u[i] = 0;
1070            }
1071            else
1072               chan->u[i] = buf[pos];
1073         }
1074      }
1075      break;
1076
1077   case TGSI_FILE_INPUT:
1078      for (i = 0; i < QUAD_SIZE; i++) {
1079         /*
1080         if (TGSI_PROCESSOR_GEOMETRY == mach->Processor) {
1081            debug_printf("Fetching Input[%d] (2d=%d, 1d=%d)\n",
1082                         index2D->i[i] * TGSI_EXEC_MAX_INPUT_ATTRIBS + index->i[i],
1083                         index2D->i[i], index->i[i]);
1084                         }*/
1085         int pos = index2D->i[i] * TGSI_EXEC_MAX_INPUT_ATTRIBS + index->i[i];
1086         assert(pos >= 0);
1087         assert(pos < TGSI_MAX_PRIM_VERTICES * PIPE_MAX_ATTRIBS);
1088         chan->u[i] = mach->Inputs[pos].xyzw[swizzle].u[i];
1089      }
1090      break;
1091
1092   case TGSI_FILE_SYSTEM_VALUE:
1093      /* XXX no swizzling at this point.  Will be needed if we put
1094       * gl_FragCoord, for example, in a sys value register.
1095       */
1096      for (i = 0; i < QUAD_SIZE; i++) {
1097         chan->f[i] = mach->SystemValue[index->i[i]][0];
1098      }
1099      break;
1100
1101   case TGSI_FILE_TEMPORARY:
1102      for (i = 0; i < QUAD_SIZE; i++) {
1103         assert(index->i[i] < TGSI_EXEC_NUM_TEMPS);
1104         assert(index2D->i[i] == 0);
1105
1106         chan->u[i] = mach->Temps[index->i[i]].xyzw[swizzle].u[i];
1107      }
1108      break;
1109
1110   case TGSI_FILE_TEMPORARY_ARRAY:
1111      for (i = 0; i < QUAD_SIZE; i++) {
1112         assert(index->i[i] < TGSI_EXEC_NUM_TEMPS);
1113         assert(index2D->i[i] < TGSI_EXEC_NUM_TEMP_ARRAYS);
1114
1115         chan->u[i] =
1116            mach->TempArray[index2D->i[i]][index->i[i]].xyzw[swizzle].u[i];
1117      }
1118      break;
1119
1120   case TGSI_FILE_IMMEDIATE:
1121      for (i = 0; i < QUAD_SIZE; i++) {
1122         assert(index->i[i] >= 0 && index->i[i] < (int)mach->ImmLimit);
1123         assert(index2D->i[i] == 0);
1124
1125         chan->f[i] = mach->Imms[index->i[i]][swizzle];
1126      }
1127      break;
1128
1129   case TGSI_FILE_IMMEDIATE_ARRAY:
1130      for (i = 0; i < QUAD_SIZE; i++) {
1131         assert(index2D->i[i] == 0);
1132
1133         chan->f[i] = mach->ImmArray[index->i[i]][swizzle];
1134      }
1135      break;
1136
1137   case TGSI_FILE_ADDRESS:
1138      for (i = 0; i < QUAD_SIZE; i++) {
1139         assert(index->i[i] >= 0);
1140         assert(index2D->i[i] == 0);
1141
1142         chan->u[i] = mach->Addrs[index->i[i]].xyzw[swizzle].u[i];
1143      }
1144      break;
1145
1146   case TGSI_FILE_PREDICATE:
1147      for (i = 0; i < QUAD_SIZE; i++) {
1148         assert(index->i[i] >= 0 && index->i[i] < TGSI_EXEC_NUM_PREDS);
1149         assert(index2D->i[i] == 0);
1150
1151         chan->u[i] = mach->Predicates[0].xyzw[swizzle].u[i];
1152      }
1153      break;
1154
1155   case TGSI_FILE_OUTPUT:
1156      /* vertex/fragment output vars can be read too */
1157      for (i = 0; i < QUAD_SIZE; i++) {
1158         assert(index->i[i] >= 0);
1159         assert(index2D->i[i] == 0);
1160
1161         chan->u[i] = mach->Outputs[index->i[i]].xyzw[swizzle].u[i];
1162      }
1163      break;
1164
1165   default:
1166      assert(0);
1167      for (i = 0; i < QUAD_SIZE; i++) {
1168         chan->u[i] = 0;
1169      }
1170   }
1171}
1172
1173static void
1174fetch_source(const struct tgsi_exec_machine *mach,
1175             union tgsi_exec_channel *chan,
1176             const struct tgsi_full_src_register *reg,
1177             const uint chan_index,
1178             enum tgsi_exec_datatype src_datatype)
1179{
1180   union tgsi_exec_channel index;
1181   union tgsi_exec_channel index2D;
1182   uint swizzle;
1183
1184   /* We start with a direct index into a register file.
1185    *
1186    *    file[1],
1187    *    where:
1188    *       file = Register.File
1189    *       [1] = Register.Index
1190    */
1191   index.i[0] =
1192   index.i[1] =
1193   index.i[2] =
1194   index.i[3] = reg->Register.Index;
1195
1196   /* There is an extra source register that indirectly subscripts
1197    * a register file. The direct index now becomes an offset
1198    * that is being added to the indirect register.
1199    *
1200    *    file[ind[2].x+1],
1201    *    where:
1202    *       ind = Indirect.File
1203    *       [2] = Indirect.Index
1204    *       .x = Indirect.SwizzleX
1205    */
1206   if (reg->Register.Indirect) {
1207      union tgsi_exec_channel index2;
1208      union tgsi_exec_channel indir_index;
1209      const uint execmask = mach->ExecMask;
1210      uint i;
1211
1212      /* which address register (always zero now) */
1213      index2.i[0] =
1214      index2.i[1] =
1215      index2.i[2] =
1216      index2.i[3] = reg->Indirect.Index;
1217      assert(reg->Indirect.File == TGSI_FILE_ADDRESS);
1218      /* get current value of address register[swizzle] */
1219      swizzle = tgsi_util_get_src_register_swizzle( &reg->Indirect, CHAN_X );
1220      fetch_src_file_channel(mach,
1221                             reg->Indirect.File,
1222                             swizzle,
1223                             &index2,
1224                             &ZeroVec,
1225                             &indir_index);
1226
1227      /* add value of address register to the offset */
1228      index.i[0] += indir_index.i[0];
1229      index.i[1] += indir_index.i[1];
1230      index.i[2] += indir_index.i[2];
1231      index.i[3] += indir_index.i[3];
1232
1233      /* for disabled execution channels, zero-out the index to
1234       * avoid using a potential garbage value.
1235       */
1236      for (i = 0; i < QUAD_SIZE; i++) {
1237         if ((execmask & (1 << i)) == 0)
1238            index.i[i] = 0;
1239      }
1240   }
1241
1242   /* There is an extra source register that is a second
1243    * subscript to a register file. Effectively it means that
1244    * the register file is actually a 2D array of registers.
1245    *
1246    *    file[3][1],
1247    *    where:
1248    *       [3] = Dimension.Index
1249    */
1250   if (reg->Register.Dimension) {
1251      index2D.i[0] =
1252      index2D.i[1] =
1253      index2D.i[2] =
1254      index2D.i[3] = reg->Dimension.Index;
1255
1256      /* Again, the second subscript index can be addressed indirectly
1257       * identically to the first one.
1258       * Nothing stops us from indirectly addressing the indirect register,
1259       * but there is no need for that, so we won't exercise it.
1260       *
1261       *    file[ind[4].y+3][1],
1262       *    where:
1263       *       ind = DimIndirect.File
1264       *       [4] = DimIndirect.Index
1265       *       .y = DimIndirect.SwizzleX
1266       */
1267      if (reg->Dimension.Indirect) {
1268         union tgsi_exec_channel index2;
1269         union tgsi_exec_channel indir_index;
1270         const uint execmask = mach->ExecMask;
1271         uint i;
1272
1273         index2.i[0] =
1274         index2.i[1] =
1275         index2.i[2] =
1276         index2.i[3] = reg->DimIndirect.Index;
1277
1278         swizzle = tgsi_util_get_src_register_swizzle( &reg->DimIndirect, CHAN_X );
1279         fetch_src_file_channel(mach,
1280                                reg->DimIndirect.File,
1281                                swizzle,
1282                                &index2,
1283                                &ZeroVec,
1284                                &indir_index);
1285
1286         index2D.i[0] += indir_index.i[0];
1287         index2D.i[1] += indir_index.i[1];
1288         index2D.i[2] += indir_index.i[2];
1289         index2D.i[3] += indir_index.i[3];
1290
1291         /* for disabled execution channels, zero-out the index to
1292          * avoid using a potential garbage value.
1293          */
1294         for (i = 0; i < QUAD_SIZE; i++) {
1295            if ((execmask & (1 << i)) == 0) {
1296               index2D.i[i] = 0;
1297            }
1298         }
1299      }
1300
1301      /* If by any chance there was a need for a 3D array of register
1302       * files, we would have to check whether Dimension is followed
1303       * by a dimension register and continue the saga.
1304       */
1305   } else {
1306      index2D.i[0] =
1307      index2D.i[1] =
1308      index2D.i[2] =
1309      index2D.i[3] = 0;
1310   }
1311
1312   swizzle = tgsi_util_get_full_src_register_swizzle( reg, chan_index );
1313   fetch_src_file_channel(mach,
1314                          reg->Register.File,
1315                          swizzle,
1316                          &index,
1317                          &index2D,
1318                          chan);
1319
1320   if (reg->Register.Absolute) {
1321      if (src_datatype == TGSI_EXEC_DATA_FLOAT) {
1322         micro_abs(chan, chan);
1323      } else {
1324         micro_iabs(chan, chan);
1325      }
1326   }
1327
1328   if (reg->Register.Negate) {
1329      if (src_datatype == TGSI_EXEC_DATA_FLOAT) {
1330         micro_neg(chan, chan);
1331      } else {
1332         micro_ineg(chan, chan);
1333      }
1334   }
1335}
1336
1337static void
1338store_dest(struct tgsi_exec_machine *mach,
1339           const union tgsi_exec_channel *chan,
1340           const struct tgsi_full_dst_register *reg,
1341           const struct tgsi_full_instruction *inst,
1342           uint chan_index,
1343           enum tgsi_exec_datatype dst_datatype)
1344{
1345   uint i;
1346   union tgsi_exec_channel null;
1347   union tgsi_exec_channel *dst;
1348   union tgsi_exec_channel index2D;
1349   uint execmask = mach->ExecMask;
1350   int offset = 0;  /* indirection offset */
1351   int index;
1352
1353   /* for debugging */
1354   if (0 && dst_datatype == TGSI_EXEC_DATA_FLOAT) {
1355      check_inf_or_nan(chan);
1356   }
1357
1358   /* There is an extra source register that indirectly subscripts
1359    * a register file. The direct index now becomes an offset
1360    * that is being added to the indirect register.
1361    *
1362    *    file[ind[2].x+1],
1363    *    where:
1364    *       ind = Indirect.File
1365    *       [2] = Indirect.Index
1366    *       .x = Indirect.SwizzleX
1367    */
1368   if (reg->Register.Indirect) {
1369      union tgsi_exec_channel index;
1370      union tgsi_exec_channel indir_index;
1371      uint swizzle;
1372
1373      /* which address register (always zero for now) */
1374      index.i[0] =
1375      index.i[1] =
1376      index.i[2] =
1377      index.i[3] = reg->Indirect.Index;
1378
1379      /* get current value of address register[swizzle] */
1380      swizzle = tgsi_util_get_src_register_swizzle( &reg->Indirect, CHAN_X );
1381
1382      /* fetch values from the address/indirection register */
1383      fetch_src_file_channel(mach,
1384                             reg->Indirect.File,
1385                             swizzle,
1386                             &index,
1387                             &ZeroVec,
1388                             &indir_index);
1389
1390      /* save indirection offset */
1391      offset = indir_index.i[0];
1392   }
1393
1394   /* There is an extra source register that is a second
1395    * subscript to a register file. Effectively it means that
1396    * the register file is actually a 2D array of registers.
1397    *
1398    *    file[3][1],
1399    *    where:
1400    *       [3] = Dimension.Index
1401    */
1402   if (reg->Register.Dimension) {
1403      index2D.i[0] =
1404      index2D.i[1] =
1405      index2D.i[2] =
1406      index2D.i[3] = reg->Dimension.Index;
1407
1408      /* Again, the second subscript index can be addressed indirectly
1409       * identically to the first one.
1410       * Nothing stops us from indirectly addressing the indirect register,
1411       * but there is no need for that, so we won't exercise it.
1412       *
1413       *    file[ind[4].y+3][1],
1414       *    where:
1415       *       ind = DimIndirect.File
1416       *       [4] = DimIndirect.Index
1417       *       .y = DimIndirect.SwizzleX
1418       */
1419      if (reg->Dimension.Indirect) {
1420         union tgsi_exec_channel index2;
1421         union tgsi_exec_channel indir_index;
1422         const uint execmask = mach->ExecMask;
1423         unsigned swizzle;
1424         uint i;
1425
1426         index2.i[0] =
1427         index2.i[1] =
1428         index2.i[2] =
1429         index2.i[3] = reg->DimIndirect.Index;
1430
1431         swizzle = tgsi_util_get_src_register_swizzle( &reg->DimIndirect, CHAN_X );
1432         fetch_src_file_channel(mach,
1433                                reg->DimIndirect.File,
1434                                swizzle,
1435                                &index2,
1436                                &ZeroVec,
1437                                &indir_index);
1438
1439         index2D.i[0] += indir_index.i[0];
1440         index2D.i[1] += indir_index.i[1];
1441         index2D.i[2] += indir_index.i[2];
1442         index2D.i[3] += indir_index.i[3];
1443
1444         /* for disabled execution channels, zero-out the index to
1445          * avoid using a potential garbage value.
1446          */
1447         for (i = 0; i < QUAD_SIZE; i++) {
1448            if ((execmask & (1 << i)) == 0) {
1449               index2D.i[i] = 0;
1450            }
1451         }
1452      }
1453
1454      /* If by any chance there was a need for a 3D array of register
1455       * files, we would have to check whether Dimension is followed
1456       * by a dimension register and continue the saga.
1457       */
1458   } else {
1459      index2D.i[0] =
1460      index2D.i[1] =
1461      index2D.i[2] =
1462      index2D.i[3] = 0;
1463   }
1464
1465   switch (reg->Register.File) {
1466   case TGSI_FILE_NULL:
1467      dst = &null;
1468      break;
1469
1470   case TGSI_FILE_OUTPUT:
1471      index = mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0]
1472         + reg->Register.Index;
1473      dst = &mach->Outputs[offset + index].xyzw[chan_index];
1474#if 0
1475      if (TGSI_PROCESSOR_GEOMETRY == mach->Processor) {
1476         fprintf(stderr, "STORING OUT[%d] mask(%d), = (", offset + index, execmask);
1477         for (i = 0; i < QUAD_SIZE; i++)
1478            if (execmask & (1 << i))
1479               fprintf(stderr, "%f, ", chan->f[i]);
1480         fprintf(stderr, ")\n");
1481      }
1482#endif
1483      break;
1484
1485   case TGSI_FILE_TEMPORARY:
1486      index = reg->Register.Index;
1487      assert( index < TGSI_EXEC_NUM_TEMPS );
1488      dst = &mach->Temps[offset + index].xyzw[chan_index];
1489      break;
1490
1491   case TGSI_FILE_TEMPORARY_ARRAY:
1492      index = reg->Register.Index;
1493      assert( index < TGSI_EXEC_NUM_TEMPS );
1494      assert( index2D.i[0] < TGSI_EXEC_NUM_TEMP_ARRAYS );
1495      /* XXX we use index2D.i[0] here but somehow we might
1496       * end up with someone trying to store indirectly in
1497       * different buffers */
1498      dst = &mach->TempArray[index2D.i[0]][offset + index].xyzw[chan_index];
1499      break;
1500
1501   case TGSI_FILE_ADDRESS:
1502      index = reg->Register.Index;
1503      dst = &mach->Addrs[index].xyzw[chan_index];
1504      break;
1505
1506   case TGSI_FILE_PREDICATE:
1507      index = reg->Register.Index;
1508      assert(index < TGSI_EXEC_NUM_PREDS);
1509      dst = &mach->Predicates[index].xyzw[chan_index];
1510      break;
1511
1512   default:
1513      assert( 0 );
1514      return;
1515   }
1516
1517   if (inst->Instruction.Predicate) {
1518      uint swizzle;
1519      union tgsi_exec_channel *pred;
1520
1521      switch (chan_index) {
1522      case CHAN_X:
1523         swizzle = inst->Predicate.SwizzleX;
1524         break;
1525      case CHAN_Y:
1526         swizzle = inst->Predicate.SwizzleY;
1527         break;
1528      case CHAN_Z:
1529         swizzle = inst->Predicate.SwizzleZ;
1530         break;
1531      case CHAN_W:
1532         swizzle = inst->Predicate.SwizzleW;
1533         break;
1534      default:
1535         assert(0);
1536         return;
1537      }
1538
1539      assert(inst->Predicate.Index == 0);
1540
1541      pred = &mach->Predicates[inst->Predicate.Index].xyzw[swizzle];
1542
1543      if (inst->Predicate.Negate) {
1544         for (i = 0; i < QUAD_SIZE; i++) {
1545            if (pred->u[i]) {
1546               execmask &= ~(1 << i);
1547            }
1548         }
1549      } else {
1550         for (i = 0; i < QUAD_SIZE; i++) {
1551            if (!pred->u[i]) {
1552               execmask &= ~(1 << i);
1553            }
1554         }
1555      }
1556   }
1557
1558   switch (inst->Instruction.Saturate) {
1559   case TGSI_SAT_NONE:
1560      for (i = 0; i < QUAD_SIZE; i++)
1561         if (execmask & (1 << i))
1562            dst->i[i] = chan->i[i];
1563      break;
1564
1565   case TGSI_SAT_ZERO_ONE:
1566      for (i = 0; i < QUAD_SIZE; i++)
1567         if (execmask & (1 << i)) {
1568            if (chan->f[i] < 0.0f)
1569               dst->f[i] = 0.0f;
1570            else if (chan->f[i] > 1.0f)
1571               dst->f[i] = 1.0f;
1572            else
1573               dst->i[i] = chan->i[i];
1574         }
1575      break;
1576
1577   case TGSI_SAT_MINUS_PLUS_ONE:
1578      for (i = 0; i < QUAD_SIZE; i++)
1579         if (execmask & (1 << i)) {
1580            if (chan->f[i] < -1.0f)
1581               dst->f[i] = -1.0f;
1582            else if (chan->f[i] > 1.0f)
1583               dst->f[i] = 1.0f;
1584            else
1585               dst->i[i] = chan->i[i];
1586         }
1587      break;
1588
1589   default:
1590      assert( 0 );
1591   }
1592}
1593
/**
 * Fetch channel CHAN of source operand INDEX of the current instruction
 * into VAL, treating the data as float.
 * Expands in place: `mach` and `inst` must be in scope where it is used.
 */
#define FETCH(VAL, INDEX, CHAN) \
   fetch_source(mach, (VAL), &inst->Src[INDEX], (CHAN), TGSI_EXEC_DATA_FLOAT)
1596
1597
1598/**
1599 * Execute ARB-style KIL which is predicated by a src register.
1600 * Kill fragment if any of the four values is less than zero.
1601 */
1602static void
1603exec_kil(struct tgsi_exec_machine *mach,
1604         const struct tgsi_full_instruction *inst)
1605{
1606   uint uniquemask;
1607   uint chan_index;
1608   uint kilmask = 0; /* bit 0 = pixel 0, bit 1 = pixel 1, etc */
1609   union tgsi_exec_channel r[1];
1610
1611   /* This mask stores component bits that were already tested. */
1612   uniquemask = 0;
1613
1614   for (chan_index = 0; chan_index < 4; chan_index++)
1615   {
1616      uint swizzle;
1617      uint i;
1618
1619      /* unswizzle channel */
1620      swizzle = tgsi_util_get_full_src_register_swizzle (
1621                        &inst->Src[0],
1622                        chan_index);
1623
1624      /* check if the component has not been already tested */
1625      if (uniquemask & (1 << swizzle))
1626         continue;
1627      uniquemask |= 1 << swizzle;
1628
1629      FETCH(&r[0], 0, chan_index);
1630      for (i = 0; i < 4; i++)
1631         if (r[0].f[i] < 0.0f)
1632            kilmask |= 1 << i;
1633   }
1634
1635   mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] |= kilmask;
1636}
1637
1638/**
1639 * Execute NVIDIA-style KIL which is predicated by a condition code.
1640 * Kill fragment if the condition code is TRUE.
1641 */
1642static void
1643exec_kilp(struct tgsi_exec_machine *mach,
1644          const struct tgsi_full_instruction *inst)
1645{
1646   uint kilmask; /* bit 0 = pixel 0, bit 1 = pixel 1, etc */
1647
1648   /* "unconditional" kil */
1649   kilmask = mach->ExecMask;
1650   mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] |= kilmask;
1651}
1652
1653static void
1654emit_vertex(struct tgsi_exec_machine *mach)
1655{
1656   /* FIXME: check for exec mask correctly
1657   unsigned i;
1658   for (i = 0; i < QUAD_SIZE; ++i) {
1659         if ((mach->ExecMask & (1 << i)))
1660   */
1661   if (mach->ExecMask) {
1662      mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] += mach->NumOutputs;
1663      mach->Primitives[mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]]++;
1664   }
1665}
1666
1667static void
1668emit_primitive(struct tgsi_exec_machine *mach)
1669{
1670   unsigned *prim_count = &mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0];
1671   /* FIXME: check for exec mask correctly
1672   unsigned i;
1673   for (i = 0; i < QUAD_SIZE; ++i) {
1674         if ((mach->ExecMask & (1 << i)))
1675   */
1676   if (mach->ExecMask) {
1677      ++(*prim_count);
1678      debug_assert((*prim_count * mach->NumOutputs) < mach->MaxGeometryShaderOutputs);
1679      mach->Primitives[*prim_count] = 0;
1680   }
1681}
1682
1683static void
1684conditional_emit_primitive(struct tgsi_exec_machine *mach)
1685{
1686   if (TGSI_PROCESSOR_GEOMETRY == mach->Processor) {
1687      int emitted_verts =
1688         mach->Primitives[mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]];
1689      if (emitted_verts) {
1690         emit_primitive(mach);
1691      }
1692   }
1693}
1694
1695
1696/*
1697 * Fetch four texture samples using STR texture coordinates.
1698 */
1699static void
1700fetch_texel( struct tgsi_sampler *sampler,
1701             const union tgsi_exec_channel *s,
1702             const union tgsi_exec_channel *t,
1703             const union tgsi_exec_channel *p,
1704             const union tgsi_exec_channel *c0,
1705             enum tgsi_sampler_control control,
1706             union tgsi_exec_channel *r,
1707             union tgsi_exec_channel *g,
1708             union tgsi_exec_channel *b,
1709             union tgsi_exec_channel *a )
1710{
1711   uint j;
1712   float rgba[NUM_CHANNELS][QUAD_SIZE];
1713
1714   sampler->get_samples(sampler, s->f, t->f, p->f, c0->f, control, rgba);
1715
1716   for (j = 0; j < 4; j++) {
1717      r->f[j] = rgba[0][j];
1718      g->f[j] = rgba[1][j];
1719      b->f[j] = rgba[2][j];
1720      a->f[j] = rgba[3][j];
1721   }
1722}
1723
1724
1725#define TEX_MODIFIER_NONE           0
1726#define TEX_MODIFIER_PROJECTED      1
1727#define TEX_MODIFIER_LOD_BIAS       2
1728#define TEX_MODIFIER_EXPLICIT_LOD   3
1729
1730
1731static void
1732exec_tex(struct tgsi_exec_machine *mach,
1733         const struct tgsi_full_instruction *inst,
1734         uint modifier)
1735{
1736   const uint unit = inst->Src[1].Register.Index;
1737   union tgsi_exec_channel r[4];
1738   const union tgsi_exec_channel *lod = &ZeroVec;
1739   enum tgsi_sampler_control control;
1740   uint chan;
1741
1742   if (modifier != TEX_MODIFIER_NONE) {
1743      FETCH(&r[3], 0, CHAN_W);
1744      if (modifier != TEX_MODIFIER_PROJECTED) {
1745         lod = &r[3];
1746      }
1747   }
1748
1749   if (modifier == TEX_MODIFIER_EXPLICIT_LOD) {
1750      control = tgsi_sampler_lod_explicit;
1751   } else {
1752      control = tgsi_sampler_lod_bias;
1753   }
1754
1755   switch (inst->Texture.Texture) {
1756   case TGSI_TEXTURE_1D:
1757   case TGSI_TEXTURE_SHADOW1D:
1758      FETCH(&r[0], 0, CHAN_X);
1759
1760      if (modifier == TEX_MODIFIER_PROJECTED) {
1761         micro_div(&r[0], &r[0], &r[3]);
1762      }
1763
1764      fetch_texel(mach->Samplers[unit],
1765                  &r[0], &ZeroVec, &ZeroVec, lod,  /* S, T, P, LOD */
1766                  control,
1767                  &r[0], &r[1], &r[2], &r[3]);     /* R, G, B, A */
1768      break;
1769
1770   case TGSI_TEXTURE_2D:
1771   case TGSI_TEXTURE_RECT:
1772   case TGSI_TEXTURE_SHADOW2D:
1773   case TGSI_TEXTURE_SHADOWRECT:
1774      FETCH(&r[0], 0, CHAN_X);
1775      FETCH(&r[1], 0, CHAN_Y);
1776      FETCH(&r[2], 0, CHAN_Z);
1777
1778      if (modifier == TEX_MODIFIER_PROJECTED) {
1779         micro_div(&r[0], &r[0], &r[3]);
1780         micro_div(&r[1], &r[1], &r[3]);
1781         micro_div(&r[2], &r[2], &r[3]);
1782      }
1783
1784      fetch_texel(mach->Samplers[unit],
1785                  &r[0], &r[1], &r[2], lod,     /* S, T, P, LOD */
1786                  control,
1787                  &r[0], &r[1], &r[2], &r[3]);  /* outputs */
1788      break;
1789
1790   case TGSI_TEXTURE_1D_ARRAY:
1791      FETCH(&r[0], 0, CHAN_X);
1792      FETCH(&r[1], 0, CHAN_Y);
1793
1794      if (modifier == TEX_MODIFIER_PROJECTED) {
1795         micro_div(&r[0], &r[0], &r[3]);
1796      }
1797
1798      fetch_texel(mach->Samplers[unit],
1799                  &r[0], &r[1], &r[2], lod,     /* S, T, P, LOD */
1800                  control,
1801                  &r[0], &r[1], &r[2], &r[3]);  /* outputs */
1802      break;
1803
1804   case TGSI_TEXTURE_2D_ARRAY:
1805      FETCH(&r[0], 0, CHAN_X);
1806      FETCH(&r[1], 0, CHAN_Y);
1807      FETCH(&r[2], 0, CHAN_Z);
1808
1809      if (modifier == TEX_MODIFIER_PROJECTED) {
1810         micro_div(&r[0], &r[0], &r[3]);
1811         micro_div(&r[1], &r[1], &r[3]);
1812      }
1813
1814      fetch_texel(mach->Samplers[unit],
1815                  &r[0], &r[1], &r[2], lod,     /* S, T, P, LOD */
1816                  control,
1817                  &r[0], &r[1], &r[2], &r[3]);  /* outputs */
1818      break;
1819
1820   case TGSI_TEXTURE_3D:
1821   case TGSI_TEXTURE_CUBE:
1822      FETCH(&r[0], 0, CHAN_X);
1823      FETCH(&r[1], 0, CHAN_Y);
1824      FETCH(&r[2], 0, CHAN_Z);
1825
1826      if (modifier == TEX_MODIFIER_PROJECTED) {
1827         micro_div(&r[0], &r[0], &r[3]);
1828         micro_div(&r[1], &r[1], &r[3]);
1829         micro_div(&r[2], &r[2], &r[3]);
1830      }
1831
1832      fetch_texel(mach->Samplers[unit],
1833                  &r[0], &r[1], &r[2], lod,
1834                  control,
1835                  &r[0], &r[1], &r[2], &r[3]);
1836      break;
1837
1838   default:
1839      assert(0);
1840   }
1841
1842#if 0
1843   debug_printf("fetch r: %g %g %g %g\n",
1844         r[0].f[0], r[0].f[1], r[0].f[2], r[0].f[3]);
1845   debug_printf("fetch g: %g %g %g %g\n",
1846         r[1].f[0], r[1].f[1], r[1].f[2], r[1].f[3]);
1847   debug_printf("fetch b: %g %g %g %g\n",
1848         r[2].f[0], r[2].f[1], r[2].f[2], r[2].f[3]);
1849   debug_printf("fetch a: %g %g %g %g\n",
1850         r[3].f[0], r[3].f[1], r[3].f[2], r[3].f[3]);
1851#endif
1852
1853   for (chan = 0; chan < NUM_CHANNELS; chan++) {
1854      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
1855         store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
1856      }
1857   }
1858}
1859
1860static void
1861exec_txd(struct tgsi_exec_machine *mach,
1862         const struct tgsi_full_instruction *inst)
1863{
1864   const uint unit = inst->Src[3].Register.Index;
1865   union tgsi_exec_channel r[4];
1866   uint chan;
1867
1868   /*
1869    * XXX: This is fake TXD -- the derivatives are not taken into account, yet.
1870    */
1871
1872   switch (inst->Texture.Texture) {
1873   case TGSI_TEXTURE_1D:
1874   case TGSI_TEXTURE_SHADOW1D:
1875
1876      FETCH(&r[0], 0, CHAN_X);
1877
1878      fetch_texel(mach->Samplers[unit],
1879                  &r[0], &ZeroVec, &ZeroVec, &ZeroVec,   /* S, T, P, BIAS */
1880                  tgsi_sampler_lod_bias,
1881                  &r[0], &r[1], &r[2], &r[3]);           /* R, G, B, A */
1882      break;
1883
1884   case TGSI_TEXTURE_2D:
1885   case TGSI_TEXTURE_RECT:
1886   case TGSI_TEXTURE_SHADOW2D:
1887   case TGSI_TEXTURE_SHADOWRECT:
1888
1889      FETCH(&r[0], 0, CHAN_X);
1890      FETCH(&r[1], 0, CHAN_Y);
1891      FETCH(&r[2], 0, CHAN_Z);
1892
1893      fetch_texel(mach->Samplers[unit],
1894                  &r[0], &r[1], &r[2], &ZeroVec,   /* inputs */
1895                  tgsi_sampler_lod_bias,
1896                  &r[0], &r[1], &r[2], &r[3]);     /* outputs */
1897      break;
1898
1899   case TGSI_TEXTURE_3D:
1900   case TGSI_TEXTURE_CUBE:
1901
1902      FETCH(&r[0], 0, CHAN_X);
1903      FETCH(&r[1], 0, CHAN_Y);
1904      FETCH(&r[2], 0, CHAN_Z);
1905
1906      fetch_texel(mach->Samplers[unit],
1907                  &r[0], &r[1], &r[2], &ZeroVec,
1908                  tgsi_sampler_lod_bias,
1909                  &r[0], &r[1], &r[2], &r[3]);
1910      break;
1911
1912   default:
1913      assert(0);
1914   }
1915
1916   for (chan = 0; chan < NUM_CHANNELS; chan++) {
1917      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
1918         store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
1919      }
1920   }
1921}
1922
1923static void
1924exec_txq(struct tgsi_exec_machine *mach,
1925         const struct tgsi_full_instruction *inst)
1926{
1927   struct tgsi_sampler *sampler;
1928   const uint unit = inst->Src[1].Register.Index;
1929   int result[4];
1930   union tgsi_exec_channel r[4], src;
1931   uint chan;
1932   int i,j;
1933
1934   fetch_source(mach, &src, &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_INT);
1935   sampler = mach->Samplers[unit];
1936
1937   sampler->get_dims(sampler, src.i[0], result);
1938
1939   for (i = 0; i < QUAD_SIZE; i++) {
1940      for (j = 0; j < 4; j++) {
1941	 r[j].i[i] = result[j];
1942      }
1943   }
1944
1945   for (chan = 0; chan < NUM_CHANNELS; chan++) {
1946      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
1947	 store_dest(mach, &r[chan], &inst->Dst[0], inst, chan,
1948		    TGSI_EXEC_DATA_INT);
1949      }
1950   }
1951}
1952
1953static void
1954exec_sample(struct tgsi_exec_machine *mach,
1955            const struct tgsi_full_instruction *inst,
1956            uint modifier)
1957{
1958   const uint resource_unit = inst->Src[1].Register.Index;
1959   const uint sampler_unit = inst->Src[2].Register.Index;
1960   union tgsi_exec_channel r[4];
1961   const union tgsi_exec_channel *lod = &ZeroVec;
1962   enum tgsi_sampler_control control;
1963   uint chan;
1964
1965   if (modifier != TEX_MODIFIER_NONE) {
1966      if (modifier == TEX_MODIFIER_LOD_BIAS)
1967         FETCH(&r[3], 3, CHAN_X);
1968      else /*TEX_MODIFIER_LOD*/
1969         FETCH(&r[3], 0, CHAN_W);
1970
1971      if (modifier != TEX_MODIFIER_PROJECTED) {
1972         lod = &r[3];
1973      }
1974   }
1975
1976   if (modifier == TEX_MODIFIER_EXPLICIT_LOD) {
1977      control = tgsi_sampler_lod_explicit;
1978   } else {
1979      control = tgsi_sampler_lod_bias;
1980   }
1981
1982   switch (mach->Resources[resource_unit].Resource) {
1983   case TGSI_TEXTURE_1D:
1984   case TGSI_TEXTURE_SHADOW1D:
1985      FETCH(&r[0], 0, CHAN_X);
1986
1987      if (modifier == TEX_MODIFIER_PROJECTED) {
1988         micro_div(&r[0], &r[0], &r[3]);
1989      }
1990
1991      fetch_texel(mach->Samplers[sampler_unit],
1992                  &r[0], &ZeroVec, &ZeroVec, lod,  /* S, T, P, LOD */
1993                  control,
1994                  &r[0], &r[1], &r[2], &r[3]);     /* R, G, B, A */
1995      break;
1996
1997   case TGSI_TEXTURE_2D:
1998   case TGSI_TEXTURE_RECT:
1999   case TGSI_TEXTURE_SHADOW2D:
2000   case TGSI_TEXTURE_SHADOWRECT:
2001      FETCH(&r[0], 0, CHAN_X);
2002      FETCH(&r[1], 0, CHAN_Y);
2003      FETCH(&r[2], 0, CHAN_Z);
2004
2005      if (modifier == TEX_MODIFIER_PROJECTED) {
2006         micro_div(&r[0], &r[0], &r[3]);
2007         micro_div(&r[1], &r[1], &r[3]);
2008         micro_div(&r[2], &r[2], &r[3]);
2009      }
2010
2011      fetch_texel(mach->Samplers[sampler_unit],
2012                  &r[0], &r[1], &r[2], lod,     /* S, T, P, LOD */
2013                  control,
2014                  &r[0], &r[1], &r[2], &r[3]);  /* outputs */
2015      break;
2016
2017   case TGSI_TEXTURE_3D:
2018   case TGSI_TEXTURE_CUBE:
2019      FETCH(&r[0], 0, CHAN_X);
2020      FETCH(&r[1], 0, CHAN_Y);
2021      FETCH(&r[2], 0, CHAN_Z);
2022
2023      if (modifier == TEX_MODIFIER_PROJECTED) {
2024         micro_div(&r[0], &r[0], &r[3]);
2025         micro_div(&r[1], &r[1], &r[3]);
2026         micro_div(&r[2], &r[2], &r[3]);
2027      }
2028
2029      fetch_texel(mach->Samplers[sampler_unit],
2030                  &r[0], &r[1], &r[2], lod,
2031                  control,
2032                  &r[0], &r[1], &r[2], &r[3]);
2033      break;
2034
2035   default:
2036      assert(0);
2037   }
2038
2039   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2040      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2041         store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2042      }
2043   }
2044}
2045
2046static void
2047exec_sample_d(struct tgsi_exec_machine *mach,
2048              const struct tgsi_full_instruction *inst)
2049{
2050   const uint resource_unit = inst->Src[1].Register.Index;
2051   const uint sampler_unit = inst->Src[2].Register.Index;
2052   union tgsi_exec_channel r[4];
2053   uint chan;
2054   /*
2055    * XXX: This is fake SAMPLE_D -- the derivatives are not taken into account, yet.
2056    */
2057
2058   switch (mach->Resources[resource_unit].Resource) {
2059   case TGSI_TEXTURE_1D:
2060   case TGSI_TEXTURE_SHADOW1D:
2061
2062      FETCH(&r[0], 0, CHAN_X);
2063
2064      fetch_texel(mach->Samplers[sampler_unit],
2065                  &r[0], &ZeroVec, &ZeroVec, &ZeroVec,   /* S, T, P, BIAS */
2066                  tgsi_sampler_lod_bias,
2067                  &r[0], &r[1], &r[2], &r[3]);           /* R, G, B, A */
2068      break;
2069
2070   case TGSI_TEXTURE_2D:
2071   case TGSI_TEXTURE_RECT:
2072   case TGSI_TEXTURE_SHADOW2D:
2073   case TGSI_TEXTURE_SHADOWRECT:
2074
2075      FETCH(&r[0], 0, CHAN_X);
2076      FETCH(&r[1], 0, CHAN_Y);
2077      FETCH(&r[2], 0, CHAN_Z);
2078
2079      fetch_texel(mach->Samplers[sampler_unit],
2080                  &r[0], &r[1], &r[2], &ZeroVec,   /* inputs */
2081                  tgsi_sampler_lod_bias,
2082                  &r[0], &r[1], &r[2], &r[3]);     /* outputs */
2083      break;
2084
2085   case TGSI_TEXTURE_3D:
2086   case TGSI_TEXTURE_CUBE:
2087
2088      FETCH(&r[0], 0, CHAN_X);
2089      FETCH(&r[1], 0, CHAN_Y);
2090      FETCH(&r[2], 0, CHAN_Z);
2091
2092      fetch_texel(mach->Samplers[sampler_unit],
2093                  &r[0], &r[1], &r[2], &ZeroVec,
2094                  tgsi_sampler_lod_bias,
2095                  &r[0], &r[1], &r[2], &r[3]);
2096      break;
2097
2098   default:
2099      assert(0);
2100   }
2101
2102   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2103      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2104         store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2105      }
2106   }
2107}
2108
2109
2110/**
2111 * Evaluate a constant-valued coefficient at the position of the
2112 * current quad.
2113 */
2114static void
2115eval_constant_coef(
2116   struct tgsi_exec_machine *mach,
2117   unsigned attrib,
2118   unsigned chan )
2119{
2120   unsigned i;
2121
2122   for( i = 0; i < QUAD_SIZE; i++ ) {
2123      mach->Inputs[attrib].xyzw[chan].f[i] = mach->InterpCoefs[attrib].a0[chan];
2124   }
2125}
2126
2127/**
2128 * Evaluate a linear-valued coefficient at the position of the
2129 * current quad.
2130 */
2131static void
2132eval_linear_coef(
2133   struct tgsi_exec_machine *mach,
2134   unsigned attrib,
2135   unsigned chan )
2136{
2137   const float x = mach->QuadPos.xyzw[0].f[0];
2138   const float y = mach->QuadPos.xyzw[1].f[0];
2139   const float dadx = mach->InterpCoefs[attrib].dadx[chan];
2140   const float dady = mach->InterpCoefs[attrib].dady[chan];
2141   const float a0 = mach->InterpCoefs[attrib].a0[chan] + dadx * x + dady * y;
2142   mach->Inputs[attrib].xyzw[chan].f[0] = a0;
2143   mach->Inputs[attrib].xyzw[chan].f[1] = a0 + dadx;
2144   mach->Inputs[attrib].xyzw[chan].f[2] = a0 + dady;
2145   mach->Inputs[attrib].xyzw[chan].f[3] = a0 + dadx + dady;
2146}
2147
2148/**
2149 * Evaluate a perspective-valued coefficient at the position of the
2150 * current quad.
2151 */
2152static void
2153eval_perspective_coef(
2154   struct tgsi_exec_machine *mach,
2155   unsigned attrib,
2156   unsigned chan )
2157{
2158   const float x = mach->QuadPos.xyzw[0].f[0];
2159   const float y = mach->QuadPos.xyzw[1].f[0];
2160   const float dadx = mach->InterpCoefs[attrib].dadx[chan];
2161   const float dady = mach->InterpCoefs[attrib].dady[chan];
2162   const float a0 = mach->InterpCoefs[attrib].a0[chan] + dadx * x + dady * y;
2163   const float *w = mach->QuadPos.xyzw[3].f;
2164   /* divide by W here */
2165   mach->Inputs[attrib].xyzw[chan].f[0] = a0 / w[0];
2166   mach->Inputs[attrib].xyzw[chan].f[1] = (a0 + dadx) / w[1];
2167   mach->Inputs[attrib].xyzw[chan].f[2] = (a0 + dady) / w[2];
2168   mach->Inputs[attrib].xyzw[chan].f[3] = (a0 + dadx + dady) / w[3];
2169}
2170
2171
2172typedef void (* eval_coef_func)(
2173   struct tgsi_exec_machine *mach,
2174   unsigned attrib,
2175   unsigned chan );
2176
2177static void
2178exec_declaration(struct tgsi_exec_machine *mach,
2179                 const struct tgsi_full_declaration *decl)
2180{
2181   if (decl->Declaration.File == TGSI_FILE_RESOURCE) {
2182      mach->Resources[decl->Range.First] = decl->Resource;
2183      return;
2184   }
2185
2186   if (mach->Processor == TGSI_PROCESSOR_FRAGMENT) {
2187      if (decl->Declaration.File == TGSI_FILE_INPUT) {
2188         uint first, last, mask;
2189
2190         first = decl->Range.First;
2191         last = decl->Range.Last;
2192         mask = decl->Declaration.UsageMask;
2193
2194         /* XXX we could remove this special-case code since
2195          * mach->InterpCoefs[first].a0 should already have the
2196          * front/back-face value.  But we should first update the
2197          * ureg code to emit the right UsageMask value (WRITEMASK_X).
2198          * Then, we could remove the tgsi_exec_machine::Face field.
2199          */
2200         /* XXX make FACE a system value */
2201         if (decl->Semantic.Name == TGSI_SEMANTIC_FACE) {
2202            uint i;
2203
2204            assert(decl->Semantic.Index == 0);
2205            assert(first == last);
2206
2207            for (i = 0; i < QUAD_SIZE; i++) {
2208               mach->Inputs[first].xyzw[0].f[i] = mach->Face;
2209            }
2210         } else {
2211            eval_coef_func eval;
2212            uint i, j;
2213
2214            switch (decl->Declaration.Interpolate) {
2215            case TGSI_INTERPOLATE_CONSTANT:
2216               eval = eval_constant_coef;
2217               break;
2218
2219            case TGSI_INTERPOLATE_LINEAR:
2220               eval = eval_linear_coef;
2221               break;
2222
2223            case TGSI_INTERPOLATE_PERSPECTIVE:
2224               eval = eval_perspective_coef;
2225               break;
2226
2227            default:
2228               assert(0);
2229               return;
2230            }
2231
2232            for (j = 0; j < NUM_CHANNELS; j++) {
2233               if (mask & (1 << j)) {
2234                  for (i = first; i <= last; i++) {
2235                     eval(mach, i, j);
2236                  }
2237               }
2238            }
2239         }
2240      }
2241   }
2242
2243   if (decl->Declaration.File == TGSI_FILE_SYSTEM_VALUE) {
2244      mach->SysSemanticToIndex[decl->Declaration.Semantic] = decl->Range.First;
2245   }
2246}
2247
2248
2249typedef void (* micro_op)(union tgsi_exec_channel *dst);
2250
2251static void
2252exec_vector(struct tgsi_exec_machine *mach,
2253            const struct tgsi_full_instruction *inst,
2254            micro_op op,
2255            enum tgsi_exec_datatype dst_datatype)
2256{
2257   unsigned int chan;
2258
2259   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2260      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2261         union tgsi_exec_channel dst;
2262
2263         op(&dst);
2264         store_dest(mach, &dst, &inst->Dst[0], inst, chan, dst_datatype);
2265      }
2266   }
2267}
2268
2269typedef void (* micro_unary_op)(union tgsi_exec_channel *dst,
2270                                const union tgsi_exec_channel *src);
2271
2272static void
2273exec_scalar_unary(struct tgsi_exec_machine *mach,
2274                  const struct tgsi_full_instruction *inst,
2275                  micro_unary_op op,
2276                  enum tgsi_exec_datatype dst_datatype,
2277                  enum tgsi_exec_datatype src_datatype)
2278{
2279   unsigned int chan;
2280   union tgsi_exec_channel src;
2281   union tgsi_exec_channel dst;
2282
2283   fetch_source(mach, &src, &inst->Src[0], CHAN_X, src_datatype);
2284   op(&dst, &src);
2285   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2286      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2287         store_dest(mach, &dst, &inst->Dst[0], inst, chan, dst_datatype);
2288      }
2289   }
2290}
2291
2292static void
2293exec_vector_unary(struct tgsi_exec_machine *mach,
2294                  const struct tgsi_full_instruction *inst,
2295                  micro_unary_op op,
2296                  enum tgsi_exec_datatype dst_datatype,
2297                  enum tgsi_exec_datatype src_datatype)
2298{
2299   unsigned int chan;
2300   struct tgsi_exec_vector dst;
2301
2302   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2303      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2304         union tgsi_exec_channel src;
2305
2306         fetch_source(mach, &src, &inst->Src[0], chan, src_datatype);
2307         op(&dst.xyzw[chan], &src);
2308      }
2309   }
2310   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2311      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2312         store_dest(mach, &dst.xyzw[chan], &inst->Dst[0], inst, chan, dst_datatype);
2313      }
2314   }
2315}
2316
2317typedef void (* micro_binary_op)(union tgsi_exec_channel *dst,
2318                                 const union tgsi_exec_channel *src0,
2319                                 const union tgsi_exec_channel *src1);
2320
2321static void
2322exec_scalar_binary(struct tgsi_exec_machine *mach,
2323                   const struct tgsi_full_instruction *inst,
2324                   micro_binary_op op,
2325                   enum tgsi_exec_datatype dst_datatype,
2326                   enum tgsi_exec_datatype src_datatype)
2327{
2328   unsigned int chan;
2329   union tgsi_exec_channel src[2];
2330   union tgsi_exec_channel dst;
2331
2332   fetch_source(mach, &src[0], &inst->Src[0], CHAN_X, src_datatype);
2333   fetch_source(mach, &src[1], &inst->Src[1], CHAN_Y, src_datatype);
2334   op(&dst, &src[0], &src[1]);
2335   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2336      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2337         store_dest(mach, &dst, &inst->Dst[0], inst, chan, dst_datatype);
2338      }
2339   }
2340}
2341
2342static void
2343exec_vector_binary(struct tgsi_exec_machine *mach,
2344                   const struct tgsi_full_instruction *inst,
2345                   micro_binary_op op,
2346                   enum tgsi_exec_datatype dst_datatype,
2347                   enum tgsi_exec_datatype src_datatype)
2348{
2349   unsigned int chan;
2350   struct tgsi_exec_vector dst;
2351
2352   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2353      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2354         union tgsi_exec_channel src[2];
2355
2356         fetch_source(mach, &src[0], &inst->Src[0], chan, src_datatype);
2357         fetch_source(mach, &src[1], &inst->Src[1], chan, src_datatype);
2358         op(&dst.xyzw[chan], &src[0], &src[1]);
2359      }
2360   }
2361   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2362      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2363         store_dest(mach, &dst.xyzw[chan], &inst->Dst[0], inst, chan, dst_datatype);
2364      }
2365   }
2366}
2367
2368typedef void (* micro_trinary_op)(union tgsi_exec_channel *dst,
2369                                  const union tgsi_exec_channel *src0,
2370                                  const union tgsi_exec_channel *src1,
2371                                  const union tgsi_exec_channel *src2);
2372
2373static void
2374exec_vector_trinary(struct tgsi_exec_machine *mach,
2375                    const struct tgsi_full_instruction *inst,
2376                    micro_trinary_op op,
2377                    enum tgsi_exec_datatype dst_datatype,
2378                    enum tgsi_exec_datatype src_datatype)
2379{
2380   unsigned int chan;
2381   struct tgsi_exec_vector dst;
2382
2383   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2384      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2385         union tgsi_exec_channel src[3];
2386
2387         fetch_source(mach, &src[0], &inst->Src[0], chan, src_datatype);
2388         fetch_source(mach, &src[1], &inst->Src[1], chan, src_datatype);
2389         fetch_source(mach, &src[2], &inst->Src[2], chan, src_datatype);
2390         op(&dst.xyzw[chan], &src[0], &src[1], &src[2]);
2391      }
2392   }
2393   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2394      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2395         store_dest(mach, &dst.xyzw[chan], &inst->Dst[0], inst, chan, dst_datatype);
2396      }
2397   }
2398}
2399
2400static void
2401exec_dp3(struct tgsi_exec_machine *mach,
2402         const struct tgsi_full_instruction *inst)
2403{
2404   unsigned int chan;
2405   union tgsi_exec_channel arg[3];
2406
2407   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2408   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2409   micro_mul(&arg[2], &arg[0], &arg[1]);
2410
2411   for (chan = CHAN_Y; chan <= CHAN_Z; chan++) {
2412      fetch_source(mach, &arg[0], &inst->Src[0], chan, TGSI_EXEC_DATA_FLOAT);
2413      fetch_source(mach, &arg[1], &inst->Src[1], chan, TGSI_EXEC_DATA_FLOAT);
2414      micro_mad(&arg[2], &arg[0], &arg[1], &arg[2]);
2415   }
2416
2417   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2418      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2419         store_dest(mach, &arg[2], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2420      }
2421   }
2422}
2423
2424static void
2425exec_dp4(struct tgsi_exec_machine *mach,
2426         const struct tgsi_full_instruction *inst)
2427{
2428   unsigned int chan;
2429   union tgsi_exec_channel arg[3];
2430
2431   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2432   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2433   micro_mul(&arg[2], &arg[0], &arg[1]);
2434
2435   for (chan = CHAN_Y; chan <= CHAN_W; chan++) {
2436      fetch_source(mach, &arg[0], &inst->Src[0], chan, TGSI_EXEC_DATA_FLOAT);
2437      fetch_source(mach, &arg[1], &inst->Src[1], chan, TGSI_EXEC_DATA_FLOAT);
2438      micro_mad(&arg[2], &arg[0], &arg[1], &arg[2]);
2439   }
2440
2441   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2442      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2443         store_dest(mach, &arg[2], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2444      }
2445   }
2446}
2447
2448static void
2449exec_dp2a(struct tgsi_exec_machine *mach,
2450          const struct tgsi_full_instruction *inst)
2451{
2452   unsigned int chan;
2453   union tgsi_exec_channel arg[3];
2454
2455   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2456   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2457   micro_mul(&arg[2], &arg[0], &arg[1]);
2458
2459   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2460   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2461   micro_mad(&arg[0], &arg[0], &arg[1], &arg[2]);
2462
2463   fetch_source(mach, &arg[1], &inst->Src[2], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2464   micro_add(&arg[0], &arg[0], &arg[1]);
2465
2466   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2467      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2468         store_dest(mach, &arg[0], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2469      }
2470   }
2471}
2472
2473static void
2474exec_dph(struct tgsi_exec_machine *mach,
2475         const struct tgsi_full_instruction *inst)
2476{
2477   unsigned int chan;
2478   union tgsi_exec_channel arg[3];
2479
2480   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2481   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2482   micro_mul(&arg[2], &arg[0], &arg[1]);
2483
2484   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2485   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2486   micro_mad(&arg[2], &arg[0], &arg[1], &arg[2]);
2487
2488   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2489   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2490   micro_mad(&arg[0], &arg[0], &arg[1], &arg[2]);
2491
2492   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_W, TGSI_EXEC_DATA_FLOAT);
2493   micro_add(&arg[0], &arg[0], &arg[1]);
2494
2495   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2496      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2497         store_dest(mach, &arg[0], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2498      }
2499   }
2500}
2501
2502static void
2503exec_dp2(struct tgsi_exec_machine *mach,
2504         const struct tgsi_full_instruction *inst)
2505{
2506   unsigned int chan;
2507   union tgsi_exec_channel arg[3];
2508
2509   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2510   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2511   micro_mul(&arg[2], &arg[0], &arg[1]);
2512
2513   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2514   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2515   micro_mad(&arg[2], &arg[0], &arg[1], &arg[2]);
2516
2517   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2518      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2519         store_dest(mach, &arg[2], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2520      }
2521   }
2522}
2523
2524static void
2525exec_nrm4(struct tgsi_exec_machine *mach,
2526          const struct tgsi_full_instruction *inst)
2527{
2528   unsigned int chan;
2529   union tgsi_exec_channel arg[4];
2530   union tgsi_exec_channel scale;
2531
2532   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2533   micro_mul(&scale, &arg[0], &arg[0]);
2534
2535   for (chan = CHAN_Y; chan <= CHAN_W; chan++) {
2536      union tgsi_exec_channel product;
2537
2538      fetch_source(mach, &arg[chan], &inst->Src[0], chan, TGSI_EXEC_DATA_FLOAT);
2539      micro_mul(&product, &arg[chan], &arg[chan]);
2540      micro_add(&scale, &scale, &product);
2541   }
2542
2543   micro_rsq(&scale, &scale);
2544
2545   for (chan = CHAN_X; chan <= CHAN_W; chan++) {
2546      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2547         micro_mul(&arg[chan], &arg[chan], &scale);
2548         store_dest(mach, &arg[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2549      }
2550   }
2551}
2552
2553static void
2554exec_nrm3(struct tgsi_exec_machine *mach,
2555          const struct tgsi_full_instruction *inst)
2556{
2557   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_XYZ) {
2558      unsigned int chan;
2559      union tgsi_exec_channel arg[3];
2560      union tgsi_exec_channel scale;
2561
2562      fetch_source(mach, &arg[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2563      micro_mul(&scale, &arg[0], &arg[0]);
2564
2565      for (chan = CHAN_Y; chan <= CHAN_Z; chan++) {
2566         union tgsi_exec_channel product;
2567
2568         fetch_source(mach, &arg[chan], &inst->Src[0], chan, TGSI_EXEC_DATA_FLOAT);
2569         micro_mul(&product, &arg[chan], &arg[chan]);
2570         micro_add(&scale, &scale, &product);
2571      }
2572
2573      micro_rsq(&scale, &scale);
2574
2575      for (chan = CHAN_X; chan <= CHAN_Z; chan++) {
2576         if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2577            micro_mul(&arg[chan], &arg[chan], &scale);
2578            store_dest(mach, &arg[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2579         }
2580      }
2581   }
2582
2583   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
2584      store_dest(mach, &OneVec, &inst->Dst[0], inst, CHAN_W, TGSI_EXEC_DATA_FLOAT);
2585   }
2586}
2587
2588static void
2589exec_scs(struct tgsi_exec_machine *mach,
2590         const struct tgsi_full_instruction *inst)
2591{
2592   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_XY) {
2593      union tgsi_exec_channel arg;
2594      union tgsi_exec_channel result;
2595
2596      fetch_source(mach, &arg, &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2597
2598      if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
2599         micro_cos(&result, &arg);
2600         store_dest(mach, &result, &inst->Dst[0], inst, CHAN_X, TGSI_EXEC_DATA_FLOAT);
2601      }
2602      if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
2603         micro_sin(&result, &arg);
2604         store_dest(mach, &result, &inst->Dst[0], inst, CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2605      }
2606   }
2607   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
2608      store_dest(mach, &ZeroVec, &inst->Dst[0], inst, CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2609   }
2610   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
2611      store_dest(mach, &OneVec, &inst->Dst[0], inst, CHAN_W, TGSI_EXEC_DATA_FLOAT);
2612   }
2613}
2614
2615static void
2616exec_x2d(struct tgsi_exec_machine *mach,
2617         const struct tgsi_full_instruction *inst)
2618{
2619   union tgsi_exec_channel r[4];
2620   union tgsi_exec_channel d[2];
2621
2622   fetch_source(mach, &r[0], &inst->Src[1], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2623   fetch_source(mach, &r[1], &inst->Src[1], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2624   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_XZ) {
2625      fetch_source(mach, &r[2], &inst->Src[2], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2626      micro_mul(&r[2], &r[2], &r[0]);
2627      fetch_source(mach, &r[3], &inst->Src[2], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2628      micro_mul(&r[3], &r[3], &r[1]);
2629      micro_add(&r[2], &r[2], &r[3]);
2630      fetch_source(mach, &r[3], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2631      micro_add(&d[0], &r[2], &r[3]);
2632   }
2633   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_YW) {
2634      fetch_source(mach, &r[2], &inst->Src[2], CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2635      micro_mul(&r[2], &r[2], &r[0]);
2636      fetch_source(mach, &r[3], &inst->Src[2], CHAN_W, TGSI_EXEC_DATA_FLOAT);
2637      micro_mul(&r[3], &r[3], &r[1]);
2638      micro_add(&r[2], &r[2], &r[3]);
2639      fetch_source(mach, &r[3], &inst->Src[0], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2640      micro_add(&d[1], &r[2], &r[3]);
2641   }
2642   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
2643      store_dest(mach, &d[0], &inst->Dst[0], inst, CHAN_X, TGSI_EXEC_DATA_FLOAT);
2644   }
2645   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
2646      store_dest(mach, &d[1], &inst->Dst[0], inst, CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2647   }
2648   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
2649      store_dest(mach, &d[0], &inst->Dst[0], inst, CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2650   }
2651   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
2652      store_dest(mach, &d[1], &inst->Dst[0], inst, CHAN_W, TGSI_EXEC_DATA_FLOAT);
2653   }
2654}
2655
2656static void
2657exec_rfl(struct tgsi_exec_machine *mach,
2658         const struct tgsi_full_instruction *inst)
2659{
2660   union tgsi_exec_channel r[9];
2661
2662   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_XYZ) {
2663      /* r0 = dp3(src0, src0) */
2664      fetch_source(mach, &r[2], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2665      micro_mul(&r[0], &r[2], &r[2]);
2666      fetch_source(mach, &r[4], &inst->Src[0], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2667      micro_mul(&r[8], &r[4], &r[4]);
2668      micro_add(&r[0], &r[0], &r[8]);
2669      fetch_source(mach, &r[6], &inst->Src[0], CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2670      micro_mul(&r[8], &r[6], &r[6]);
2671      micro_add(&r[0], &r[0], &r[8]);
2672
2673      /* r1 = dp3(src0, src1) */
2674      fetch_source(mach, &r[3], &inst->Src[1], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2675      micro_mul(&r[1], &r[2], &r[3]);
2676      fetch_source(mach, &r[5], &inst->Src[1], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2677      micro_mul(&r[8], &r[4], &r[5]);
2678      micro_add(&r[1], &r[1], &r[8]);
2679      fetch_source(mach, &r[7], &inst->Src[1], CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2680      micro_mul(&r[8], &r[6], &r[7]);
2681      micro_add(&r[1], &r[1], &r[8]);
2682
2683      /* r1 = 2 * r1 / r0 */
2684      micro_add(&r[1], &r[1], &r[1]);
2685      micro_div(&r[1], &r[1], &r[0]);
2686
2687      if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
2688         micro_mul(&r[2], &r[2], &r[1]);
2689         micro_sub(&r[2], &r[2], &r[3]);
2690         store_dest(mach, &r[2], &inst->Dst[0], inst, CHAN_X, TGSI_EXEC_DATA_FLOAT);
2691      }
2692      if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
2693         micro_mul(&r[4], &r[4], &r[1]);
2694         micro_sub(&r[4], &r[4], &r[5]);
2695         store_dest(mach, &r[4], &inst->Dst[0], inst, CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2696      }
2697      if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
2698         micro_mul(&r[6], &r[6], &r[1]);
2699         micro_sub(&r[6], &r[6], &r[7]);
2700         store_dest(mach, &r[6], &inst->Dst[0], inst, CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2701      }
2702   }
2703   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
2704      store_dest(mach, &OneVec, &inst->Dst[0], inst, CHAN_W, TGSI_EXEC_DATA_FLOAT);
2705   }
2706}
2707
2708static void
2709exec_xpd(struct tgsi_exec_machine *mach,
2710         const struct tgsi_full_instruction *inst)
2711{
2712   union tgsi_exec_channel r[6];
2713   union tgsi_exec_channel d[3];
2714
2715   fetch_source(mach, &r[0], &inst->Src[0], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2716   fetch_source(mach, &r[1], &inst->Src[1], CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2717
2718   micro_mul(&r[2], &r[0], &r[1]);
2719
2720   fetch_source(mach, &r[3], &inst->Src[0], CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2721   fetch_source(mach, &r[4], &inst->Src[1], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2722
2723   micro_mul(&r[5], &r[3], &r[4] );
2724   micro_sub(&d[CHAN_X], &r[2], &r[5]);
2725
2726   fetch_source(mach, &r[2], &inst->Src[1], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2727
2728   micro_mul(&r[3], &r[3], &r[2]);
2729
2730   fetch_source(mach, &r[5], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2731
2732   micro_mul(&r[1], &r[1], &r[5]);
2733   micro_sub(&d[CHAN_Y], &r[3], &r[1]);
2734
2735   micro_mul(&r[5], &r[5], &r[4]);
2736   micro_mul(&r[0], &r[0], &r[2]);
2737   micro_sub(&d[CHAN_Z], &r[5], &r[0]);
2738
2739   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
2740      store_dest(mach, &d[CHAN_X], &inst->Dst[0], inst, CHAN_X, TGSI_EXEC_DATA_FLOAT);
2741   }
2742   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
2743      store_dest(mach, &d[CHAN_Y], &inst->Dst[0], inst, CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2744   }
2745   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
2746      store_dest(mach, &d[CHAN_Z], &inst->Dst[0], inst, CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2747   }
2748   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
2749      store_dest(mach, &OneVec, &inst->Dst[0], inst, CHAN_W, TGSI_EXEC_DATA_FLOAT);
2750   }
2751}
2752
2753static void
2754exec_dst(struct tgsi_exec_machine *mach,
2755         const struct tgsi_full_instruction *inst)
2756{
2757   union tgsi_exec_channel r[2];
2758   union tgsi_exec_channel d[4];
2759
2760   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
2761      fetch_source(mach, &r[0], &inst->Src[0], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2762      fetch_source(mach, &r[1], &inst->Src[1], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2763      micro_mul(&d[CHAN_Y], &r[0], &r[1]);
2764   }
2765   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
2766      fetch_source(mach, &d[CHAN_Z], &inst->Src[0], CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2767   }
2768   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
2769      fetch_source(mach, &d[CHAN_W], &inst->Src[1], CHAN_W, TGSI_EXEC_DATA_FLOAT);
2770   }
2771
2772   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
2773      store_dest(mach, &OneVec, &inst->Dst[0], inst, CHAN_X, TGSI_EXEC_DATA_FLOAT);
2774   }
2775   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
2776      store_dest(mach, &d[CHAN_Y], &inst->Dst[0], inst, CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2777   }
2778   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
2779      store_dest(mach, &d[CHAN_Z], &inst->Dst[0], inst, CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2780   }
2781   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
2782      store_dest(mach, &d[CHAN_W], &inst->Dst[0], inst, CHAN_W, TGSI_EXEC_DATA_FLOAT);
2783   }
2784}
2785
2786static void
2787exec_log(struct tgsi_exec_machine *mach,
2788         const struct tgsi_full_instruction *inst)
2789{
2790   union tgsi_exec_channel r[3];
2791
2792   fetch_source(mach, &r[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2793   micro_abs(&r[2], &r[0]);  /* r2 = abs(r0) */
2794   micro_lg2(&r[1], &r[2]);  /* r1 = lg2(r2) */
2795   micro_flr(&r[0], &r[1]);  /* r0 = floor(r1) */
2796   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
2797      store_dest(mach, &r[0], &inst->Dst[0], inst, CHAN_X, TGSI_EXEC_DATA_FLOAT);
2798   }
2799   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
2800      micro_exp2(&r[0], &r[0]);       /* r0 = 2 ^ r0 */
2801      micro_div(&r[0], &r[2], &r[0]); /* r0 = r2 / r0 */
2802      store_dest(mach, &r[0], &inst->Dst[0], inst, CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2803   }
2804   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
2805      store_dest(mach, &r[1], &inst->Dst[0], inst, CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2806   }
2807   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
2808      store_dest(mach, &OneVec, &inst->Dst[0], inst, CHAN_W, TGSI_EXEC_DATA_FLOAT);
2809   }
2810}
2811
2812static void
2813exec_exp(struct tgsi_exec_machine *mach,
2814         const struct tgsi_full_instruction *inst)
2815{
2816   union tgsi_exec_channel r[3];
2817
2818   fetch_source(mach, &r[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2819   micro_flr(&r[1], &r[0]);  /* r1 = floor(r0) */
2820   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
2821      micro_exp2(&r[2], &r[1]);       /* r2 = 2 ^ r1 */
2822      store_dest(mach, &r[2], &inst->Dst[0], inst, CHAN_X, TGSI_EXEC_DATA_FLOAT);
2823   }
2824   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
2825      micro_sub(&r[2], &r[0], &r[1]); /* r2 = r0 - r1 */
2826      store_dest(mach, &r[2], &inst->Dst[0], inst, CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2827   }
2828   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
2829      micro_exp2(&r[2], &r[0]);       /* r2 = 2 ^ r0 */
2830      store_dest(mach, &r[2], &inst->Dst[0], inst, CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2831   }
2832   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
2833      store_dest(mach, &OneVec, &inst->Dst[0], inst, CHAN_W, TGSI_EXEC_DATA_FLOAT);
2834   }
2835}
2836
2837static void
2838exec_lit(struct tgsi_exec_machine *mach,
2839         const struct tgsi_full_instruction *inst)
2840{
2841   union tgsi_exec_channel r[3];
2842   union tgsi_exec_channel d[3];
2843
2844   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
2845      store_dest(mach, &OneVec, &inst->Dst[0], inst, CHAN_X, TGSI_EXEC_DATA_FLOAT);
2846   }
2847   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_YZ) {
2848      fetch_source(mach, &r[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2849      if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
2850         micro_max(&d[CHAN_Y], &r[0], &ZeroVec);
2851         store_dest(mach, &d[CHAN_Y], &inst->Dst[0], inst, CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2852      }
2853
2854      if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
2855         fetch_source(mach, &r[1], &inst->Src[0], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2856         micro_max(&r[1], &r[1], &ZeroVec);
2857
2858         fetch_source(mach, &r[2], &inst->Src[0], CHAN_W, TGSI_EXEC_DATA_FLOAT);
2859         micro_min(&r[2], &r[2], &P128Vec);
2860         micro_max(&r[2], &r[2], &M128Vec);
2861         micro_pow(&r[1], &r[1], &r[2]);
2862         micro_lt(&d[CHAN_Z], &ZeroVec, &r[0], &r[1], &ZeroVec);
2863         store_dest(mach, &d[CHAN_Z], &inst->Dst[0], inst, CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2864      }
2865   }
2866   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
2867      store_dest(mach, &OneVec, &inst->Dst[0], inst, CHAN_W, TGSI_EXEC_DATA_FLOAT);
2868   }
2869}
2870
2871static void
2872exec_break(struct tgsi_exec_machine *mach)
2873{
2874   if (mach->BreakType == TGSI_EXEC_BREAK_INSIDE_LOOP) {
2875      /* turn off loop channels for each enabled exec channel */
2876      mach->LoopMask &= ~mach->ExecMask;
2877      /* Todo: if mach->LoopMask == 0, jump to end of loop */
2878      UPDATE_EXEC_MASK(mach);
2879   } else {
2880      assert(mach->BreakType == TGSI_EXEC_BREAK_INSIDE_SWITCH);
2881
2882      mach->Switch.mask = 0x0;
2883
2884      UPDATE_EXEC_MASK(mach);
2885   }
2886}
2887
2888static void
2889exec_switch(struct tgsi_exec_machine *mach,
2890            const struct tgsi_full_instruction *inst)
2891{
2892   assert(mach->SwitchStackTop < TGSI_EXEC_MAX_SWITCH_NESTING);
2893   assert(mach->BreakStackTop < TGSI_EXEC_MAX_BREAK_STACK);
2894
2895   mach->SwitchStack[mach->SwitchStackTop++] = mach->Switch;
2896   fetch_source(mach, &mach->Switch.selector, &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_UINT);
2897   mach->Switch.mask = 0x0;
2898   mach->Switch.defaultMask = 0x0;
2899
2900   mach->BreakStack[mach->BreakStackTop++] = mach->BreakType;
2901   mach->BreakType = TGSI_EXEC_BREAK_INSIDE_SWITCH;
2902
2903   UPDATE_EXEC_MASK(mach);
2904}
2905
2906static void
2907exec_case(struct tgsi_exec_machine *mach,
2908          const struct tgsi_full_instruction *inst)
2909{
2910   uint prevMask = mach->SwitchStack[mach->SwitchStackTop - 1].mask;
2911   union tgsi_exec_channel src;
2912   uint mask = 0;
2913
2914   fetch_source(mach, &src, &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_UINT);
2915
2916   if (mach->Switch.selector.u[0] == src.u[0]) {
2917      mask |= 0x1;
2918   }
2919   if (mach->Switch.selector.u[1] == src.u[1]) {
2920      mask |= 0x2;
2921   }
2922   if (mach->Switch.selector.u[2] == src.u[2]) {
2923      mask |= 0x4;
2924   }
2925   if (mach->Switch.selector.u[3] == src.u[3]) {
2926      mask |= 0x8;
2927   }
2928
2929   mach->Switch.defaultMask |= mask;
2930
2931   mach->Switch.mask |= mask & prevMask;
2932
2933   UPDATE_EXEC_MASK(mach);
2934}
2935
2936static void
2937exec_default(struct tgsi_exec_machine *mach)
2938{
2939   uint prevMask = mach->SwitchStack[mach->SwitchStackTop - 1].mask;
2940
2941   mach->Switch.mask |= ~mach->Switch.defaultMask & prevMask;
2942
2943   UPDATE_EXEC_MASK(mach);
2944}
2945
2946static void
2947exec_endswitch(struct tgsi_exec_machine *mach)
2948{
2949   mach->Switch = mach->SwitchStack[--mach->SwitchStackTop];
2950   mach->BreakType = mach->BreakStack[--mach->BreakStackTop];
2951
2952   UPDATE_EXEC_MASK(mach);
2953}
2954
2955static void
2956micro_i2f(union tgsi_exec_channel *dst,
2957          const union tgsi_exec_channel *src)
2958{
2959   dst->f[0] = (float)src->i[0];
2960   dst->f[1] = (float)src->i[1];
2961   dst->f[2] = (float)src->i[2];
2962   dst->f[3] = (float)src->i[3];
2963}
2964
2965static void
2966micro_not(union tgsi_exec_channel *dst,
2967          const union tgsi_exec_channel *src)
2968{
2969   dst->u[0] = ~src->u[0];
2970   dst->u[1] = ~src->u[1];
2971   dst->u[2] = ~src->u[2];
2972   dst->u[3] = ~src->u[3];
2973}
2974
2975static void
2976micro_shl(union tgsi_exec_channel *dst,
2977          const union tgsi_exec_channel *src0,
2978          const union tgsi_exec_channel *src1)
2979{
2980   dst->u[0] = src0->u[0] << src1->u[0];
2981   dst->u[1] = src0->u[1] << src1->u[1];
2982   dst->u[2] = src0->u[2] << src1->u[2];
2983   dst->u[3] = src0->u[3] << src1->u[3];
2984}
2985
2986static void
2987micro_and(union tgsi_exec_channel *dst,
2988          const union tgsi_exec_channel *src0,
2989          const union tgsi_exec_channel *src1)
2990{
2991   dst->u[0] = src0->u[0] & src1->u[0];
2992   dst->u[1] = src0->u[1] & src1->u[1];
2993   dst->u[2] = src0->u[2] & src1->u[2];
2994   dst->u[3] = src0->u[3] & src1->u[3];
2995}
2996
2997static void
2998micro_or(union tgsi_exec_channel *dst,
2999         const union tgsi_exec_channel *src0,
3000         const union tgsi_exec_channel *src1)
3001{
3002   dst->u[0] = src0->u[0] | src1->u[0];
3003   dst->u[1] = src0->u[1] | src1->u[1];
3004   dst->u[2] = src0->u[2] | src1->u[2];
3005   dst->u[3] = src0->u[3] | src1->u[3];
3006}
3007
3008static void
3009micro_xor(union tgsi_exec_channel *dst,
3010          const union tgsi_exec_channel *src0,
3011          const union tgsi_exec_channel *src1)
3012{
3013   dst->u[0] = src0->u[0] ^ src1->u[0];
3014   dst->u[1] = src0->u[1] ^ src1->u[1];
3015   dst->u[2] = src0->u[2] ^ src1->u[2];
3016   dst->u[3] = src0->u[3] ^ src1->u[3];
3017}
3018
3019static void
3020micro_mod(union tgsi_exec_channel *dst,
3021          const union tgsi_exec_channel *src0,
3022          const union tgsi_exec_channel *src1)
3023{
3024   dst->i[0] = src0->i[0] % src1->i[0];
3025   dst->i[1] = src0->i[1] % src1->i[1];
3026   dst->i[2] = src0->i[2] % src1->i[2];
3027   dst->i[3] = src0->i[3] % src1->i[3];
3028}
3029
3030static void
3031micro_f2i(union tgsi_exec_channel *dst,
3032          const union tgsi_exec_channel *src)
3033{
3034   dst->i[0] = (int)src->f[0];
3035   dst->i[1] = (int)src->f[1];
3036   dst->i[2] = (int)src->f[2];
3037   dst->i[3] = (int)src->f[3];
3038}
3039
3040static void
3041micro_idiv(union tgsi_exec_channel *dst,
3042           const union tgsi_exec_channel *src0,
3043           const union tgsi_exec_channel *src1)
3044{
3045   dst->i[0] = src0->i[0] / src1->i[0];
3046   dst->i[1] = src0->i[1] / src1->i[1];
3047   dst->i[2] = src0->i[2] / src1->i[2];
3048   dst->i[3] = src0->i[3] / src1->i[3];
3049}
3050
3051static void
3052micro_imax(union tgsi_exec_channel *dst,
3053           const union tgsi_exec_channel *src0,
3054           const union tgsi_exec_channel *src1)
3055{
3056   dst->i[0] = src0->i[0] > src1->i[0] ? src0->i[0] : src1->i[0];
3057   dst->i[1] = src0->i[1] > src1->i[1] ? src0->i[1] : src1->i[1];
3058   dst->i[2] = src0->i[2] > src1->i[2] ? src0->i[2] : src1->i[2];
3059   dst->i[3] = src0->i[3] > src1->i[3] ? src0->i[3] : src1->i[3];
3060}
3061
3062static void
3063micro_imin(union tgsi_exec_channel *dst,
3064           const union tgsi_exec_channel *src0,
3065           const union tgsi_exec_channel *src1)
3066{
3067   dst->i[0] = src0->i[0] < src1->i[0] ? src0->i[0] : src1->i[0];
3068   dst->i[1] = src0->i[1] < src1->i[1] ? src0->i[1] : src1->i[1];
3069   dst->i[2] = src0->i[2] < src1->i[2] ? src0->i[2] : src1->i[2];
3070   dst->i[3] = src0->i[3] < src1->i[3] ? src0->i[3] : src1->i[3];
3071}
3072
3073static void
3074micro_isge(union tgsi_exec_channel *dst,
3075           const union tgsi_exec_channel *src0,
3076           const union tgsi_exec_channel *src1)
3077{
3078   dst->i[0] = src0->i[0] >= src1->i[0] ? -1 : 0;
3079   dst->i[1] = src0->i[1] >= src1->i[1] ? -1 : 0;
3080   dst->i[2] = src0->i[2] >= src1->i[2] ? -1 : 0;
3081   dst->i[3] = src0->i[3] >= src1->i[3] ? -1 : 0;
3082}
3083
3084static void
3085micro_ishr(union tgsi_exec_channel *dst,
3086           const union tgsi_exec_channel *src0,
3087           const union tgsi_exec_channel *src1)
3088{
3089   dst->i[0] = src0->i[0] >> src1->i[0];
3090   dst->i[1] = src0->i[1] >> src1->i[1];
3091   dst->i[2] = src0->i[2] >> src1->i[2];
3092   dst->i[3] = src0->i[3] >> src1->i[3];
3093}
3094
3095static void
3096micro_islt(union tgsi_exec_channel *dst,
3097           const union tgsi_exec_channel *src0,
3098           const union tgsi_exec_channel *src1)
3099{
3100   dst->i[0] = src0->i[0] < src1->i[0] ? -1 : 0;
3101   dst->i[1] = src0->i[1] < src1->i[1] ? -1 : 0;
3102   dst->i[2] = src0->i[2] < src1->i[2] ? -1 : 0;
3103   dst->i[3] = src0->i[3] < src1->i[3] ? -1 : 0;
3104}
3105
3106static void
3107micro_f2u(union tgsi_exec_channel *dst,
3108          const union tgsi_exec_channel *src)
3109{
3110   dst->u[0] = (uint)src->f[0];
3111   dst->u[1] = (uint)src->f[1];
3112   dst->u[2] = (uint)src->f[2];
3113   dst->u[3] = (uint)src->f[3];
3114}
3115
3116static void
3117micro_u2f(union tgsi_exec_channel *dst,
3118          const union tgsi_exec_channel *src)
3119{
3120   dst->f[0] = (float)src->u[0];
3121   dst->f[1] = (float)src->u[1];
3122   dst->f[2] = (float)src->u[2];
3123   dst->f[3] = (float)src->u[3];
3124}
3125
3126static void
3127micro_uadd(union tgsi_exec_channel *dst,
3128           const union tgsi_exec_channel *src0,
3129           const union tgsi_exec_channel *src1)
3130{
3131   dst->u[0] = src0->u[0] + src1->u[0];
3132   dst->u[1] = src0->u[1] + src1->u[1];
3133   dst->u[2] = src0->u[2] + src1->u[2];
3134   dst->u[3] = src0->u[3] + src1->u[3];
3135}
3136
3137static void
3138micro_udiv(union tgsi_exec_channel *dst,
3139           const union tgsi_exec_channel *src0,
3140           const union tgsi_exec_channel *src1)
3141{
3142   dst->u[0] = src0->u[0] / src1->u[0];
3143   dst->u[1] = src0->u[1] / src1->u[1];
3144   dst->u[2] = src0->u[2] / src1->u[2];
3145   dst->u[3] = src0->u[3] / src1->u[3];
3146}
3147
3148static void
3149micro_umad(union tgsi_exec_channel *dst,
3150           const union tgsi_exec_channel *src0,
3151           const union tgsi_exec_channel *src1,
3152           const union tgsi_exec_channel *src2)
3153{
3154   dst->u[0] = src0->u[0] * src1->u[0] + src2->u[0];
3155   dst->u[1] = src0->u[1] * src1->u[1] + src2->u[1];
3156   dst->u[2] = src0->u[2] * src1->u[2] + src2->u[2];
3157   dst->u[3] = src0->u[3] * src1->u[3] + src2->u[3];
3158}
3159
3160static void
3161micro_umax(union tgsi_exec_channel *dst,
3162           const union tgsi_exec_channel *src0,
3163           const union tgsi_exec_channel *src1)
3164{
3165   dst->u[0] = src0->u[0] > src1->u[0] ? src0->u[0] : src1->u[0];
3166   dst->u[1] = src0->u[1] > src1->u[1] ? src0->u[1] : src1->u[1];
3167   dst->u[2] = src0->u[2] > src1->u[2] ? src0->u[2] : src1->u[2];
3168   dst->u[3] = src0->u[3] > src1->u[3] ? src0->u[3] : src1->u[3];
3169}
3170
3171static void
3172micro_umin(union tgsi_exec_channel *dst,
3173           const union tgsi_exec_channel *src0,
3174           const union tgsi_exec_channel *src1)
3175{
3176   dst->u[0] = src0->u[0] < src1->u[0] ? src0->u[0] : src1->u[0];
3177   dst->u[1] = src0->u[1] < src1->u[1] ? src0->u[1] : src1->u[1];
3178   dst->u[2] = src0->u[2] < src1->u[2] ? src0->u[2] : src1->u[2];
3179   dst->u[3] = src0->u[3] < src1->u[3] ? src0->u[3] : src1->u[3];
3180}
3181
3182static void
3183micro_umod(union tgsi_exec_channel *dst,
3184           const union tgsi_exec_channel *src0,
3185           const union tgsi_exec_channel *src1)
3186{
3187   dst->u[0] = src0->u[0] % src1->u[0];
3188   dst->u[1] = src0->u[1] % src1->u[1];
3189   dst->u[2] = src0->u[2] % src1->u[2];
3190   dst->u[3] = src0->u[3] % src1->u[3];
3191}
3192
3193static void
3194micro_umul(union tgsi_exec_channel *dst,
3195           const union tgsi_exec_channel *src0,
3196           const union tgsi_exec_channel *src1)
3197{
3198   dst->u[0] = src0->u[0] * src1->u[0];
3199   dst->u[1] = src0->u[1] * src1->u[1];
3200   dst->u[2] = src0->u[2] * src1->u[2];
3201   dst->u[3] = src0->u[3] * src1->u[3];
3202}
3203
3204static void
3205micro_useq(union tgsi_exec_channel *dst,
3206           const union tgsi_exec_channel *src0,
3207           const union tgsi_exec_channel *src1)
3208{
3209   dst->u[0] = src0->u[0] == src1->u[0] ? ~0 : 0;
3210   dst->u[1] = src0->u[1] == src1->u[1] ? ~0 : 0;
3211   dst->u[2] = src0->u[2] == src1->u[2] ? ~0 : 0;
3212   dst->u[3] = src0->u[3] == src1->u[3] ? ~0 : 0;
3213}
3214
3215static void
3216micro_usge(union tgsi_exec_channel *dst,
3217           const union tgsi_exec_channel *src0,
3218           const union tgsi_exec_channel *src1)
3219{
3220   dst->u[0] = src0->u[0] >= src1->u[0] ? ~0 : 0;
3221   dst->u[1] = src0->u[1] >= src1->u[1] ? ~0 : 0;
3222   dst->u[2] = src0->u[2] >= src1->u[2] ? ~0 : 0;
3223   dst->u[3] = src0->u[3] >= src1->u[3] ? ~0 : 0;
3224}
3225
3226static void
3227micro_ushr(union tgsi_exec_channel *dst,
3228           const union tgsi_exec_channel *src0,
3229           const union tgsi_exec_channel *src1)
3230{
3231   dst->u[0] = src0->u[0] >> src1->u[0];
3232   dst->u[1] = src0->u[1] >> src1->u[1];
3233   dst->u[2] = src0->u[2] >> src1->u[2];
3234   dst->u[3] = src0->u[3] >> src1->u[3];
3235}
3236
3237static void
3238micro_uslt(union tgsi_exec_channel *dst,
3239           const union tgsi_exec_channel *src0,
3240           const union tgsi_exec_channel *src1)
3241{
3242   dst->u[0] = src0->u[0] < src1->u[0] ? ~0 : 0;
3243   dst->u[1] = src0->u[1] < src1->u[1] ? ~0 : 0;
3244   dst->u[2] = src0->u[2] < src1->u[2] ? ~0 : 0;
3245   dst->u[3] = src0->u[3] < src1->u[3] ? ~0 : 0;
3246}
3247
3248static void
3249micro_usne(union tgsi_exec_channel *dst,
3250           const union tgsi_exec_channel *src0,
3251           const union tgsi_exec_channel *src1)
3252{
3253   dst->u[0] = src0->u[0] != src1->u[0] ? ~0 : 0;
3254   dst->u[1] = src0->u[1] != src1->u[1] ? ~0 : 0;
3255   dst->u[2] = src0->u[2] != src1->u[2] ? ~0 : 0;
3256   dst->u[3] = src0->u[3] != src1->u[3] ? ~0 : 0;
3257}
3258
3259static void
3260exec_instruction(
3261   struct tgsi_exec_machine *mach,
3262   const struct tgsi_full_instruction *inst,
3263   int *pc )
3264{
3265   union tgsi_exec_channel r[10];
3266
3267   (*pc)++;
3268
3269   switch (inst->Instruction.Opcode) {
3270   case TGSI_OPCODE_ARL:
3271      exec_vector_unary(mach, inst, micro_arl, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_FLOAT);
3272      break;
3273
3274   case TGSI_OPCODE_MOV:
3275      exec_vector_unary(mach, inst, micro_mov, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_FLOAT);
3276      break;
3277
3278   case TGSI_OPCODE_LIT:
3279      exec_lit(mach, inst);
3280      break;
3281
3282   case TGSI_OPCODE_RCP:
3283      exec_scalar_unary(mach, inst, micro_rcp, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3284      break;
3285
3286   case TGSI_OPCODE_RSQ:
3287      exec_scalar_unary(mach, inst, micro_rsq, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3288      break;
3289
3290   case TGSI_OPCODE_EXP:
3291      exec_exp(mach, inst);
3292      break;
3293
3294   case TGSI_OPCODE_LOG:
3295      exec_log(mach, inst);
3296      break;
3297
3298   case TGSI_OPCODE_MUL:
3299      exec_vector_binary(mach, inst, micro_mul, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3300      break;
3301
3302   case TGSI_OPCODE_ADD:
3303      exec_vector_binary(mach, inst, micro_add, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3304      break;
3305
3306   case TGSI_OPCODE_DP3:
3307      exec_dp3(mach, inst);
3308      break;
3309
3310   case TGSI_OPCODE_DP4:
3311      exec_dp4(mach, inst);
3312      break;
3313
3314   case TGSI_OPCODE_DST:
3315      exec_dst(mach, inst);
3316      break;
3317
3318   case TGSI_OPCODE_MIN:
3319      exec_vector_binary(mach, inst, micro_min, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3320      break;
3321
3322   case TGSI_OPCODE_MAX:
3323      exec_vector_binary(mach, inst, micro_max, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3324      break;
3325
3326   case TGSI_OPCODE_SLT:
3327      exec_vector_binary(mach, inst, micro_slt, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3328      break;
3329
3330   case TGSI_OPCODE_SGE:
3331      exec_vector_binary(mach, inst, micro_sge, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3332      break;
3333
3334   case TGSI_OPCODE_MAD:
3335      exec_vector_trinary(mach, inst, micro_mad, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3336      break;
3337
3338   case TGSI_OPCODE_SUB:
3339      exec_vector_binary(mach, inst, micro_sub, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3340      break;
3341
3342   case TGSI_OPCODE_LRP:
3343      exec_vector_trinary(mach, inst, micro_lrp, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3344      break;
3345
3346   case TGSI_OPCODE_CND:
3347      exec_vector_trinary(mach, inst, micro_cnd, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3348      break;
3349
3350   case TGSI_OPCODE_DP2A:
3351      exec_dp2a(mach, inst);
3352      break;
3353
3354   case TGSI_OPCODE_FRC:
3355      exec_vector_unary(mach, inst, micro_frc, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3356      break;
3357
3358   case TGSI_OPCODE_CLAMP:
3359      exec_vector_trinary(mach, inst, micro_clamp, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3360      break;
3361
3362   case TGSI_OPCODE_FLR:
3363      exec_vector_unary(mach, inst, micro_flr, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3364      break;
3365
3366   case TGSI_OPCODE_ROUND:
3367      exec_vector_unary(mach, inst, micro_rnd, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3368      break;
3369
3370   case TGSI_OPCODE_EX2:
3371      exec_scalar_unary(mach, inst, micro_exp2, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3372      break;
3373
3374   case TGSI_OPCODE_LG2:
3375      exec_scalar_unary(mach, inst, micro_lg2, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3376      break;
3377
3378   case TGSI_OPCODE_POW:
3379      exec_scalar_binary(mach, inst, micro_pow, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3380      break;
3381
3382   case TGSI_OPCODE_XPD:
3383      exec_xpd(mach, inst);
3384      break;
3385
3386   case TGSI_OPCODE_ABS:
3387      exec_vector_unary(mach, inst, micro_abs, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3388      break;
3389
3390   case TGSI_OPCODE_RCC:
3391      exec_scalar_unary(mach, inst, micro_rcc, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3392      break;
3393
3394   case TGSI_OPCODE_DPH:
3395      exec_dph(mach, inst);
3396      break;
3397
3398   case TGSI_OPCODE_COS:
3399      exec_scalar_unary(mach, inst, micro_cos, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3400      break;
3401
3402   case TGSI_OPCODE_DDX:
3403      exec_vector_unary(mach, inst, micro_ddx, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3404      break;
3405
3406   case TGSI_OPCODE_DDY:
3407      exec_vector_unary(mach, inst, micro_ddy, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3408      break;
3409
3410   case TGSI_OPCODE_KILP:
3411      exec_kilp (mach, inst);
3412      break;
3413
3414   case TGSI_OPCODE_KIL:
3415      exec_kil (mach, inst);
3416      break;
3417
3418   case TGSI_OPCODE_PK2H:
3419      assert (0);
3420      break;
3421
3422   case TGSI_OPCODE_PK2US:
3423      assert (0);
3424      break;
3425
3426   case TGSI_OPCODE_PK4B:
3427      assert (0);
3428      break;
3429
3430   case TGSI_OPCODE_PK4UB:
3431      assert (0);
3432      break;
3433
3434   case TGSI_OPCODE_RFL:
3435      exec_rfl(mach, inst);
3436      break;
3437
3438   case TGSI_OPCODE_SEQ:
3439      exec_vector_binary(mach, inst, micro_seq, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3440      break;
3441
3442   case TGSI_OPCODE_SFL:
3443      exec_vector(mach, inst, micro_sfl, TGSI_EXEC_DATA_FLOAT);
3444      break;
3445
3446   case TGSI_OPCODE_SGT:
3447      exec_vector_binary(mach, inst, micro_sgt, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3448      break;
3449
3450   case TGSI_OPCODE_SIN:
3451      exec_scalar_unary(mach, inst, micro_sin, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3452      break;
3453
3454   case TGSI_OPCODE_SLE:
3455      exec_vector_binary(mach, inst, micro_sle, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3456      break;
3457
3458   case TGSI_OPCODE_SNE:
3459      exec_vector_binary(mach, inst, micro_sne, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3460      break;
3461
3462   case TGSI_OPCODE_STR:
3463      exec_vector(mach, inst, micro_str, TGSI_EXEC_DATA_FLOAT);
3464      break;
3465
3466   case TGSI_OPCODE_TEX:
3467      /* simple texture lookup */
3468      /* src[0] = texcoord */
3469      /* src[1] = sampler unit */
3470      exec_tex(mach, inst, TEX_MODIFIER_NONE);
3471      break;
3472
3473   case TGSI_OPCODE_TXB:
3474      /* Texture lookup with lod bias */
3475      /* src[0] = texcoord (src[0].w = LOD bias) */
3476      /* src[1] = sampler unit */
3477      exec_tex(mach, inst, TEX_MODIFIER_LOD_BIAS);
3478      break;
3479
3480   case TGSI_OPCODE_TXD:
3481      /* Texture lookup with explicit partial derivatives */
3482      /* src[0] = texcoord */
3483      /* src[1] = d[strq]/dx */
3484      /* src[2] = d[strq]/dy */
3485      /* src[3] = sampler unit */
3486      exec_txd(mach, inst);
3487      break;
3488
3489   case TGSI_OPCODE_TXL:
3490      /* Texture lookup with explicit LOD */
3491      /* src[0] = texcoord (src[0].w = LOD) */
3492      /* src[1] = sampler unit */
3493      exec_tex(mach, inst, TEX_MODIFIER_EXPLICIT_LOD);
3494      break;
3495
3496   case TGSI_OPCODE_TXP:
3497      /* Texture lookup with projection */
3498      /* src[0] = texcoord (src[0].w = projection) */
3499      /* src[1] = sampler unit */
3500      exec_tex(mach, inst, TEX_MODIFIER_PROJECTED);
3501      break;
3502
3503   case TGSI_OPCODE_UP2H:
3504      assert (0);
3505      break;
3506
3507   case TGSI_OPCODE_UP2US:
3508      assert (0);
3509      break;
3510
3511   case TGSI_OPCODE_UP4B:
3512      assert (0);
3513      break;
3514
3515   case TGSI_OPCODE_UP4UB:
3516      assert (0);
3517      break;
3518
3519   case TGSI_OPCODE_X2D:
3520      exec_x2d(mach, inst);
3521      break;
3522
3523   case TGSI_OPCODE_ARA:
3524      assert (0);
3525      break;
3526
3527   case TGSI_OPCODE_ARR:
3528      exec_vector_unary(mach, inst, micro_arr, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_FLOAT);
3529      break;
3530
3531   case TGSI_OPCODE_BRA:
3532      assert (0);
3533      break;
3534
3535   case TGSI_OPCODE_CAL:
3536      /* skip the call if no execution channels are enabled */
3537      if (mach->ExecMask) {
3538         /* do the call */
3539
3540         /* First, record the depths of the execution stacks.
3541          * This is important for deeply nested/looped return statements.
3542          * We have to unwind the stacks by the correct amount.  For a
3543          * real code generator, we could determine the number of entries
3544          * to pop off each stack with simple static analysis and avoid
3545          * implementing this data structure at run time.
3546          */
3547         mach->CallStack[mach->CallStackTop].CondStackTop = mach->CondStackTop;
3548         mach->CallStack[mach->CallStackTop].LoopStackTop = mach->LoopStackTop;
3549         mach->CallStack[mach->CallStackTop].ContStackTop = mach->ContStackTop;
3550         mach->CallStack[mach->CallStackTop].SwitchStackTop = mach->SwitchStackTop;
3551         mach->CallStack[mach->CallStackTop].BreakStackTop = mach->BreakStackTop;
3552         /* note that PC was already incremented above */
3553         mach->CallStack[mach->CallStackTop].ReturnAddr = *pc;
3554
3555         mach->CallStackTop++;
3556
3557         /* Second, push the Cond, Loop, Cont, Func stacks */
3558         assert(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING);
3559         assert(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
3560         assert(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
3561         assert(mach->SwitchStackTop < TGSI_EXEC_MAX_SWITCH_NESTING);
3562         assert(mach->BreakStackTop < TGSI_EXEC_MAX_BREAK_STACK);
3563         assert(mach->FuncStackTop < TGSI_EXEC_MAX_CALL_NESTING);
3564
3565         mach->CondStack[mach->CondStackTop++] = mach->CondMask;
3566         mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask;
3567         mach->ContStack[mach->ContStackTop++] = mach->ContMask;
3568         mach->SwitchStack[mach->SwitchStackTop++] = mach->Switch;
3569         mach->BreakStack[mach->BreakStackTop++] = mach->BreakType;
3570         mach->FuncStack[mach->FuncStackTop++] = mach->FuncMask;
3571
3572         /* Finally, jump to the subroutine */
3573         *pc = inst->Label.Label;
3574      }
3575      break;
3576
3577   case TGSI_OPCODE_RET:
3578      mach->FuncMask &= ~mach->ExecMask;
3579      UPDATE_EXEC_MASK(mach);
3580
3581      if (mach->FuncMask == 0x0) {
3582         /* really return now (otherwise, keep executing) */
3583
3584         if (mach->CallStackTop == 0) {
3585            /* returning from main() */
3586            mach->CondStackTop = 0;
3587            mach->LoopStackTop = 0;
3588            *pc = -1;
3589            return;
3590         }
3591
3592         assert(mach->CallStackTop > 0);
3593         mach->CallStackTop--;
3594
3595         mach->CondStackTop = mach->CallStack[mach->CallStackTop].CondStackTop;
3596         mach->CondMask = mach->CondStack[mach->CondStackTop];
3597
3598         mach->LoopStackTop = mach->CallStack[mach->CallStackTop].LoopStackTop;
3599         mach->LoopMask = mach->LoopStack[mach->LoopStackTop];
3600
3601         mach->ContStackTop = mach->CallStack[mach->CallStackTop].ContStackTop;
3602         mach->ContMask = mach->ContStack[mach->ContStackTop];
3603
3604         mach->SwitchStackTop = mach->CallStack[mach->CallStackTop].SwitchStackTop;
3605         mach->Switch = mach->SwitchStack[mach->SwitchStackTop];
3606
3607         mach->BreakStackTop = mach->CallStack[mach->CallStackTop].BreakStackTop;
3608         mach->BreakType = mach->BreakStack[mach->BreakStackTop];
3609
3610         assert(mach->FuncStackTop > 0);
3611         mach->FuncMask = mach->FuncStack[--mach->FuncStackTop];
3612
3613         *pc = mach->CallStack[mach->CallStackTop].ReturnAddr;
3614
3615         UPDATE_EXEC_MASK(mach);
3616      }
3617      break;
3618
3619   case TGSI_OPCODE_SSG:
3620      exec_vector_unary(mach, inst, micro_sgn, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3621      break;
3622
3623   case TGSI_OPCODE_CMP:
3624      exec_vector_trinary(mach, inst, micro_cmp, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3625      break;
3626
3627   case TGSI_OPCODE_SCS:
3628      exec_scs(mach, inst);
3629      break;
3630
3631   case TGSI_OPCODE_NRM:
3632      exec_nrm3(mach, inst);
3633      break;
3634
3635   case TGSI_OPCODE_NRM4:
3636      exec_nrm4(mach, inst);
3637      break;
3638
3639   case TGSI_OPCODE_DIV:
3640      exec_vector_binary(mach, inst, micro_div, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3641      break;
3642
3643   case TGSI_OPCODE_DP2:
3644      exec_dp2(mach, inst);
3645      break;
3646
3647   case TGSI_OPCODE_IF:
3648      /* push CondMask */
3649      assert(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING);
3650      mach->CondStack[mach->CondStackTop++] = mach->CondMask;
3651      FETCH( &r[0], 0, CHAN_X );
3652      /* update CondMask */
3653      if( ! r[0].u[0] ) {
3654         mach->CondMask &= ~0x1;
3655      }
3656      if( ! r[0].u[1] ) {
3657         mach->CondMask &= ~0x2;
3658      }
3659      if( ! r[0].u[2] ) {
3660         mach->CondMask &= ~0x4;
3661      }
3662      if( ! r[0].u[3] ) {
3663         mach->CondMask &= ~0x8;
3664      }
3665      UPDATE_EXEC_MASK(mach);
3666      /* Todo: If CondMask==0, jump to ELSE */
3667      break;
3668
3669   case TGSI_OPCODE_ELSE:
3670      /* invert CondMask wrt previous mask */
3671      {
3672         uint prevMask;
3673         assert(mach->CondStackTop > 0);
3674         prevMask = mach->CondStack[mach->CondStackTop - 1];
3675         mach->CondMask = ~mach->CondMask & prevMask;
3676         UPDATE_EXEC_MASK(mach);
3677         /* Todo: If CondMask==0, jump to ENDIF */
3678      }
3679      break;
3680
3681   case TGSI_OPCODE_ENDIF:
3682      /* pop CondMask */
3683      assert(mach->CondStackTop > 0);
3684      mach->CondMask = mach->CondStack[--mach->CondStackTop];
3685      UPDATE_EXEC_MASK(mach);
3686      break;
3687
3688   case TGSI_OPCODE_END:
3689      /* make sure we end primitives which haven't
3690       * been explicitly emitted */
3691      conditional_emit_primitive(mach);
3692      /* halt execution */
3693      *pc = -1;
3694      break;
3695
3696   case TGSI_OPCODE_PUSHA:
3697      assert (0);
3698      break;
3699
3700   case TGSI_OPCODE_POPA:
3701      assert (0);
3702      break;
3703
3704   case TGSI_OPCODE_CEIL:
3705      exec_vector_unary(mach, inst, micro_ceil, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3706      break;
3707
3708   case TGSI_OPCODE_I2F:
3709      exec_vector_unary(mach, inst, micro_i2f, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_INT);
3710      break;
3711
3712   case TGSI_OPCODE_NOT:
3713      exec_vector_unary(mach, inst, micro_not, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3714      break;
3715
3716   case TGSI_OPCODE_TRUNC:
3717      exec_vector_unary(mach, inst, micro_trunc, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3718      break;
3719
3720   case TGSI_OPCODE_SHL:
3721      exec_vector_binary(mach, inst, micro_shl, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3722      break;
3723
3724   case TGSI_OPCODE_AND:
3725      exec_vector_binary(mach, inst, micro_and, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3726      break;
3727
3728   case TGSI_OPCODE_OR:
3729      exec_vector_binary(mach, inst, micro_or, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3730      break;
3731
3732   case TGSI_OPCODE_MOD:
3733      exec_vector_binary(mach, inst, micro_mod, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
3734      break;
3735
3736   case TGSI_OPCODE_XOR:
3737      exec_vector_binary(mach, inst, micro_xor, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3738      break;
3739
3740   case TGSI_OPCODE_SAD:
3741      assert (0);
3742      break;
3743
3744   case TGSI_OPCODE_TXF:
3745      assert (0);
3746      break;
3747
3748   case TGSI_OPCODE_TXQ:
3749      exec_txq(mach, inst);
3750      break;
3751
3752   case TGSI_OPCODE_EMIT:
3753      emit_vertex(mach);
3754      break;
3755
3756   case TGSI_OPCODE_ENDPRIM:
3757      emit_primitive(mach);
3758      break;
3759
3760   case TGSI_OPCODE_BGNLOOP:
3761      /* push LoopMask and ContMasks */
3762      assert(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
3763      assert(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
3764      assert(mach->LoopLabelStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
3765      assert(mach->BreakStackTop < TGSI_EXEC_MAX_BREAK_STACK);
3766
3767      mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask;
3768      mach->ContStack[mach->ContStackTop++] = mach->ContMask;
3769      mach->LoopLabelStack[mach->LoopLabelStackTop++] = *pc - 1;
3770      mach->BreakStack[mach->BreakStackTop++] = mach->BreakType;
3771      mach->BreakType = TGSI_EXEC_BREAK_INSIDE_LOOP;
3772      break;
3773
3774   case TGSI_OPCODE_ENDLOOP:
3775      /* Restore ContMask, but don't pop */
3776      assert(mach->ContStackTop > 0);
3777      mach->ContMask = mach->ContStack[mach->ContStackTop - 1];
3778      UPDATE_EXEC_MASK(mach);
3779      if (mach->ExecMask) {
3780         /* repeat loop: jump to instruction just past BGNLOOP */
3781         assert(mach->LoopLabelStackTop > 0);
3782         *pc = mach->LoopLabelStack[mach->LoopLabelStackTop - 1] + 1;
3783      }
3784      else {
3785         /* exit loop: pop LoopMask */
3786         assert(mach->LoopStackTop > 0);
3787         mach->LoopMask = mach->LoopStack[--mach->LoopStackTop];
3788         /* pop ContMask */
3789         assert(mach->ContStackTop > 0);
3790         mach->ContMask = mach->ContStack[--mach->ContStackTop];
3791         assert(mach->LoopLabelStackTop > 0);
3792         --mach->LoopLabelStackTop;
3793
3794         mach->BreakType = mach->BreakStack[--mach->BreakStackTop];
3795      }
3796      UPDATE_EXEC_MASK(mach);
3797      break;
3798
3799   case TGSI_OPCODE_BRK:
3800      exec_break(mach);
3801      break;
3802
3803   case TGSI_OPCODE_CONT:
3804      /* turn off cont channels for each enabled exec channel */
3805      mach->ContMask &= ~mach->ExecMask;
3806      /* Todo: if mach->LoopMask == 0, jump to end of loop */
3807      UPDATE_EXEC_MASK(mach);
3808      break;
3809
3810   case TGSI_OPCODE_BGNSUB:
3811      /* no-op */
3812      break;
3813
3814   case TGSI_OPCODE_ENDSUB:
3815      /*
3816       * XXX: This really should be a no-op. We should never reach this opcode.
3817       */
3818
3819      assert(mach->CallStackTop > 0);
3820      mach->CallStackTop--;
3821
3822      mach->CondStackTop = mach->CallStack[mach->CallStackTop].CondStackTop;
3823      mach->CondMask = mach->CondStack[mach->CondStackTop];
3824
3825      mach->LoopStackTop = mach->CallStack[mach->CallStackTop].LoopStackTop;
3826      mach->LoopMask = mach->LoopStack[mach->LoopStackTop];
3827
3828      mach->ContStackTop = mach->CallStack[mach->CallStackTop].ContStackTop;
3829      mach->ContMask = mach->ContStack[mach->ContStackTop];
3830
3831      mach->SwitchStackTop = mach->CallStack[mach->CallStackTop].SwitchStackTop;
3832      mach->Switch = mach->SwitchStack[mach->SwitchStackTop];
3833
3834      mach->BreakStackTop = mach->CallStack[mach->CallStackTop].BreakStackTop;
3835      mach->BreakType = mach->BreakStack[mach->BreakStackTop];
3836
3837      assert(mach->FuncStackTop > 0);
3838      mach->FuncMask = mach->FuncStack[--mach->FuncStackTop];
3839
3840      *pc = mach->CallStack[mach->CallStackTop].ReturnAddr;
3841
3842      UPDATE_EXEC_MASK(mach);
3843      break;
3844
3845   case TGSI_OPCODE_NOP:
3846      break;
3847
3848   case TGSI_OPCODE_BREAKC:
3849      FETCH(&r[0], 0, CHAN_X);
3850      /* update CondMask */
3851      if (r[0].u[0] && (mach->ExecMask & 0x1)) {
3852         mach->LoopMask &= ~0x1;
3853      }
3854      if (r[0].u[1] && (mach->ExecMask & 0x2)) {
3855         mach->LoopMask &= ~0x2;
3856      }
3857      if (r[0].u[2] && (mach->ExecMask & 0x4)) {
3858         mach->LoopMask &= ~0x4;
3859      }
3860      if (r[0].u[3] && (mach->ExecMask & 0x8)) {
3861         mach->LoopMask &= ~0x8;
3862      }
3863      /* Todo: if mach->LoopMask == 0, jump to end of loop */
3864      UPDATE_EXEC_MASK(mach);
3865      break;
3866
3867   case TGSI_OPCODE_F2I:
3868      exec_vector_unary(mach, inst, micro_f2i, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_FLOAT);
3869      break;
3870
3871   case TGSI_OPCODE_IDIV:
3872      exec_vector_binary(mach, inst, micro_idiv, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
3873      break;
3874
3875   case TGSI_OPCODE_IMAX:
3876      exec_vector_binary(mach, inst, micro_imax, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
3877      break;
3878
3879   case TGSI_OPCODE_IMIN:
3880      exec_vector_binary(mach, inst, micro_imin, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
3881      break;
3882
3883   case TGSI_OPCODE_INEG:
3884      exec_vector_unary(mach, inst, micro_ineg, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
3885      break;
3886
3887   case TGSI_OPCODE_ISGE:
3888      exec_vector_binary(mach, inst, micro_isge, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
3889      break;
3890
3891   case TGSI_OPCODE_ISHR:
3892      exec_vector_binary(mach, inst, micro_ishr, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
3893      break;
3894
3895   case TGSI_OPCODE_ISLT:
3896      exec_vector_binary(mach, inst, micro_islt, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
3897      break;
3898
3899   case TGSI_OPCODE_F2U:
3900      exec_vector_unary(mach, inst, micro_f2u, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_FLOAT);
3901      break;
3902
3903   case TGSI_OPCODE_U2F:
3904      exec_vector_unary(mach, inst, micro_u2f, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_UINT);
3905      break;
3906
3907   case TGSI_OPCODE_UADD:
3908      exec_vector_binary(mach, inst, micro_uadd, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3909      break;
3910
3911   case TGSI_OPCODE_UDIV:
3912      exec_vector_binary(mach, inst, micro_udiv, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3913      break;
3914
3915   case TGSI_OPCODE_UMAD:
3916      exec_vector_trinary(mach, inst, micro_umad, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3917      break;
3918
3919   case TGSI_OPCODE_UMAX:
3920      exec_vector_binary(mach, inst, micro_umax, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3921      break;
3922
3923   case TGSI_OPCODE_UMIN:
3924      exec_vector_binary(mach, inst, micro_umin, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3925      break;
3926
3927   case TGSI_OPCODE_UMOD:
3928      exec_vector_binary(mach, inst, micro_umod, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3929      break;
3930
3931   case TGSI_OPCODE_UMUL:
3932      exec_vector_binary(mach, inst, micro_umul, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3933      break;
3934
3935   case TGSI_OPCODE_USEQ:
3936      exec_vector_binary(mach, inst, micro_useq, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3937      break;
3938
3939   case TGSI_OPCODE_USGE:
3940      exec_vector_binary(mach, inst, micro_usge, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3941      break;
3942
3943   case TGSI_OPCODE_USHR:
3944      exec_vector_binary(mach, inst, micro_ushr, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3945      break;
3946
3947   case TGSI_OPCODE_USLT:
3948      exec_vector_binary(mach, inst, micro_uslt, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3949      break;
3950
3951   case TGSI_OPCODE_USNE:
3952      exec_vector_binary(mach, inst, micro_usne, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3953      break;
3954
3955   case TGSI_OPCODE_SWITCH:
3956      exec_switch(mach, inst);
3957      break;
3958
3959   case TGSI_OPCODE_CASE:
3960      exec_case(mach, inst);
3961      break;
3962
3963   case TGSI_OPCODE_DEFAULT:
3964      exec_default(mach);
3965      break;
3966
3967   case TGSI_OPCODE_ENDSWITCH:
3968      exec_endswitch(mach);
3969      break;
3970
3971   case TGSI_OPCODE_LOAD:
3972      assert(0);
3973      break;
3974
3975   case TGSI_OPCODE_LOAD_MS:
3976      assert(0);
3977      break;
3978
3979   case TGSI_OPCODE_SAMPLE:
3980      exec_sample(mach, inst, TEX_MODIFIER_NONE);
3981      break;
3982
3983   case TGSI_OPCODE_SAMPLE_B:
3984      exec_sample(mach, inst, TEX_MODIFIER_LOD_BIAS);
3985      break;
3986
3987   case TGSI_OPCODE_SAMPLE_C:
3988      exec_sample(mach, inst, TEX_MODIFIER_NONE);
3989      break;
3990
3991   case TGSI_OPCODE_SAMPLE_C_LZ:
3992      exec_sample(mach, inst, TEX_MODIFIER_LOD_BIAS);
3993      break;
3994
3995   case TGSI_OPCODE_SAMPLE_D:
3996      exec_sample_d(mach, inst);
3997      break;
3998
3999   case TGSI_OPCODE_SAMPLE_L:
4000      exec_sample(mach, inst, TEX_MODIFIER_EXPLICIT_LOD);
4001      break;
4002
4003   case TGSI_OPCODE_GATHER4:
4004      assert(0);
4005      break;
4006
4007   case TGSI_OPCODE_RESINFO:
4008      assert(0);
4009      break;
4010
4011   case TGSI_OPCODE_SAMPLE_POS:
4012      assert(0);
4013      break;
4014
4015   case TGSI_OPCODE_SAMPLE_INFO:
4016      assert(0);
4017      break;
4018
4019   default:
4020      assert( 0 );
4021   }
4022}
4023
4024
/* Set to 1 to make tgsi_exec_machine_run() dump each instruction as it is
 * executed and print every TEMP/OUT register whose contents changed.
 */
#define DEBUG_EXECUTION 0
4026
4027
4028/**
4029 * Run TGSI interpreter.
4030 * \return bitmask of "alive" quad components
4031 */
4032uint
4033tgsi_exec_machine_run( struct tgsi_exec_machine *mach )
4034{
4035   uint i;
4036   int pc = 0;
4037
4038   mach->CondMask = 0xf;
4039   mach->LoopMask = 0xf;
4040   mach->ContMask = 0xf;
4041   mach->FuncMask = 0xf;
4042   mach->ExecMask = 0xf;
4043
4044   mach->Switch.mask = 0xf;
4045
4046   assert(mach->CondStackTop == 0);
4047   assert(mach->LoopStackTop == 0);
4048   assert(mach->ContStackTop == 0);
4049   assert(mach->SwitchStackTop == 0);
4050   assert(mach->BreakStackTop == 0);
4051   assert(mach->CallStackTop == 0);
4052
4053   mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] = 0;
4054   mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] = 0;
4055
4056   if( mach->Processor == TGSI_PROCESSOR_GEOMETRY ) {
4057      mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0] = 0;
4058      mach->Primitives[0] = 0;
4059   }
4060
4061   /* execute declarations (interpolants) */
4062   for (i = 0; i < mach->NumDeclarations; i++) {
4063      exec_declaration( mach, mach->Declarations+i );
4064   }
4065
4066   {
4067#if DEBUG_EXECUTION
4068      struct tgsi_exec_vector temps[TGSI_EXEC_NUM_TEMPS + TGSI_EXEC_NUM_TEMP_EXTRAS];
4069      struct tgsi_exec_vector outputs[PIPE_MAX_ATTRIBS];
4070      uint inst = 1;
4071
4072      memcpy(temps, mach->Temps, sizeof(temps));
4073      memcpy(outputs, mach->Outputs, sizeof(outputs));
4074#endif
4075
4076      /* execute instructions, until pc is set to -1 */
4077      while (pc != -1) {
4078
4079#if DEBUG_EXECUTION
4080         uint i;
4081
4082         tgsi_dump_instruction(&mach->Instructions[pc], inst++);
4083#endif
4084
4085         assert(pc < (int) mach->NumInstructions);
4086         exec_instruction(mach, mach->Instructions + pc, &pc);
4087
4088#if DEBUG_EXECUTION
4089         for (i = 0; i < TGSI_EXEC_NUM_TEMPS + TGSI_EXEC_NUM_TEMP_EXTRAS; i++) {
4090            if (memcmp(&temps[i], &mach->Temps[i], sizeof(temps[i]))) {
4091               uint j;
4092
4093               memcpy(&temps[i], &mach->Temps[i], sizeof(temps[i]));
4094               debug_printf("TEMP[%2u] = ", i);
4095               for (j = 0; j < 4; j++) {
4096                  if (j > 0) {
4097                     debug_printf("           ");
4098                  }
4099                  debug_printf("(%6f %u, %6f %u, %6f %u, %6f %u)\n",
4100                               temps[i].xyzw[0].f[j], temps[i].xyzw[0].u[j],
4101                               temps[i].xyzw[1].f[j], temps[i].xyzw[1].u[j],
4102                               temps[i].xyzw[2].f[j], temps[i].xyzw[2].u[j],
4103                               temps[i].xyzw[3].f[j], temps[i].xyzw[3].u[j]);
4104               }
4105            }
4106         }
4107         for (i = 0; i < PIPE_MAX_ATTRIBS; i++) {
4108            if (memcmp(&outputs[i], &mach->Outputs[i], sizeof(outputs[i]))) {
4109               uint j;
4110
4111               memcpy(&outputs[i], &mach->Outputs[i], sizeof(outputs[i]));
4112               debug_printf("OUT[%2u] =  ", i);
4113               for (j = 0; j < 4; j++) {
4114                  if (j > 0) {
4115                     debug_printf("           ");
4116                  }
4117                  debug_printf("(%6f %u, %6f %u, %6f %u, %6f %u)\n",
4118                               outputs[i].xyzw[0].f[j], outputs[i].xyzw[0].u[j],
4119                               outputs[i].xyzw[1].f[j], outputs[i].xyzw[1].u[j],
4120                               outputs[i].xyzw[2].f[j], outputs[i].xyzw[2].u[j],
4121                               outputs[i].xyzw[3].f[j], outputs[i].xyzw[3].u[j]);
4122               }
4123            }
4124         }
4125#endif
4126      }
4127   }
4128
4129#if 0
4130   /* we scale from floats in [0,1] to Zbuffer ints in sp_quad_depth_test.c */
4131   if (mach->Processor == TGSI_PROCESSOR_FRAGMENT) {
4132      /*
4133       * Scale back depth component.
4134       */
4135      for (i = 0; i < 4; i++)
4136         mach->Outputs[0].xyzw[2].f[i] *= ctx->DrawBuffer->_DepthMaxF;
4137   }
4138#endif
4139
4140   /* Strictly speaking, these assertions aren't really needed but they
4141    * can potentially catch some bugs in the control flow code.
4142    */
4143   assert(mach->CondStackTop == 0);
4144   assert(mach->LoopStackTop == 0);
4145   assert(mach->ContStackTop == 0);
4146   assert(mach->SwitchStackTop == 0);
4147   assert(mach->BreakStackTop == 0);
4148   assert(mach->CallStackTop == 0);
4149
4150   return ~mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
4151}
4152