tgsi_exec.c revision 4bfe1c955fe679547c8a03119d1681e33593c768
1/**************************************************************************
2 *
3 * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas.
4 * All Rights Reserved.
5 * Copyright 2009-2010 VMware, Inc.  All rights Reserved.
6 *
7 * Permission is hereby granted, free of charge, to any person obtaining a
8 * copy of this software and associated documentation files (the
9 * "Software"), to deal in the Software without restriction, including
10 * without limitation the rights to use, copy, modify, merge, publish,
11 * distribute, sub license, and/or sell copies of the Software, and to
12 * permit persons to whom the Software is furnished to do so, subject to
13 * the following conditions:
14 *
15 * The above copyright notice and this permission notice (including the
16 * next paragraph) shall be included in all copies or substantial portions
17 * of the Software.
18 *
19 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
20 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
21 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
22 * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
23 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
24 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
25 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 *
27 **************************************************************************/
28
29/**
30 * TGSI interpreter/executor.
31 *
32 * Flow control information:
33 *
34 * Since we operate on 'quads' (4 pixels or 4 vertices in parallel)
35 * flow control statements (IF/ELSE/ENDIF, LOOP/ENDLOOP) require special
36 * care since a condition may be true for some quad components but false
37 * for other components.
38 *
39 * We basically execute all statements (even if they're in the part of
40 * an IF/ELSE clause that's "not taken") and use a special mask to
41 * control writing to destination registers.  This is the ExecMask.
42 * See store_dest().
43 *
 * The ExecMask is computed from several other masks (CondMask, LoopMask,
 * ContMask, Switch.mask and FuncMask -- see UPDATE_EXEC_MASK) which are
 * controlled by the flow control instructions (e.g. IF/ELSE/ENDIF,
 * LOOP/ENDLOOP and CONT).
47 *
48 *
49 * Authors:
50 *   Michal Krol
51 *   Brian Paul
52 */
53
54#include "pipe/p_compiler.h"
55#include "pipe/p_state.h"
56#include "pipe/p_shader_tokens.h"
57#include "tgsi/tgsi_dump.h"
58#include "tgsi/tgsi_parse.h"
59#include "tgsi/tgsi_util.h"
60#include "tgsi_exec.h"
61#include "util/u_memory.h"
62#include "util/u_math.h"
63
64
65#define FAST_MATH 1
66
67#define TILE_TOP_LEFT     0
68#define TILE_TOP_RIGHT    1
69#define TILE_BOTTOM_LEFT  2
70#define TILE_BOTTOM_RIGHT 3
71
72static void
73micro_abs(union tgsi_exec_channel *dst,
74          const union tgsi_exec_channel *src)
75{
76   dst->f[0] = fabsf(src->f[0]);
77   dst->f[1] = fabsf(src->f[1]);
78   dst->f[2] = fabsf(src->f[2]);
79   dst->f[3] = fabsf(src->f[3]);
80}
81
82static void
83micro_arl(union tgsi_exec_channel *dst,
84          const union tgsi_exec_channel *src)
85{
86   dst->i[0] = (int)floorf(src->f[0]);
87   dst->i[1] = (int)floorf(src->f[1]);
88   dst->i[2] = (int)floorf(src->f[2]);
89   dst->i[3] = (int)floorf(src->f[3]);
90}
91
92static void
93micro_arr(union tgsi_exec_channel *dst,
94          const union tgsi_exec_channel *src)
95{
96   dst->i[0] = (int)floorf(src->f[0] + 0.5f);
97   dst->i[1] = (int)floorf(src->f[1] + 0.5f);
98   dst->i[2] = (int)floorf(src->f[2] + 0.5f);
99   dst->i[3] = (int)floorf(src->f[3] + 0.5f);
100}
101
102static void
103micro_ceil(union tgsi_exec_channel *dst,
104           const union tgsi_exec_channel *src)
105{
106   dst->f[0] = ceilf(src->f[0]);
107   dst->f[1] = ceilf(src->f[1]);
108   dst->f[2] = ceilf(src->f[2]);
109   dst->f[3] = ceilf(src->f[3]);
110}
111
112static void
113micro_cos(union tgsi_exec_channel *dst,
114          const union tgsi_exec_channel *src)
115{
116   dst->f[0] = cosf(src->f[0]);
117   dst->f[1] = cosf(src->f[1]);
118   dst->f[2] = cosf(src->f[2]);
119   dst->f[3] = cosf(src->f[3]);
120}
121
122static void
123micro_ddx(union tgsi_exec_channel *dst,
124          const union tgsi_exec_channel *src)
125{
126   dst->f[0] =
127   dst->f[1] =
128   dst->f[2] =
129   dst->f[3] = src->f[TILE_BOTTOM_RIGHT] - src->f[TILE_BOTTOM_LEFT];
130}
131
132static void
133micro_ddy(union tgsi_exec_channel *dst,
134          const union tgsi_exec_channel *src)
135{
136   dst->f[0] =
137   dst->f[1] =
138   dst->f[2] =
139   dst->f[3] = src->f[TILE_BOTTOM_LEFT] - src->f[TILE_TOP_LEFT];
140}
141
142static void
143micro_exp2(union tgsi_exec_channel *dst,
144           const union tgsi_exec_channel *src)
145{
146#if FAST_MATH
147   dst->f[0] = util_fast_exp2(src->f[0]);
148   dst->f[1] = util_fast_exp2(src->f[1]);
149   dst->f[2] = util_fast_exp2(src->f[2]);
150   dst->f[3] = util_fast_exp2(src->f[3]);
151#else
152#if DEBUG
153   /* Inf is okay for this instruction, so clamp it to silence assertions. */
154   uint i;
155   union tgsi_exec_channel clamped;
156
157   for (i = 0; i < 4; i++) {
158      if (src->f[i] > 127.99999f) {
159         clamped.f[i] = 127.99999f;
160      } else if (src->f[i] < -126.99999f) {
161         clamped.f[i] = -126.99999f;
162      } else {
163         clamped.f[i] = src->f[i];
164      }
165   }
166   src = &clamped;
167#endif /* DEBUG */
168
169   dst->f[0] = powf(2.0f, src->f[0]);
170   dst->f[1] = powf(2.0f, src->f[1]);
171   dst->f[2] = powf(2.0f, src->f[2]);
172   dst->f[3] = powf(2.0f, src->f[3]);
173#endif /* FAST_MATH */
174}
175
176static void
177micro_flr(union tgsi_exec_channel *dst,
178          const union tgsi_exec_channel *src)
179{
180   dst->f[0] = floorf(src->f[0]);
181   dst->f[1] = floorf(src->f[1]);
182   dst->f[2] = floorf(src->f[2]);
183   dst->f[3] = floorf(src->f[3]);
184}
185
186static void
187micro_frc(union tgsi_exec_channel *dst,
188          const union tgsi_exec_channel *src)
189{
190   dst->f[0] = src->f[0] - floorf(src->f[0]);
191   dst->f[1] = src->f[1] - floorf(src->f[1]);
192   dst->f[2] = src->f[2] - floorf(src->f[2]);
193   dst->f[3] = src->f[3] - floorf(src->f[3]);
194}
195
196static void
197micro_iabs(union tgsi_exec_channel *dst,
198           const union tgsi_exec_channel *src)
199{
200   dst->i[0] = src->i[0] >= 0 ? src->i[0] : -src->i[0];
201   dst->i[1] = src->i[1] >= 0 ? src->i[1] : -src->i[1];
202   dst->i[2] = src->i[2] >= 0 ? src->i[2] : -src->i[2];
203   dst->i[3] = src->i[3] >= 0 ? src->i[3] : -src->i[3];
204}
205
206static void
207micro_ineg(union tgsi_exec_channel *dst,
208           const union tgsi_exec_channel *src)
209{
210   dst->i[0] = -src->i[0];
211   dst->i[1] = -src->i[1];
212   dst->i[2] = -src->i[2];
213   dst->i[3] = -src->i[3];
214}
215
216static void
217micro_lg2(union tgsi_exec_channel *dst,
218          const union tgsi_exec_channel *src)
219{
220#if FAST_MATH
221   dst->f[0] = util_fast_log2(src->f[0]);
222   dst->f[1] = util_fast_log2(src->f[1]);
223   dst->f[2] = util_fast_log2(src->f[2]);
224   dst->f[3] = util_fast_log2(src->f[3]);
225#else
226   dst->f[0] = logf(src->f[0]) * 1.442695f;
227   dst->f[1] = logf(src->f[1]) * 1.442695f;
228   dst->f[2] = logf(src->f[2]) * 1.442695f;
229   dst->f[3] = logf(src->f[3]) * 1.442695f;
230#endif
231}
232
233static void
234micro_lrp(union tgsi_exec_channel *dst,
235          const union tgsi_exec_channel *src)
236{
237   dst->f[0] = src[0].f[0] * (src[1].f[0] - src[2].f[0]) + src[2].f[0];
238   dst->f[1] = src[0].f[1] * (src[1].f[1] - src[2].f[1]) + src[2].f[1];
239   dst->f[2] = src[0].f[2] * (src[1].f[2] - src[2].f[2]) + src[2].f[2];
240   dst->f[3] = src[0].f[3] * (src[1].f[3] - src[2].f[3]) + src[2].f[3];
241}
242
243static void
244micro_mad(union tgsi_exec_channel *dst,
245          const union tgsi_exec_channel *src)
246{
247   dst->f[0] = src[0].f[0] * src[1].f[0] + src[2].f[0];
248   dst->f[1] = src[0].f[1] * src[1].f[1] + src[2].f[1];
249   dst->f[2] = src[0].f[2] * src[1].f[2] + src[2].f[2];
250   dst->f[3] = src[0].f[3] * src[1].f[3] + src[2].f[3];
251}
252
253static void
254micro_mov(union tgsi_exec_channel *dst,
255          const union tgsi_exec_channel *src)
256{
257   dst->u[0] = src->u[0];
258   dst->u[1] = src->u[1];
259   dst->u[2] = src->u[2];
260   dst->u[3] = src->u[3];
261}
262
263static void
264micro_rcp(union tgsi_exec_channel *dst,
265          const union tgsi_exec_channel *src)
266{
267   dst->f[0] = 1.0f / src->f[0];
268   dst->f[1] = 1.0f / src->f[1];
269   dst->f[2] = 1.0f / src->f[2];
270   dst->f[3] = 1.0f / src->f[3];
271}
272
273static void
274micro_rnd(union tgsi_exec_channel *dst,
275          const union tgsi_exec_channel *src)
276{
277   dst->f[0] = floorf(src->f[0] + 0.5f);
278   dst->f[1] = floorf(src->f[1] + 0.5f);
279   dst->f[2] = floorf(src->f[2] + 0.5f);
280   dst->f[3] = floorf(src->f[3] + 0.5f);
281}
282
283static void
284micro_rsq(union tgsi_exec_channel *dst,
285          const union tgsi_exec_channel *src)
286{
287   dst->f[0] = 1.0f / sqrtf(fabsf(src->f[0]));
288   dst->f[1] = 1.0f / sqrtf(fabsf(src->f[1]));
289   dst->f[2] = 1.0f / sqrtf(fabsf(src->f[2]));
290   dst->f[3] = 1.0f / sqrtf(fabsf(src->f[3]));
291}
292
293static void
294micro_seq(union tgsi_exec_channel *dst,
295          const union tgsi_exec_channel *src)
296{
297   dst->f[0] = src[0].f[0] == src[1].f[0] ? 1.0f : 0.0f;
298   dst->f[1] = src[0].f[1] == src[1].f[1] ? 1.0f : 0.0f;
299   dst->f[2] = src[0].f[2] == src[1].f[2] ? 1.0f : 0.0f;
300   dst->f[3] = src[0].f[3] == src[1].f[3] ? 1.0f : 0.0f;
301}
302
303static void
304micro_sge(union tgsi_exec_channel *dst,
305          const union tgsi_exec_channel *src)
306{
307   dst->f[0] = src[0].f[0] >= src[1].f[0] ? 1.0f : 0.0f;
308   dst->f[1] = src[0].f[1] >= src[1].f[1] ? 1.0f : 0.0f;
309   dst->f[2] = src[0].f[2] >= src[1].f[2] ? 1.0f : 0.0f;
310   dst->f[3] = src[0].f[3] >= src[1].f[3] ? 1.0f : 0.0f;
311}
312
313static void
314micro_sgn(union tgsi_exec_channel *dst,
315          const union tgsi_exec_channel *src)
316{
317   dst->f[0] = src->f[0] < 0.0f ? -1.0f : src->f[0] > 0.0f ? 1.0f : 0.0f;
318   dst->f[1] = src->f[1] < 0.0f ? -1.0f : src->f[1] > 0.0f ? 1.0f : 0.0f;
319   dst->f[2] = src->f[2] < 0.0f ? -1.0f : src->f[2] > 0.0f ? 1.0f : 0.0f;
320   dst->f[3] = src->f[3] < 0.0f ? -1.0f : src->f[3] > 0.0f ? 1.0f : 0.0f;
321}
322
323static void
324micro_sgt(union tgsi_exec_channel *dst,
325          const union tgsi_exec_channel *src)
326{
327   dst->f[0] = src[0].f[0] > src[1].f[0] ? 1.0f : 0.0f;
328   dst->f[1] = src[0].f[1] > src[1].f[1] ? 1.0f : 0.0f;
329   dst->f[2] = src[0].f[2] > src[1].f[2] ? 1.0f : 0.0f;
330   dst->f[3] = src[0].f[3] > src[1].f[3] ? 1.0f : 0.0f;
331}
332
333static void
334micro_sin(union tgsi_exec_channel *dst,
335          const union tgsi_exec_channel *src)
336{
337   dst->f[0] = sinf(src->f[0]);
338   dst->f[1] = sinf(src->f[1]);
339   dst->f[2] = sinf(src->f[2]);
340   dst->f[3] = sinf(src->f[3]);
341}
342
343static void
344micro_sle(union tgsi_exec_channel *dst,
345          const union tgsi_exec_channel *src)
346{
347   dst->f[0] = src[0].f[0] <= src[1].f[0] ? 1.0f : 0.0f;
348   dst->f[1] = src[0].f[1] <= src[1].f[1] ? 1.0f : 0.0f;
349   dst->f[2] = src[0].f[2] <= src[1].f[2] ? 1.0f : 0.0f;
350   dst->f[3] = src[0].f[3] <= src[1].f[3] ? 1.0f : 0.0f;
351}
352
353static void
354micro_slt(union tgsi_exec_channel *dst,
355          const union tgsi_exec_channel *src)
356{
357   dst->f[0] = src[0].f[0] < src[1].f[0] ? 1.0f : 0.0f;
358   dst->f[1] = src[0].f[1] < src[1].f[1] ? 1.0f : 0.0f;
359   dst->f[2] = src[0].f[2] < src[1].f[2] ? 1.0f : 0.0f;
360   dst->f[3] = src[0].f[3] < src[1].f[3] ? 1.0f : 0.0f;
361}
362
363static void
364micro_sne(union tgsi_exec_channel *dst,
365          const union tgsi_exec_channel *src)
366{
367   dst->f[0] = src[0].f[0] != src[1].f[0] ? 1.0f : 0.0f;
368   dst->f[1] = src[0].f[1] != src[1].f[1] ? 1.0f : 0.0f;
369   dst->f[2] = src[0].f[2] != src[1].f[2] ? 1.0f : 0.0f;
370   dst->f[3] = src[0].f[3] != src[1].f[3] ? 1.0f : 0.0f;
371}
372
373static void
374micro_trunc(union tgsi_exec_channel *dst,
375            const union tgsi_exec_channel *src)
376{
377   dst->f[0] = (float)(int)src->f[0];
378   dst->f[1] = (float)(int)src->f[1];
379   dst->f[2] = (float)(int)src->f[2];
380   dst->f[3] = (float)(int)src->f[3];
381}
382
383
/* Indices of the X, Y, Z and W channels. */
#define CHAN_X 0
#define CHAN_Y 1
#define CHAN_Z 2
#define CHAN_W 3

/* How the contents of a channel are to be interpreted. */
enum tgsi_exec_datatype {
   TGSI_EXEC_DATA_FLOAT = 0,
   TGSI_EXEC_DATA_INT   = 1,
   TGSI_EXEC_DATA_UINT  = 2
};
394
/*
 * Short aliases for the utility registers declared by the
 * TGSI_EXEC_TEMP_* names.  For each register, the _I alias is its
 * index and the _C alias is the channel within that register.
 */

/* Bit-pattern constants. */
#define TEMP_0_I           TGSI_EXEC_TEMP_00000000_I
#define TEMP_0_C           TGSI_EXEC_TEMP_00000000_C
#define TEMP_7F_I          TGSI_EXEC_TEMP_7FFFFFFF_I
#define TEMP_7F_C          TGSI_EXEC_TEMP_7FFFFFFF_C
#define TEMP_80_I          TGSI_EXEC_TEMP_80000000_I
#define TEMP_80_C          TGSI_EXEC_TEMP_80000000_C
#define TEMP_FF_I          TGSI_EXEC_TEMP_FFFFFFFF_I
#define TEMP_FF_C          TGSI_EXEC_TEMP_FFFFFFFF_C

/* Small numeric constants. */
#define TEMP_1_I           TGSI_EXEC_TEMP_ONE_I
#define TEMP_1_C           TGSI_EXEC_TEMP_ONE_C
#define TEMP_2_I           TGSI_EXEC_TEMP_TWO_I
#define TEMP_2_C           TGSI_EXEC_TEMP_TWO_C
#define TEMP_128_I         TGSI_EXEC_TEMP_128_I
#define TEMP_128_C         TGSI_EXEC_TEMP_128_C
#define TEMP_M128_I        TGSI_EXEC_TEMP_MINUS_128_I
#define TEMP_M128_C        TGSI_EXEC_TEMP_MINUS_128_C

/* Interpreter bookkeeping registers. */
#define TEMP_KILMASK_I     TGSI_EXEC_TEMP_KILMASK_I
#define TEMP_KILMASK_C     TGSI_EXEC_TEMP_KILMASK_C
#define TEMP_OUTPUT_I      TGSI_EXEC_TEMP_OUTPUT_I
#define TEMP_OUTPUT_C      TGSI_EXEC_TEMP_OUTPUT_C
#define TEMP_PRIMITIVE_I   TGSI_EXEC_TEMP_PRIMITIVE_I
#define TEMP_PRIMITIVE_C   TGSI_EXEC_TEMP_PRIMITIVE_C
#define TEMP_CC_I          TGSI_EXEC_TEMP_CC_I
#define TEMP_CC_C          TGSI_EXEC_TEMP_CC_C

/* More numeric constants. */
#define TEMP_3_I           TGSI_EXEC_TEMP_THREE_I
#define TEMP_3_C           TGSI_EXEC_TEMP_THREE_C
#define TEMP_HALF_I        TGSI_EXEC_TEMP_HALF_I
#define TEMP_HALF_C        TGSI_EXEC_TEMP_HALF_C

/* Whole-register aliases (no separate channel part). */
#define TEMP_R0            TGSI_EXEC_TEMP_R0
#define TEMP_P0            TGSI_EXEC_TEMP_P0
428
/* Non-zero when channel CHAN is set in the write mask of destination 0. */
#define IS_CHANNEL_ENABLED(INST, CHAN) \
   ((INST).Dst[0].Register.WriteMask & (1 << (CHAN)))

/* Non-zero when channel CHAN is set in the write mask of destination 1. */
#define IS_CHANNEL_ENABLED2(INST, CHAN) \
   ((INST).Dst[1].Register.WriteMask & (1 << (CHAN)))

/*
 * Loop header: runs the following statement once for each channel that
 * is enabled in destination 0.  The expansion ends in a bare "if", so
 * an "else" placed directly after the loop body would bind to it.
 */
#define FOR_EACH_ENABLED_CHANNEL(INST, CHAN) \
   for (CHAN = 0; CHAN < NUM_CHANNELS; CHAN++) \
      if (IS_CHANNEL_ENABLED( INST, CHAN ))

/* Same as FOR_EACH_ENABLED_CHANNEL, but for destination 1. */
#define FOR_EACH_ENABLED_CHANNEL2(INST, CHAN) \
   for (CHAN = 0; CHAN < NUM_CHANNELS; CHAN++) \
      if (IS_CHANNEL_ENABLED2( INST, CHAN ))
442
443
/**
 * Recompute the execution mask as the bitwise AND of the condition,
 * loop, continue, switch and function masks.
 *
 * MACH is a pointer to the machine state.  The argument and the whole
 * expansion are parenthesized so that any pointer expression can be
 * passed and the macro can be used safely inside larger expressions.
 */
#define UPDATE_EXEC_MASK(MACH) \
      ((MACH)->ExecMask = (MACH)->CondMask & (MACH)->LoopMask & \
                          (MACH)->ContMask & (MACH)->Switch.mask & \
                          (MACH)->FuncMask)
447
448
449static const union tgsi_exec_channel ZeroVec =
450   { { 0.0, 0.0, 0.0, 0.0 } };
451
452
/*
 * Debugging aid: assert that none of the four float components of the
 * channel pointed to by chan is an infinity or a NaN.
 */
#define CHECK_INF_OR_NAN(chan) \
   do { \
      assert(!util_is_inf_or_nan((chan)->f[0])); \
      assert(!util_is_inf_or_nan((chan)->f[1])); \
      assert(!util_is_inf_or_nan((chan)->f[2])); \
      assert(!util_is_inf_or_nan((chan)->f[3])); \
   } while (0)
459
460
#ifdef DEBUG
/**
 * Debug helper: print the four float components of a channel, prefixed
 * by the given label.
 */
static void
print_chan(const char *msg, const union tgsi_exec_channel *chan)
{
   const float *v = chan->f;

   debug_printf("%s = {%f, %f, %f, %f}\n", msg, v[0], v[1], v[2], v[3]);
}
#endif
469
470
#ifdef DEBUG
/**
 * Debug helper: print temporary register number 'index', one line per
 * X/Y/Z/W channel with its four float components.
 */
static void
print_temp(const struct tgsi_exec_machine *mach, uint index)
{
   static const char chan_names[] = "XYZW";
   const struct tgsi_exec_vector *vec = &mach->Temps[index];
   int c;

   debug_printf("Temp[%u] =\n", index);
   for (c = 0; c < 4; c++) {
      const float *v = vec->xyzw[c].f;

      debug_printf("  %c: { %f, %f, %f, %f }\n",
                   chan_names[c], v[0], v[1], v[2], v[3]);
   }
}
#endif
488
489
490/**
491 * Check if there's a potential src/dst register data dependency when
492 * using SOA execution.
493 * Example:
494 *   MOV T, T.yxwz;
495 * This would expand into:
496 *   MOV t0, t1;
497 *   MOV t1, t0;
498 *   MOV t2, t3;
499 *   MOV t3, t2;
500 * The second instruction will have the wrong value for t0 if executed as-is.
501 */
502boolean
503tgsi_check_soa_dependencies(const struct tgsi_full_instruction *inst)
504{
505   uint i, chan;
506
507   uint writemask = inst->Dst[0].Register.WriteMask;
508   if (writemask == TGSI_WRITEMASK_X ||
509       writemask == TGSI_WRITEMASK_Y ||
510       writemask == TGSI_WRITEMASK_Z ||
511       writemask == TGSI_WRITEMASK_W ||
512       writemask == TGSI_WRITEMASK_NONE) {
513      /* no chance of data dependency */
514      return FALSE;
515   }
516
517   /* loop over src regs */
518   for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
519      if ((inst->Src[i].Register.File ==
520           inst->Dst[0].Register.File) &&
521          (inst->Src[i].Register.Index ==
522           inst->Dst[0].Register.Index)) {
523         /* loop over dest channels */
524         uint channelsWritten = 0x0;
525         FOR_EACH_ENABLED_CHANNEL(*inst, chan) {
526            /* check if we're reading a channel that's been written */
527            uint swizzle = tgsi_util_get_full_src_register_swizzle(&inst->Src[i], chan);
528            if (channelsWritten & (1 << swizzle)) {
529               return TRUE;
530            }
531
532            channelsWritten |= (1 << chan);
533         }
534      }
535   }
536   return FALSE;
537}
538
539
540/**
541 * Initialize machine state by expanding tokens to full instructions,
542 * allocating temporary storage, setting up constants, etc.
543 * After this, we can call tgsi_exec_machine_run() many times.
544 */
545void
546tgsi_exec_machine_bind_shader(
547   struct tgsi_exec_machine *mach,
548   const struct tgsi_token *tokens,
549   uint numSamplers,
550   struct tgsi_sampler **samplers)
551{
552   uint k;
553   struct tgsi_parse_context parse;
554   struct tgsi_exec_labels *labels = &mach->Labels;
555   struct tgsi_full_instruction *instructions;
556   struct tgsi_full_declaration *declarations;
557   uint maxInstructions = 10, numInstructions = 0;
558   uint maxDeclarations = 10, numDeclarations = 0;
559   uint instno = 0;
560
561#if 0
562   tgsi_dump(tokens, 0);
563#endif
564
565   util_init_math();
566
567   mach->Tokens = tokens;
568   mach->Samplers = samplers;
569
570   k = tgsi_parse_init (&parse, mach->Tokens);
571   if (k != TGSI_PARSE_OK) {
572      debug_printf( "Problem parsing!\n" );
573      return;
574   }
575
576   mach->Processor = parse.FullHeader.Processor.Processor;
577   mach->ImmLimit = 0;
578   labels->count = 0;
579
580   declarations = (struct tgsi_full_declaration *)
581      MALLOC( maxDeclarations * sizeof(struct tgsi_full_declaration) );
582
583   if (!declarations) {
584      return;
585   }
586
587   instructions = (struct tgsi_full_instruction *)
588      MALLOC( maxInstructions * sizeof(struct tgsi_full_instruction) );
589
590   if (!instructions) {
591      FREE( declarations );
592      return;
593   }
594
595   while( !tgsi_parse_end_of_tokens( &parse ) ) {
596      uint pointer = parse.Position;
597      uint i;
598
599      tgsi_parse_token( &parse );
600      switch( parse.FullToken.Token.Type ) {
601      case TGSI_TOKEN_TYPE_DECLARATION:
602         /* save expanded declaration */
603         if (numDeclarations == maxDeclarations) {
604            declarations = REALLOC(declarations,
605                                   maxDeclarations
606                                   * sizeof(struct tgsi_full_declaration),
607                                   (maxDeclarations + 10)
608                                   * sizeof(struct tgsi_full_declaration));
609            maxDeclarations += 10;
610         }
611         if (parse.FullToken.FullDeclaration.Declaration.File == TGSI_FILE_OUTPUT) {
612            unsigned reg;
613            for (reg = parse.FullToken.FullDeclaration.Range.First;
614                 reg <= parse.FullToken.FullDeclaration.Range.Last;
615                 ++reg) {
616               ++mach->NumOutputs;
617            }
618         }
619         memcpy(declarations + numDeclarations,
620                &parse.FullToken.FullDeclaration,
621                sizeof(declarations[0]));
622         numDeclarations++;
623         break;
624
625      case TGSI_TOKEN_TYPE_IMMEDIATE:
626         {
627            uint size = parse.FullToken.FullImmediate.Immediate.NrTokens - 1;
628            assert( size <= 4 );
629            assert( mach->ImmLimit + 1 <= TGSI_EXEC_NUM_IMMEDIATES );
630
631            for( i = 0; i < size; i++ ) {
632               mach->Imms[mach->ImmLimit][i] =
633		  parse.FullToken.FullImmediate.u[i].Float;
634            }
635            mach->ImmLimit += 1;
636         }
637         break;
638
639      case TGSI_TOKEN_TYPE_INSTRUCTION:
640         assert( labels->count < MAX_LABELS );
641
642         labels->labels[labels->count][0] = instno;
643         labels->labels[labels->count][1] = pointer;
644         labels->count++;
645
646         /* save expanded instruction */
647         if (numInstructions == maxInstructions) {
648            instructions = REALLOC(instructions,
649                                   maxInstructions
650                                   * sizeof(struct tgsi_full_instruction),
651                                   (maxInstructions + 10)
652                                   * sizeof(struct tgsi_full_instruction));
653            maxInstructions += 10;
654         }
655
656         memcpy(instructions + numInstructions,
657                &parse.FullToken.FullInstruction,
658                sizeof(instructions[0]));
659
660         numInstructions++;
661         break;
662
663      case TGSI_TOKEN_TYPE_PROPERTY:
664         break;
665
666      default:
667         assert( 0 );
668      }
669   }
670   tgsi_parse_free (&parse);
671
672   if (mach->Declarations) {
673      FREE( mach->Declarations );
674   }
675   mach->Declarations = declarations;
676   mach->NumDeclarations = numDeclarations;
677
678   if (mach->Instructions) {
679      FREE( mach->Instructions );
680   }
681   mach->Instructions = instructions;
682   mach->NumInstructions = numInstructions;
683}
684
685
686struct tgsi_exec_machine *
687tgsi_exec_machine_create( void )
688{
689   struct tgsi_exec_machine *mach;
690   uint i;
691
692   mach = align_malloc( sizeof *mach, 16 );
693   if (!mach)
694      goto fail;
695
696   memset(mach, 0, sizeof(*mach));
697
698   mach->Addrs = &mach->Temps[TGSI_EXEC_TEMP_ADDR];
699   mach->MaxGeometryShaderOutputs = TGSI_MAX_TOTAL_VERTICES;
700   mach->Predicates = &mach->Temps[TGSI_EXEC_TEMP_P0];
701
702   /* Setup constants. */
703   for( i = 0; i < 4; i++ ) {
704      mach->Temps[TEMP_0_I].xyzw[TEMP_0_C].u[i] = 0x00000000;
705      mach->Temps[TEMP_7F_I].xyzw[TEMP_7F_C].u[i] = 0x7FFFFFFF;
706      mach->Temps[TEMP_80_I].xyzw[TEMP_80_C].u[i] = 0x80000000;
707      mach->Temps[TEMP_FF_I].xyzw[TEMP_FF_C].u[i] = 0xFFFFFFFF;
708      mach->Temps[TEMP_1_I].xyzw[TEMP_1_C].f[i] = 1.0f;
709      mach->Temps[TEMP_2_I].xyzw[TEMP_2_C].f[i] = 2.0f;
710      mach->Temps[TEMP_128_I].xyzw[TEMP_128_C].f[i] = 128.0f;
711      mach->Temps[TEMP_M128_I].xyzw[TEMP_M128_C].f[i] = -128.0f;
712      mach->Temps[TEMP_3_I].xyzw[TEMP_3_C].f[i] = 3.0f;
713      mach->Temps[TEMP_HALF_I].xyzw[TEMP_HALF_C].f[i] = 0.5f;
714   }
715
716#ifdef DEBUG
717   /* silence warnings */
718   (void) print_chan;
719   (void) print_temp;
720#endif
721
722   return mach;
723
724fail:
725   align_free(mach);
726   return NULL;
727}
728
729
730void
731tgsi_exec_machine_destroy(struct tgsi_exec_machine *mach)
732{
733   if (mach) {
734      FREE(mach->Instructions);
735      FREE(mach->Declarations);
736   }
737
738   align_free(mach);
739}
740
741static void
742micro_add(
743   union tgsi_exec_channel *dst,
744   const union tgsi_exec_channel *src0,
745   const union tgsi_exec_channel *src1 )
746{
747   dst->f[0] = src0->f[0] + src1->f[0];
748   dst->f[1] = src0->f[1] + src1->f[1];
749   dst->f[2] = src0->f[2] + src1->f[2];
750   dst->f[3] = src0->f[3] + src1->f[3];
751}
752
753static void
754micro_div(
755   union tgsi_exec_channel *dst,
756   const union tgsi_exec_channel *src0,
757   const union tgsi_exec_channel *src1 )
758{
759   if (src1->f[0] != 0) {
760      dst->f[0] = src0->f[0] / src1->f[0];
761   }
762   if (src1->f[1] != 0) {
763      dst->f[1] = src0->f[1] / src1->f[1];
764   }
765   if (src1->f[2] != 0) {
766      dst->f[2] = src0->f[2] / src1->f[2];
767   }
768   if (src1->f[3] != 0) {
769      dst->f[3] = src0->f[3] / src1->f[3];
770   }
771}
772
773static void
774micro_float_clamp(union tgsi_exec_channel *dst,
775                  const union tgsi_exec_channel *src)
776{
777   uint i;
778
779   for (i = 0; i < 4; i++) {
780      if (src->f[i] > 0.0f) {
781         if (src->f[i] > 1.884467e+019f)
782            dst->f[i] = 1.884467e+019f;
783         else if (src->f[i] < 5.42101e-020f)
784            dst->f[i] = 5.42101e-020f;
785         else
786            dst->f[i] = src->f[i];
787      }
788      else {
789         if (src->f[i] < -1.884467e+019f)
790            dst->f[i] = -1.884467e+019f;
791         else if (src->f[i] > -5.42101e-020f)
792            dst->f[i] = -5.42101e-020f;
793         else
794            dst->f[i] = src->f[i];
795      }
796   }
797}
798
799static void
800micro_lt(
801   union tgsi_exec_channel *dst,
802   const union tgsi_exec_channel *src0,
803   const union tgsi_exec_channel *src1,
804   const union tgsi_exec_channel *src2,
805   const union tgsi_exec_channel *src3 )
806{
807   dst->f[0] = src0->f[0] < src1->f[0] ? src2->f[0] : src3->f[0];
808   dst->f[1] = src0->f[1] < src1->f[1] ? src2->f[1] : src3->f[1];
809   dst->f[2] = src0->f[2] < src1->f[2] ? src2->f[2] : src3->f[2];
810   dst->f[3] = src0->f[3] < src1->f[3] ? src2->f[3] : src3->f[3];
811}
812
813static void
814micro_max(
815   union tgsi_exec_channel *dst,
816   const union tgsi_exec_channel *src0,
817   const union tgsi_exec_channel *src1 )
818{
819   dst->f[0] = src0->f[0] > src1->f[0] ? src0->f[0] : src1->f[0];
820   dst->f[1] = src0->f[1] > src1->f[1] ? src0->f[1] : src1->f[1];
821   dst->f[2] = src0->f[2] > src1->f[2] ? src0->f[2] : src1->f[2];
822   dst->f[3] = src0->f[3] > src1->f[3] ? src0->f[3] : src1->f[3];
823}
824
825static void
826micro_min(
827   union tgsi_exec_channel *dst,
828   const union tgsi_exec_channel *src0,
829   const union tgsi_exec_channel *src1 )
830{
831   dst->f[0] = src0->f[0] < src1->f[0] ? src0->f[0] : src1->f[0];
832   dst->f[1] = src0->f[1] < src1->f[1] ? src0->f[1] : src1->f[1];
833   dst->f[2] = src0->f[2] < src1->f[2] ? src0->f[2] : src1->f[2];
834   dst->f[3] = src0->f[3] < src1->f[3] ? src0->f[3] : src1->f[3];
835}
836
837static void
838micro_mul(
839   union tgsi_exec_channel *dst,
840   const union tgsi_exec_channel *src0,
841   const union tgsi_exec_channel *src1 )
842{
843   dst->f[0] = src0->f[0] * src1->f[0];
844   dst->f[1] = src0->f[1] * src1->f[1];
845   dst->f[2] = src0->f[2] * src1->f[2];
846   dst->f[3] = src0->f[3] * src1->f[3];
847}
848
#if 0
/**
 * (Disabled.)  Signed integer multiply with a two-channel result:
 * dst1 receives the per-component product of src0 and src1 and dst0 is
 * cleared to zero.
 */
static void
micro_imul64(
   union tgsi_exec_channel *dst0,
   union tgsi_exec_channel *dst1,
   const union tgsi_exec_channel *src0,
   const union tgsi_exec_channel *src1 )
{
   int lane;

   /* all products first, then the clear, as two separate passes */
   for (lane = 0; lane < 4; lane++) {
      dst1->i[lane] = src0->i[lane] * src1->i[lane];
   }
   for (lane = 0; lane < 4; lane++) {
      dst0->i[lane] = 0;
   }
}
#endif
867
#if 0
/**
 * (Disabled.)  Unsigned integer multiply with a two-channel result:
 * dst1 receives the per-component product of src0 and src1 and dst0 is
 * cleared to zero.
 */
static void
micro_umul64(
   union tgsi_exec_channel *dst0,
   union tgsi_exec_channel *dst1,
   const union tgsi_exec_channel *src0,
   const union tgsi_exec_channel *src1 )
{
   int lane;

   /* all products first, then the clear, as two separate passes */
   for (lane = 0; lane < 4; lane++) {
      dst1->u[lane] = src0->u[lane] * src1->u[lane];
   }
   for (lane = 0; lane < 4; lane++) {
      dst0->u[lane] = 0;
   }
}
#endif
886
887
#if 0
/**
 * (Disabled.)  Conditional move on the unsigned view, per component:
 *    dst = src0 ? src1 : src2
 * where any non-zero src0 value selects src1.
 */
static void
micro_movc(
   union tgsi_exec_channel *dst,
   const union tgsi_exec_channel *src0,
   const union tgsi_exec_channel *src1,
   const union tgsi_exec_channel *src2 )
{
   int lane;

   for (lane = 0; lane < 4; lane++) {
      if (src0->u[lane]) {
         dst->u[lane] = src1->u[lane];
      }
      else {
         dst->u[lane] = src2->u[lane];
      }
   }
}
#endif
902
903static void
904micro_neg(
905   union tgsi_exec_channel *dst,
906   const union tgsi_exec_channel *src )
907{
908   dst->f[0] = -src->f[0];
909   dst->f[1] = -src->f[1];
910   dst->f[2] = -src->f[2];
911   dst->f[3] = -src->f[3];
912}
913
914static void
915micro_pow(
916   union tgsi_exec_channel *dst,
917   const union tgsi_exec_channel *src0,
918   const union tgsi_exec_channel *src1 )
919{
920#if FAST_MATH
921   dst->f[0] = util_fast_pow( src0->f[0], src1->f[0] );
922   dst->f[1] = util_fast_pow( src0->f[1], src1->f[1] );
923   dst->f[2] = util_fast_pow( src0->f[2], src1->f[2] );
924   dst->f[3] = util_fast_pow( src0->f[3], src1->f[3] );
925#else
926   dst->f[0] = powf( src0->f[0], src1->f[0] );
927   dst->f[1] = powf( src0->f[1], src1->f[1] );
928   dst->f[2] = powf( src0->f[2], src1->f[2] );
929   dst->f[3] = powf( src0->f[3], src1->f[3] );
930#endif
931}
932
933static void
934micro_sqrt( union tgsi_exec_channel *dst,
935            const union tgsi_exec_channel *src )
936{
937   dst->f[0] = sqrtf( src->f[0] );
938   dst->f[1] = sqrtf( src->f[1] );
939   dst->f[2] = sqrtf( src->f[2] );
940   dst->f[3] = sqrtf( src->f[3] );
941}
942
943static void
944micro_sub(
945   union tgsi_exec_channel *dst,
946   const union tgsi_exec_channel *src0,
947   const union tgsi_exec_channel *src1 )
948{
949   dst->f[0] = src0->f[0] - src1->f[0];
950   dst->f[1] = src0->f[1] - src1->f[1];
951   dst->f[2] = src0->f[2] - src1->f[2];
952   dst->f[3] = src0->f[3] - src1->f[3];
953}
954
955static void
956fetch_src_file_channel(
957   const struct tgsi_exec_machine *mach,
958   const uint file,
959   const uint swizzle,
960   const union tgsi_exec_channel *index,
961   union tgsi_exec_channel *chan )
962{
963   switch( swizzle ) {
964   case TGSI_SWIZZLE_X:
965   case TGSI_SWIZZLE_Y:
966   case TGSI_SWIZZLE_Z:
967   case TGSI_SWIZZLE_W:
968      switch( file ) {
969      case TGSI_FILE_CONSTANT:
970         assert(mach->Consts);
971         if (index->i[0] < 0)
972            chan->f[0] = 0.0f;
973         else
974            chan->f[0] = mach->Consts[index->i[0]][swizzle];
975         if (index->i[1] < 0)
976            chan->f[1] = 0.0f;
977         else
978            chan->f[1] = mach->Consts[index->i[1]][swizzle];
979         if (index->i[2] < 0)
980            chan->f[2] = 0.0f;
981         else
982            chan->f[2] = mach->Consts[index->i[2]][swizzle];
983         if (index->i[3] < 0)
984            chan->f[3] = 0.0f;
985         else
986            chan->f[3] = mach->Consts[index->i[3]][swizzle];
987         break;
988
989      case TGSI_FILE_INPUT:
990      case TGSI_FILE_SYSTEM_VALUE:
991         chan->u[0] = mach->Inputs[index->i[0]].xyzw[swizzle].u[0];
992         chan->u[1] = mach->Inputs[index->i[1]].xyzw[swizzle].u[1];
993         chan->u[2] = mach->Inputs[index->i[2]].xyzw[swizzle].u[2];
994         chan->u[3] = mach->Inputs[index->i[3]].xyzw[swizzle].u[3];
995         break;
996
997      case TGSI_FILE_TEMPORARY:
998         assert(index->i[0] < TGSI_EXEC_NUM_TEMPS);
999         chan->u[0] = mach->Temps[index->i[0]].xyzw[swizzle].u[0];
1000         chan->u[1] = mach->Temps[index->i[1]].xyzw[swizzle].u[1];
1001         chan->u[2] = mach->Temps[index->i[2]].xyzw[swizzle].u[2];
1002         chan->u[3] = mach->Temps[index->i[3]].xyzw[swizzle].u[3];
1003         break;
1004
1005      case TGSI_FILE_IMMEDIATE:
1006         assert( index->i[0] < (int) mach->ImmLimit );
1007         chan->f[0] = mach->Imms[index->i[0]][swizzle];
1008         assert( index->i[1] < (int) mach->ImmLimit );
1009         chan->f[1] = mach->Imms[index->i[1]][swizzle];
1010         assert( index->i[2] < (int) mach->ImmLimit );
1011         chan->f[2] = mach->Imms[index->i[2]][swizzle];
1012         assert( index->i[3] < (int) mach->ImmLimit );
1013         chan->f[3] = mach->Imms[index->i[3]][swizzle];
1014         break;
1015
1016      case TGSI_FILE_ADDRESS:
1017         chan->u[0] = mach->Addrs[index->i[0]].xyzw[swizzle].u[0];
1018         chan->u[1] = mach->Addrs[index->i[1]].xyzw[swizzle].u[1];
1019         chan->u[2] = mach->Addrs[index->i[2]].xyzw[swizzle].u[2];
1020         chan->u[3] = mach->Addrs[index->i[3]].xyzw[swizzle].u[3];
1021         break;
1022
1023      case TGSI_FILE_PREDICATE:
1024         assert(index->i[0] < TGSI_EXEC_NUM_PREDS);
1025         assert(index->i[1] < TGSI_EXEC_NUM_PREDS);
1026         assert(index->i[2] < TGSI_EXEC_NUM_PREDS);
1027         assert(index->i[3] < TGSI_EXEC_NUM_PREDS);
1028         chan->u[0] = mach->Predicates[0].xyzw[swizzle].u[0];
1029         chan->u[1] = mach->Predicates[0].xyzw[swizzle].u[1];
1030         chan->u[2] = mach->Predicates[0].xyzw[swizzle].u[2];
1031         chan->u[3] = mach->Predicates[0].xyzw[swizzle].u[3];
1032         break;
1033
1034      case TGSI_FILE_OUTPUT:
1035         /* vertex/fragment output vars can be read too */
1036         chan->u[0] = mach->Outputs[index->i[0]].xyzw[swizzle].u[0];
1037         chan->u[1] = mach->Outputs[index->i[1]].xyzw[swizzle].u[1];
1038         chan->u[2] = mach->Outputs[index->i[2]].xyzw[swizzle].u[2];
1039         chan->u[3] = mach->Outputs[index->i[3]].xyzw[swizzle].u[3];
1040         break;
1041
1042      default:
1043         assert( 0 );
1044      }
1045      break;
1046
1047   default:
1048      assert( 0 );
1049   }
1050}
1051
1052static void
1053fetch_source(const struct tgsi_exec_machine *mach,
1054             union tgsi_exec_channel *chan,
1055             const struct tgsi_full_src_register *reg,
1056             const uint chan_index,
1057             enum tgsi_exec_datatype src_datatype)
1058{
1059   union tgsi_exec_channel index;
1060   uint swizzle;
1061
1062   /* We start with a direct index into a register file.
1063    *
1064    *    file[1],
1065    *    where:
1066    *       file = Register.File
1067    *       [1] = Register.Index
1068    */
1069   index.i[0] =
1070   index.i[1] =
1071   index.i[2] =
1072   index.i[3] = reg->Register.Index;
1073
1074   /* There is an extra source register that indirectly subscripts
1075    * a register file. The direct index now becomes an offset
1076    * that is being added to the indirect register.
1077    *
1078    *    file[ind[2].x+1],
1079    *    where:
1080    *       ind = Indirect.File
1081    *       [2] = Indirect.Index
1082    *       .x = Indirect.SwizzleX
1083    */
1084   if (reg->Register.Indirect) {
1085      union tgsi_exec_channel index2;
1086      union tgsi_exec_channel indir_index;
1087      const uint execmask = mach->ExecMask;
1088      uint i;
1089
1090      /* which address register (always zero now) */
1091      index2.i[0] =
1092      index2.i[1] =
1093      index2.i[2] =
1094      index2.i[3] = reg->Indirect.Index;
1095
1096      /* get current value of address register[swizzle] */
1097      swizzle = tgsi_util_get_src_register_swizzle( &reg->Indirect, CHAN_X );
1098      fetch_src_file_channel(
1099         mach,
1100         reg->Indirect.File,
1101         swizzle,
1102         &index2,
1103         &indir_index );
1104
1105      /* add value of address register to the offset */
1106      index.i[0] += indir_index.i[0];
1107      index.i[1] += indir_index.i[1];
1108      index.i[2] += indir_index.i[2];
1109      index.i[3] += indir_index.i[3];
1110
1111      /* for disabled execution channels, zero-out the index to
1112       * avoid using a potential garbage value.
1113       */
1114      for (i = 0; i < QUAD_SIZE; i++) {
1115         if ((execmask & (1 << i)) == 0)
1116            index.i[i] = 0;
1117      }
1118   }
1119
1120   /* There is an extra source register that is a second
1121    * subscript to a register file. Effectively it means that
1122    * the register file is actually a 2D array of registers.
1123    *
1124    *    file[1][3] == file[1*sizeof(file[1])+3],
1125    *    where:
1126    *       [3] = Dimension.Index
1127    */
1128   if (reg->Register.Dimension) {
1129      /* The size of the first-order array depends on the register file type.
1130       * We need to multiply the index to the first array to get an effective,
1131       * "flat" index that points to the beginning of the second-order array.
1132       */
1133      switch (reg->Register.File) {
1134      case TGSI_FILE_INPUT:
1135      case TGSI_FILE_SYSTEM_VALUE:
1136         index.i[0] *= TGSI_EXEC_MAX_INPUT_ATTRIBS;
1137         index.i[1] *= TGSI_EXEC_MAX_INPUT_ATTRIBS;
1138         index.i[2] *= TGSI_EXEC_MAX_INPUT_ATTRIBS;
1139         index.i[3] *= TGSI_EXEC_MAX_INPUT_ATTRIBS;
1140         break;
1141      case TGSI_FILE_CONSTANT:
1142         index.i[0] *= TGSI_EXEC_MAX_CONST_BUFFER;
1143         index.i[1] *= TGSI_EXEC_MAX_CONST_BUFFER;
1144         index.i[2] *= TGSI_EXEC_MAX_CONST_BUFFER;
1145         index.i[3] *= TGSI_EXEC_MAX_CONST_BUFFER;
1146         break;
1147      default:
1148         assert( 0 );
1149      }
1150
1151      index.i[0] += reg->Dimension.Index;
1152      index.i[1] += reg->Dimension.Index;
1153      index.i[2] += reg->Dimension.Index;
1154      index.i[3] += reg->Dimension.Index;
1155
1156      /* Again, the second subscript index can be addressed indirectly
1157       * identically to the first one.
1158       * Nothing stops us from indirectly addressing the indirect register,
1159       * but there is no need for that, so we won't exercise it.
1160       *
1161       *    file[1][ind[4].y+3],
1162       *    where:
1163       *       ind = DimIndirect.File
1164       *       [4] = DimIndirect.Index
1165       *       .y = DimIndirect.SwizzleX
1166       */
1167      if (reg->Dimension.Indirect) {
1168         union tgsi_exec_channel index2;
1169         union tgsi_exec_channel indir_index;
1170         const uint execmask = mach->ExecMask;
1171         uint i;
1172
1173         index2.i[0] =
1174         index2.i[1] =
1175         index2.i[2] =
1176         index2.i[3] = reg->DimIndirect.Index;
1177
1178         swizzle = tgsi_util_get_src_register_swizzle( &reg->DimIndirect, CHAN_X );
1179         fetch_src_file_channel(
1180            mach,
1181            reg->DimIndirect.File,
1182            swizzle,
1183            &index2,
1184            &indir_index );
1185
1186         index.i[0] += indir_index.i[0];
1187         index.i[1] += indir_index.i[1];
1188         index.i[2] += indir_index.i[2];
1189         index.i[3] += indir_index.i[3];
1190
1191         /* for disabled execution channels, zero-out the index to
1192          * avoid using a potential garbage value.
1193          */
1194         for (i = 0; i < QUAD_SIZE; i++) {
1195            if ((execmask & (1 << i)) == 0)
1196               index.i[i] = 0;
1197         }
1198      }
1199
1200      /* If by any chance there was a need for a 3D array of register
1201       * files, we would have to check whether Dimension is followed
1202       * by a dimension register and continue the saga.
1203       */
1204   }
1205
1206   swizzle = tgsi_util_get_full_src_register_swizzle( reg, chan_index );
1207   fetch_src_file_channel(
1208      mach,
1209      reg->Register.File,
1210      swizzle,
1211      &index,
1212      chan );
1213
1214   if (reg->Register.Absolute) {
1215      if (src_datatype == TGSI_EXEC_DATA_FLOAT) {
1216         micro_abs(chan, chan);
1217      } else {
1218         micro_iabs(chan, chan);
1219      }
1220   }
1221
1222   if (reg->Register.Negate) {
1223      if (src_datatype == TGSI_EXEC_DATA_FLOAT) {
1224         micro_neg(chan, chan);
1225      } else {
1226         micro_ineg(chan, chan);
1227      }
1228   }
1229}
1230
1231static void
1232store_dest(struct tgsi_exec_machine *mach,
1233           const union tgsi_exec_channel *chan,
1234           const struct tgsi_full_dst_register *reg,
1235           const struct tgsi_full_instruction *inst,
1236           uint chan_index,
1237           enum tgsi_exec_datatype dst_datatype)
1238{
1239   uint i;
1240   union tgsi_exec_channel null;
1241   union tgsi_exec_channel *dst;
1242   uint execmask = mach->ExecMask;
1243   int offset = 0;  /* indirection offset */
1244   int index;
1245
1246   if (dst_datatype == TGSI_EXEC_DATA_FLOAT) {
1247      CHECK_INF_OR_NAN(chan);
1248   }
1249
1250   /* There is an extra source register that indirectly subscripts
1251    * a register file. The direct index now becomes an offset
1252    * that is being added to the indirect register.
1253    *
1254    *    file[ind[2].x+1],
1255    *    where:
1256    *       ind = Indirect.File
1257    *       [2] = Indirect.Index
1258    *       .x = Indirect.SwizzleX
1259    */
1260   if (reg->Register.Indirect) {
1261      union tgsi_exec_channel index;
1262      union tgsi_exec_channel indir_index;
1263      uint swizzle;
1264
1265      /* which address register (always zero for now) */
1266      index.i[0] =
1267      index.i[1] =
1268      index.i[2] =
1269      index.i[3] = reg->Indirect.Index;
1270
1271      /* get current value of address register[swizzle] */
1272      swizzle = tgsi_util_get_src_register_swizzle( &reg->Indirect, CHAN_X );
1273
1274      /* fetch values from the address/indirection register */
1275      fetch_src_file_channel(
1276         mach,
1277         reg->Indirect.File,
1278         swizzle,
1279         &index,
1280         &indir_index );
1281
1282      /* save indirection offset */
1283      offset = indir_index.i[0];
1284   }
1285
1286   switch (reg->Register.File) {
1287   case TGSI_FILE_NULL:
1288      dst = &null;
1289      break;
1290
1291   case TGSI_FILE_OUTPUT:
1292      index = mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0]
1293         + reg->Register.Index;
1294      dst = &mach->Outputs[offset + index].xyzw[chan_index];
1295#if 0
1296      if (TGSI_PROCESSOR_GEOMETRY == mach->Processor) {
1297         fprintf(stderr, "STORING OUT[%d] mask(%d), = (", offset + index, execmask);
1298         for (i = 0; i < QUAD_SIZE; i++)
1299            if (execmask & (1 << i))
1300               fprintf(stderr, "%f, ", chan->f[i]);
1301         fprintf(stderr, ")\n");
1302      }
1303#endif
1304      break;
1305
1306   case TGSI_FILE_TEMPORARY:
1307      index = reg->Register.Index;
1308      assert( index < TGSI_EXEC_NUM_TEMPS );
1309      dst = &mach->Temps[offset + index].xyzw[chan_index];
1310      break;
1311
1312   case TGSI_FILE_ADDRESS:
1313      index = reg->Register.Index;
1314      dst = &mach->Addrs[index].xyzw[chan_index];
1315      break;
1316
1317   case TGSI_FILE_LOOP:
1318      assert(reg->Register.Index == 0);
1319      assert(mach->LoopCounterStackTop > 0);
1320      assert(chan_index == CHAN_X);
1321      dst = &mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[chan_index];
1322      break;
1323
1324   case TGSI_FILE_PREDICATE:
1325      index = reg->Register.Index;
1326      assert(index < TGSI_EXEC_NUM_PREDS);
1327      dst = &mach->Predicates[index].xyzw[chan_index];
1328      break;
1329
1330   default:
1331      assert( 0 );
1332      return;
1333   }
1334
1335   if (inst->Instruction.Predicate) {
1336      uint swizzle;
1337      union tgsi_exec_channel *pred;
1338
1339      switch (chan_index) {
1340      case CHAN_X:
1341         swizzle = inst->Predicate.SwizzleX;
1342         break;
1343      case CHAN_Y:
1344         swizzle = inst->Predicate.SwizzleY;
1345         break;
1346      case CHAN_Z:
1347         swizzle = inst->Predicate.SwizzleZ;
1348         break;
1349      case CHAN_W:
1350         swizzle = inst->Predicate.SwizzleW;
1351         break;
1352      default:
1353         assert(0);
1354         return;
1355      }
1356
1357      assert(inst->Predicate.Index == 0);
1358
1359      pred = &mach->Predicates[inst->Predicate.Index].xyzw[swizzle];
1360
1361      if (inst->Predicate.Negate) {
1362         for (i = 0; i < QUAD_SIZE; i++) {
1363            if (pred->u[i]) {
1364               execmask &= ~(1 << i);
1365            }
1366         }
1367      } else {
1368         for (i = 0; i < QUAD_SIZE; i++) {
1369            if (!pred->u[i]) {
1370               execmask &= ~(1 << i);
1371            }
1372         }
1373      }
1374   }
1375
1376   switch (inst->Instruction.Saturate) {
1377   case TGSI_SAT_NONE:
1378      for (i = 0; i < QUAD_SIZE; i++)
1379         if (execmask & (1 << i))
1380            dst->i[i] = chan->i[i];
1381      break;
1382
1383   case TGSI_SAT_ZERO_ONE:
1384      for (i = 0; i < QUAD_SIZE; i++)
1385         if (execmask & (1 << i)) {
1386            if (chan->f[i] < 0.0f)
1387               dst->f[i] = 0.0f;
1388            else if (chan->f[i] > 1.0f)
1389               dst->f[i] = 1.0f;
1390            else
1391               dst->i[i] = chan->i[i];
1392         }
1393      break;
1394
1395   case TGSI_SAT_MINUS_PLUS_ONE:
1396      for (i = 0; i < QUAD_SIZE; i++)
1397         if (execmask & (1 << i)) {
1398            if (chan->f[i] < -1.0f)
1399               dst->f[i] = -1.0f;
1400            else if (chan->f[i] > 1.0f)
1401               dst->f[i] = 1.0f;
1402            else
1403               dst->i[i] = chan->i[i];
1404         }
1405      break;
1406
1407   default:
1408      assert( 0 );
1409   }
1410}
1411
/*
 * Shorthands for float-typed operand access.  Both expect variables named
 * `mach` and `inst` to be in scope at the point of use.
 */
#define FETCH(VAL, INDEX, CHAN) \
   fetch_source(mach, (VAL), &inst->Src[(INDEX)], (CHAN), TGSI_EXEC_DATA_FLOAT)

#define STORE(VAL, INDEX, CHAN) \
   store_dest(mach, (VAL), &inst->Dst[(INDEX)], inst, (CHAN), TGSI_EXEC_DATA_FLOAT)
1417
1418
1419/**
1420 * Execute ARB-style KIL which is predicated by a src register.
1421 * Kill fragment if any of the four values is less than zero.
1422 */
1423static void
1424exec_kil(struct tgsi_exec_machine *mach,
1425         const struct tgsi_full_instruction *inst)
1426{
1427   uint uniquemask;
1428   uint chan_index;
1429   uint kilmask = 0; /* bit 0 = pixel 0, bit 1 = pixel 1, etc */
1430   union tgsi_exec_channel r[1];
1431
1432   /* This mask stores component bits that were already tested. */
1433   uniquemask = 0;
1434
1435   for (chan_index = 0; chan_index < 4; chan_index++)
1436   {
1437      uint swizzle;
1438      uint i;
1439
1440      /* unswizzle channel */
1441      swizzle = tgsi_util_get_full_src_register_swizzle (
1442                        &inst->Src[0],
1443                        chan_index);
1444
1445      /* check if the component has not been already tested */
1446      if (uniquemask & (1 << swizzle))
1447         continue;
1448      uniquemask |= 1 << swizzle;
1449
1450      FETCH(&r[0], 0, chan_index);
1451      for (i = 0; i < 4; i++)
1452         if (r[0].f[i] < 0.0f)
1453            kilmask |= 1 << i;
1454   }
1455
1456   mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] |= kilmask;
1457}
1458
1459/**
1460 * Execute NVIDIA-style KIL which is predicated by a condition code.
1461 * Kill fragment if the condition code is TRUE.
1462 */
1463static void
1464exec_kilp(struct tgsi_exec_machine *mach,
1465          const struct tgsi_full_instruction *inst)
1466{
1467   uint kilmask; /* bit 0 = pixel 0, bit 1 = pixel 1, etc */
1468
1469   /* "unconditional" kil */
1470   kilmask = mach->ExecMask;
1471   mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] |= kilmask;
1472}
1473
1474static void
1475emit_vertex(struct tgsi_exec_machine *mach)
1476{
1477   /* FIXME: check for exec mask correctly
1478   unsigned i;
1479   for (i = 0; i < QUAD_SIZE; ++i) {
1480         if ((mach->ExecMask & (1 << i)))
1481   */
1482   if (mach->ExecMask) {
1483      mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] += mach->NumOutputs;
1484      mach->Primitives[mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]]++;
1485   }
1486}
1487
1488static void
1489emit_primitive(struct tgsi_exec_machine *mach)
1490{
1491   unsigned *prim_count = &mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0];
1492   /* FIXME: check for exec mask correctly
1493   unsigned i;
1494   for (i = 0; i < QUAD_SIZE; ++i) {
1495         if ((mach->ExecMask & (1 << i)))
1496   */
1497   if (mach->ExecMask) {
1498      ++(*prim_count);
1499      debug_assert((*prim_count * mach->NumOutputs) < mach->MaxGeometryShaderOutputs);
1500      mach->Primitives[*prim_count] = 0;
1501   }
1502}
1503
1504/*
1505 * Fetch a four texture samples using STR texture coordinates.
1506 */
1507static void
1508fetch_texel( struct tgsi_sampler *sampler,
1509             const union tgsi_exec_channel *s,
1510             const union tgsi_exec_channel *t,
1511             const union tgsi_exec_channel *p,
1512             const union tgsi_exec_channel *lodbias,
1513             union tgsi_exec_channel *r,
1514             union tgsi_exec_channel *g,
1515             union tgsi_exec_channel *b,
1516             union tgsi_exec_channel *a )
1517{
1518   uint j;
1519   float rgba[NUM_CHANNELS][QUAD_SIZE];
1520
1521   sampler->get_samples(sampler, s->f, t->f, p->f, lodbias->f, rgba);
1522
1523   for (j = 0; j < 4; j++) {
1524      r->f[j] = rgba[0][j];
1525      g->f[j] = rgba[1][j];
1526      b->f[j] = rgba[2][j];
1527      a->f[j] = rgba[3][j];
1528   }
1529}
1530
1531
/* Texture lookup variants understood by exec_tex(). */
#define TEX_MODIFIER_NONE           0  /* src W is not fetched, zero LOD bias */
#define TEX_MODIFIER_PROJECTED      1  /* coordinates are divided by src W */
#define TEX_MODIFIER_LOD_BIAS       2  /* src W is handed to the sampler as lodbias */
#define TEX_MODIFIER_EXPLICIT_LOD   3  /* src W is handed to the sampler as lodbias */
1536
1537
1538static void
1539exec_tex(struct tgsi_exec_machine *mach,
1540         const struct tgsi_full_instruction *inst,
1541         uint modifier)
1542{
1543   const uint unit = inst->Src[1].Register.Index;
1544   union tgsi_exec_channel r[4];
1545   const union tgsi_exec_channel *lodBias = &ZeroVec;
1546   uint chan_index;
1547
1548   if (modifier != TEX_MODIFIER_NONE) {
1549      FETCH(&r[3], 0, CHAN_W);
1550      if (modifier != TEX_MODIFIER_PROJECTED) {
1551         lodBias = &r[3];
1552      }
1553   }
1554
1555   switch (inst->Texture.Texture) {
1556   case TGSI_TEXTURE_1D:
1557   case TGSI_TEXTURE_SHADOW1D:
1558      FETCH(&r[0], 0, CHAN_X);
1559
1560      if (modifier == TEX_MODIFIER_PROJECTED) {
1561         micro_div(&r[0], &r[0], &r[3]);
1562      }
1563
1564      fetch_texel(mach->Samplers[unit],
1565                  &r[0], &ZeroVec, &ZeroVec, lodBias, /* S, T, P, BIAS */
1566                  &r[0], &r[1], &r[2], &r[3]);        /* R, G, B, A */
1567      break;
1568
1569   case TGSI_TEXTURE_2D:
1570   case TGSI_TEXTURE_RECT:
1571   case TGSI_TEXTURE_SHADOW2D:
1572   case TGSI_TEXTURE_SHADOWRECT:
1573      FETCH(&r[0], 0, CHAN_X);
1574      FETCH(&r[1], 0, CHAN_Y);
1575      FETCH(&r[2], 0, CHAN_Z);
1576
1577      if (modifier == TEX_MODIFIER_PROJECTED) {
1578         micro_div(&r[0], &r[0], &r[3]);
1579         micro_div(&r[1], &r[1], &r[3]);
1580         micro_div(&r[2], &r[2], &r[3]);
1581      }
1582
1583      fetch_texel(mach->Samplers[unit],
1584                  &r[0], &r[1], &r[2], lodBias, /* inputs */
1585                  &r[0], &r[1], &r[2], &r[3]);  /* outputs */
1586      break;
1587
1588   case TGSI_TEXTURE_3D:
1589   case TGSI_TEXTURE_CUBE:
1590      FETCH(&r[0], 0, CHAN_X);
1591      FETCH(&r[1], 0, CHAN_Y);
1592      FETCH(&r[2], 0, CHAN_Z);
1593
1594      if (modifier == TEX_MODIFIER_PROJECTED) {
1595         micro_div(&r[0], &r[0], &r[3]);
1596         micro_div(&r[1], &r[1], &r[3]);
1597         micro_div(&r[2], &r[2], &r[3]);
1598      }
1599
1600      fetch_texel(mach->Samplers[unit],
1601                  &r[0], &r[1], &r[2], lodBias,
1602                  &r[0], &r[1], &r[2], &r[3]);
1603      break;
1604
1605   default:
1606      assert(0);
1607   }
1608
1609   FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
1610      STORE(&r[chan_index], 0, chan_index);
1611   }
1612}
1613
1614static void
1615exec_txd(struct tgsi_exec_machine *mach,
1616         const struct tgsi_full_instruction *inst)
1617{
1618   const uint unit = inst->Src[3].Register.Index;
1619   union tgsi_exec_channel r[4];
1620   uint chan_index;
1621
1622   /*
1623    * XXX: This is fake TXD -- the derivatives are not taken into account, yet.
1624    */
1625
1626   switch (inst->Texture.Texture) {
1627   case TGSI_TEXTURE_1D:
1628   case TGSI_TEXTURE_SHADOW1D:
1629
1630      FETCH(&r[0], 0, CHAN_X);
1631
1632      fetch_texel(mach->Samplers[unit],
1633                  &r[0], &ZeroVec, &ZeroVec, &ZeroVec,   /* S, T, P, BIAS */
1634                  &r[0], &r[1], &r[2], &r[3]);           /* R, G, B, A */
1635      break;
1636
1637   case TGSI_TEXTURE_2D:
1638   case TGSI_TEXTURE_RECT:
1639   case TGSI_TEXTURE_SHADOW2D:
1640   case TGSI_TEXTURE_SHADOWRECT:
1641
1642      FETCH(&r[0], 0, CHAN_X);
1643      FETCH(&r[1], 0, CHAN_Y);
1644      FETCH(&r[2], 0, CHAN_Z);
1645
1646      fetch_texel(mach->Samplers[unit],
1647                  &r[0], &r[1], &r[2], &ZeroVec,   /* inputs */
1648                  &r[0], &r[1], &r[2], &r[3]);     /* outputs */
1649      break;
1650
1651   case TGSI_TEXTURE_3D:
1652   case TGSI_TEXTURE_CUBE:
1653
1654      FETCH(&r[0], 0, CHAN_X);
1655      FETCH(&r[1], 0, CHAN_Y);
1656      FETCH(&r[2], 0, CHAN_Z);
1657
1658      fetch_texel(mach->Samplers[unit],
1659                  &r[0], &r[1], &r[2], &ZeroVec,
1660                  &r[0], &r[1], &r[2], &r[3]);
1661      break;
1662
1663   default:
1664      assert(0);
1665   }
1666
1667   FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
1668      STORE(&r[chan_index], 0, chan_index);
1669   }
1670}
1671
1672
1673/**
1674 * Evaluate a constant-valued coefficient at the position of the
1675 * current quad.
1676 */
1677static void
1678eval_constant_coef(
1679   struct tgsi_exec_machine *mach,
1680   unsigned attrib,
1681   unsigned chan )
1682{
1683   unsigned i;
1684
1685   for( i = 0; i < QUAD_SIZE; i++ ) {
1686      mach->Inputs[attrib].xyzw[chan].f[i] = mach->InterpCoefs[attrib].a0[chan];
1687   }
1688}
1689
1690/**
1691 * Evaluate a linear-valued coefficient at the position of the
1692 * current quad.
1693 */
1694static void
1695eval_linear_coef(
1696   struct tgsi_exec_machine *mach,
1697   unsigned attrib,
1698   unsigned chan )
1699{
1700   const float x = mach->QuadPos.xyzw[0].f[0];
1701   const float y = mach->QuadPos.xyzw[1].f[0];
1702   const float dadx = mach->InterpCoefs[attrib].dadx[chan];
1703   const float dady = mach->InterpCoefs[attrib].dady[chan];
1704   const float a0 = mach->InterpCoefs[attrib].a0[chan] + dadx * x + dady * y;
1705   mach->Inputs[attrib].xyzw[chan].f[0] = a0;
1706   mach->Inputs[attrib].xyzw[chan].f[1] = a0 + dadx;
1707   mach->Inputs[attrib].xyzw[chan].f[2] = a0 + dady;
1708   mach->Inputs[attrib].xyzw[chan].f[3] = a0 + dadx + dady;
1709}
1710
1711/**
1712 * Evaluate a perspective-valued coefficient at the position of the
1713 * current quad.
1714 */
1715static void
1716eval_perspective_coef(
1717   struct tgsi_exec_machine *mach,
1718   unsigned attrib,
1719   unsigned chan )
1720{
1721   const float x = mach->QuadPos.xyzw[0].f[0];
1722   const float y = mach->QuadPos.xyzw[1].f[0];
1723   const float dadx = mach->InterpCoefs[attrib].dadx[chan];
1724   const float dady = mach->InterpCoefs[attrib].dady[chan];
1725   const float a0 = mach->InterpCoefs[attrib].a0[chan] + dadx * x + dady * y;
1726   const float *w = mach->QuadPos.xyzw[3].f;
1727   /* divide by W here */
1728   mach->Inputs[attrib].xyzw[chan].f[0] = a0 / w[0];
1729   mach->Inputs[attrib].xyzw[chan].f[1] = (a0 + dadx) / w[1];
1730   mach->Inputs[attrib].xyzw[chan].f[2] = (a0 + dady) / w[2];
1731   mach->Inputs[attrib].xyzw[chan].f[3] = (a0 + dadx + dady) / w[3];
1732}
1733
1734
/** Signature shared by the eval_*_coef() input evaluators. */
typedef void (*eval_coef_func)(struct tgsi_exec_machine *mach,
                               unsigned attrib,
                               unsigned chan);
1739
1740static void
1741exec_declaration(struct tgsi_exec_machine *mach,
1742                 const struct tgsi_full_declaration *decl)
1743{
1744   if (mach->Processor == TGSI_PROCESSOR_FRAGMENT) {
1745      if (decl->Declaration.File == TGSI_FILE_INPUT ||
1746          decl->Declaration.File == TGSI_FILE_SYSTEM_VALUE) {
1747         uint first, last, mask;
1748
1749         first = decl->Range.First;
1750         last = decl->Range.Last;
1751         mask = decl->Declaration.UsageMask;
1752
1753         if (decl->Semantic.Name == TGSI_SEMANTIC_POSITION) {
1754            assert(decl->Semantic.Index == 0);
1755            assert(first == last);
1756            assert(mask == TGSI_WRITEMASK_XYZW);
1757
1758            mach->Inputs[first] = mach->QuadPos;
1759         } else if (decl->Semantic.Name == TGSI_SEMANTIC_FACE) {
1760            uint i;
1761
1762            assert(decl->Semantic.Index == 0);
1763            assert(first == last);
1764
1765            for (i = 0; i < QUAD_SIZE; i++) {
1766               mach->Inputs[first].xyzw[0].f[i] = mach->Face;
1767            }
1768         } else {
1769            eval_coef_func eval;
1770            uint i, j;
1771
1772            switch (decl->Declaration.Interpolate) {
1773            case TGSI_INTERPOLATE_CONSTANT:
1774               eval = eval_constant_coef;
1775               break;
1776
1777            case TGSI_INTERPOLATE_LINEAR:
1778               eval = eval_linear_coef;
1779               break;
1780
1781            case TGSI_INTERPOLATE_PERSPECTIVE:
1782               eval = eval_perspective_coef;
1783               break;
1784
1785            default:
1786               assert(0);
1787               return;
1788            }
1789
1790            for (j = 0; j < NUM_CHANNELS; j++) {
1791               if (mask & (1 << j)) {
1792                  for (i = first; i <= last; i++) {
1793                     eval(mach, i, j);
1794                  }
1795               }
1796            }
1797         }
1798      }
1799   }
1800}
1801
/**
 * Per-channel operation used by the exec_*_unary/binary/trinary helpers.
 * For multi-operand operations \p src points at an array of operands.
 */
typedef void (*micro_op)(union tgsi_exec_channel *dst,
                         const union tgsi_exec_channel *src);
1804
1805static void
1806exec_scalar_unary(struct tgsi_exec_machine *mach,
1807                  const struct tgsi_full_instruction *inst,
1808                  micro_op op,
1809                  enum tgsi_exec_datatype dst_datatype,
1810                  enum tgsi_exec_datatype src_datatype)
1811{
1812   unsigned int chan;
1813   union tgsi_exec_channel src;
1814   union tgsi_exec_channel dst;
1815
1816   fetch_source(mach, &src, &inst->Src[0], CHAN_X, src_datatype);
1817   op(&dst, &src);
1818   for (chan = 0; chan < NUM_CHANNELS; chan++) {
1819      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
1820         store_dest(mach, &dst, &inst->Dst[0], inst, chan, dst_datatype);
1821      }
1822   }
1823}
1824
1825static void
1826exec_vector_unary(struct tgsi_exec_machine *mach,
1827                  const struct tgsi_full_instruction *inst,
1828                  micro_op op,
1829                  enum tgsi_exec_datatype dst_datatype,
1830                  enum tgsi_exec_datatype src_datatype)
1831{
1832   unsigned int chan;
1833   struct tgsi_exec_vector dst;
1834
1835   for (chan = 0; chan < NUM_CHANNELS; chan++) {
1836      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
1837         union tgsi_exec_channel src;
1838
1839         fetch_source(mach, &src, &inst->Src[0], chan, src_datatype);
1840         op(&dst.xyzw[chan], &src);
1841      }
1842   }
1843   for (chan = 0; chan < NUM_CHANNELS; chan++) {
1844      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
1845         store_dest(mach, &dst.xyzw[chan], &inst->Dst[0], inst, chan, dst_datatype);
1846      }
1847   }
1848}
1849
1850static void
1851exec_vector_binary(struct tgsi_exec_machine *mach,
1852                   const struct tgsi_full_instruction *inst,
1853                   micro_op op,
1854                   enum tgsi_exec_datatype dst_datatype,
1855                   enum tgsi_exec_datatype src_datatype)
1856{
1857   unsigned int chan;
1858   struct tgsi_exec_vector dst;
1859
1860   for (chan = 0; chan < NUM_CHANNELS; chan++) {
1861      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
1862         union tgsi_exec_channel src[2];
1863
1864         fetch_source(mach, &src[0], &inst->Src[0], chan, src_datatype);
1865         fetch_source(mach, &src[1], &inst->Src[1], chan, src_datatype);
1866         op(&dst.xyzw[chan], src);
1867      }
1868   }
1869   for (chan = 0; chan < NUM_CHANNELS; chan++) {
1870      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
1871         store_dest(mach, &dst.xyzw[chan], &inst->Dst[0], inst, chan, dst_datatype);
1872      }
1873   }
1874}
1875
1876static void
1877exec_vector_trinary(struct tgsi_exec_machine *mach,
1878                    const struct tgsi_full_instruction *inst,
1879                    micro_op op,
1880                    enum tgsi_exec_datatype dst_datatype,
1881                    enum tgsi_exec_datatype src_datatype)
1882{
1883   unsigned int chan;
1884   struct tgsi_exec_vector dst;
1885
1886   for (chan = 0; chan < NUM_CHANNELS; chan++) {
1887      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
1888         union tgsi_exec_channel src[3];
1889
1890         fetch_source(mach, &src[0], &inst->Src[0], chan, src_datatype);
1891         fetch_source(mach, &src[1], &inst->Src[1], chan, src_datatype);
1892         fetch_source(mach, &src[2], &inst->Src[2], chan, src_datatype);
1893         op(&dst.xyzw[chan], src);
1894      }
1895   }
1896   for (chan = 0; chan < NUM_CHANNELS; chan++) {
1897      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
1898         store_dest(mach, &dst.xyzw[chan], &inst->Dst[0], inst, chan, dst_datatype);
1899      }
1900   }
1901}
1902
1903static void
1904exec_break(struct tgsi_exec_machine *mach)
1905{
1906   if (mach->BreakType == TGSI_EXEC_BREAK_INSIDE_LOOP) {
1907      /* turn off loop channels for each enabled exec channel */
1908      mach->LoopMask &= ~mach->ExecMask;
1909      /* Todo: if mach->LoopMask == 0, jump to end of loop */
1910      UPDATE_EXEC_MASK(mach);
1911   } else {
1912      assert(mach->BreakType == TGSI_EXEC_BREAK_INSIDE_SWITCH);
1913
1914      mach->Switch.mask = 0x0;
1915
1916      UPDATE_EXEC_MASK(mach);
1917   }
1918}
1919
1920static void
1921exec_switch(struct tgsi_exec_machine *mach,
1922            const struct tgsi_full_instruction *inst)
1923{
1924   assert(mach->SwitchStackTop < TGSI_EXEC_MAX_SWITCH_NESTING);
1925   assert(mach->BreakStackTop < TGSI_EXEC_MAX_BREAK_STACK);
1926
1927   mach->SwitchStack[mach->SwitchStackTop++] = mach->Switch;
1928   fetch_source(mach, &mach->Switch.selector, &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_UINT);
1929   mach->Switch.mask = 0x0;
1930   mach->Switch.defaultMask = 0x0;
1931
1932   mach->BreakStack[mach->BreakStackTop++] = mach->BreakType;
1933   mach->BreakType = TGSI_EXEC_BREAK_INSIDE_SWITCH;
1934
1935   UPDATE_EXEC_MASK(mach);
1936}
1937
1938static void
1939exec_case(struct tgsi_exec_machine *mach,
1940          const struct tgsi_full_instruction *inst)
1941{
1942   uint prevMask = mach->SwitchStack[mach->SwitchStackTop - 1].mask;
1943   union tgsi_exec_channel src;
1944   uint mask = 0;
1945
1946   fetch_source(mach, &src, &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_UINT);
1947
1948   if (mach->Switch.selector.u[0] == src.u[0]) {
1949      mask |= 0x1;
1950   }
1951   if (mach->Switch.selector.u[1] == src.u[1]) {
1952      mask |= 0x2;
1953   }
1954   if (mach->Switch.selector.u[2] == src.u[2]) {
1955      mask |= 0x4;
1956   }
1957   if (mach->Switch.selector.u[3] == src.u[3]) {
1958      mask |= 0x8;
1959   }
1960
1961   mach->Switch.defaultMask |= mask;
1962
1963   mach->Switch.mask |= mask & prevMask;
1964
1965   UPDATE_EXEC_MASK(mach);
1966}
1967
1968static void
1969exec_default(struct tgsi_exec_machine *mach)
1970{
1971   uint prevMask = mach->SwitchStack[mach->SwitchStackTop - 1].mask;
1972
1973   mach->Switch.mask |= ~mach->Switch.defaultMask & prevMask;
1974
1975   UPDATE_EXEC_MASK(mach);
1976}
1977
1978static void
1979exec_endswitch(struct tgsi_exec_machine *mach)
1980{
1981   mach->Switch = mach->SwitchStack[--mach->SwitchStackTop];
1982   mach->BreakType = mach->BreakStack[--mach->BreakStackTop];
1983
1984   UPDATE_EXEC_MASK(mach);
1985}
1986
1987static void
1988micro_i2f(union tgsi_exec_channel *dst,
1989          const union tgsi_exec_channel *src)
1990{
1991   dst->f[0] = (float)src->i[0];
1992   dst->f[1] = (float)src->i[1];
1993   dst->f[2] = (float)src->i[2];
1994   dst->f[3] = (float)src->i[3];
1995}
1996
1997static void
1998micro_not(union tgsi_exec_channel *dst,
1999          const union tgsi_exec_channel *src)
2000{
2001   dst->u[0] = ~src->u[0];
2002   dst->u[1] = ~src->u[1];
2003   dst->u[2] = ~src->u[2];
2004   dst->u[3] = ~src->u[3];
2005}
2006
2007static void
2008micro_shl(union tgsi_exec_channel *dst,
2009          const union tgsi_exec_channel *src)
2010{
2011   dst->u[0] = src[0].u[0] << src[1].u[0];
2012   dst->u[1] = src[0].u[1] << src[1].u[1];
2013   dst->u[2] = src[0].u[2] << src[1].u[2];
2014   dst->u[3] = src[0].u[3] << src[1].u[3];
2015}
2016
2017static void
2018micro_and(union tgsi_exec_channel *dst,
2019          const union tgsi_exec_channel *src)
2020{
2021   dst->u[0] = src[0].u[0] & src[1].u[0];
2022   dst->u[1] = src[0].u[1] & src[1].u[1];
2023   dst->u[2] = src[0].u[2] & src[1].u[2];
2024   dst->u[3] = src[0].u[3] & src[1].u[3];
2025}
2026
2027static void
2028micro_or(union tgsi_exec_channel *dst,
2029         const union tgsi_exec_channel *src)
2030{
2031   dst->u[0] = src[0].u[0] | src[1].u[0];
2032   dst->u[1] = src[0].u[1] | src[1].u[1];
2033   dst->u[2] = src[0].u[2] | src[1].u[2];
2034   dst->u[3] = src[0].u[3] | src[1].u[3];
2035}
2036
2037static void
2038micro_xor(union tgsi_exec_channel *dst,
2039          const union tgsi_exec_channel *src)
2040{
2041   dst->u[0] = src[0].u[0] ^ src[1].u[0];
2042   dst->u[1] = src[0].u[1] ^ src[1].u[1];
2043   dst->u[2] = src[0].u[2] ^ src[1].u[2];
2044   dst->u[3] = src[0].u[3] ^ src[1].u[3];
2045}
2046
2047static void
2048micro_f2i(union tgsi_exec_channel *dst,
2049          const union tgsi_exec_channel *src)
2050{
2051   dst->i[0] = (int)src->f[0];
2052   dst->i[1] = (int)src->f[1];
2053   dst->i[2] = (int)src->f[2];
2054   dst->i[3] = (int)src->f[3];
2055}
2056
2057static void
2058micro_idiv(union tgsi_exec_channel *dst,
2059           const union tgsi_exec_channel *src)
2060{
2061   dst->i[0] = src[0].i[0] / src[1].i[0];
2062   dst->i[1] = src[0].i[1] / src[1].i[1];
2063   dst->i[2] = src[0].i[2] / src[1].i[2];
2064   dst->i[3] = src[0].i[3] / src[1].i[3];
2065}
2066
2067static void
2068micro_imax(union tgsi_exec_channel *dst,
2069           const union tgsi_exec_channel *src)
2070{
2071   dst->i[0] = src[0].i[0] > src[1].i[0] ? src[0].i[0] : src[1].i[0];
2072   dst->i[1] = src[0].i[1] > src[1].i[1] ? src[0].i[1] : src[1].i[1];
2073   dst->i[2] = src[0].i[2] > src[1].i[2] ? src[0].i[2] : src[1].i[2];
2074   dst->i[3] = src[0].i[3] > src[1].i[3] ? src[0].i[3] : src[1].i[3];
2075}
2076
2077static void
2078micro_imin(union tgsi_exec_channel *dst,
2079           const union tgsi_exec_channel *src)
2080{
2081   dst->i[0] = src[0].i[0] < src[1].i[0] ? src[0].i[0] : src[1].i[0];
2082   dst->i[1] = src[0].i[1] < src[1].i[1] ? src[0].i[1] : src[1].i[1];
2083   dst->i[2] = src[0].i[2] < src[1].i[2] ? src[0].i[2] : src[1].i[2];
2084   dst->i[3] = src[0].i[3] < src[1].i[3] ? src[0].i[3] : src[1].i[3];
2085}
2086
2087static void
2088micro_isge(union tgsi_exec_channel *dst,
2089           const union tgsi_exec_channel *src)
2090{
2091   dst->i[0] = src[0].i[0] >= src[1].i[0] ? -1 : 0;
2092   dst->i[1] = src[0].i[1] >= src[1].i[1] ? -1 : 0;
2093   dst->i[2] = src[0].i[2] >= src[1].i[2] ? -1 : 0;
2094   dst->i[3] = src[0].i[3] >= src[1].i[3] ? -1 : 0;
2095}
2096
2097static void
2098micro_ishr(union tgsi_exec_channel *dst,
2099           const union tgsi_exec_channel *src)
2100{
2101   dst->i[0] = src[0].i[0] >> src[1].i[0];
2102   dst->i[1] = src[0].i[1] >> src[1].i[1];
2103   dst->i[2] = src[0].i[2] >> src[1].i[2];
2104   dst->i[3] = src[0].i[3] >> src[1].i[3];
2105}
2106
2107static void
2108micro_islt(union tgsi_exec_channel *dst,
2109           const union tgsi_exec_channel *src)
2110{
2111   dst->i[0] = src[0].i[0] < src[1].i[0] ? -1 : 0;
2112   dst->i[1] = src[0].i[1] < src[1].i[1] ? -1 : 0;
2113   dst->i[2] = src[0].i[2] < src[1].i[2] ? -1 : 0;
2114   dst->i[3] = src[0].i[3] < src[1].i[3] ? -1 : 0;
2115}
2116
2117static void
2118micro_f2u(union tgsi_exec_channel *dst,
2119          const union tgsi_exec_channel *src)
2120{
2121   dst->u[0] = (uint)src->f[0];
2122   dst->u[1] = (uint)src->f[1];
2123   dst->u[2] = (uint)src->f[2];
2124   dst->u[3] = (uint)src->f[3];
2125}
2126
2127static void
2128micro_u2f(union tgsi_exec_channel *dst,
2129          const union tgsi_exec_channel *src)
2130{
2131   dst->f[0] = (float)src->u[0];
2132   dst->f[1] = (float)src->u[1];
2133   dst->f[2] = (float)src->u[2];
2134   dst->f[3] = (float)src->u[3];
2135}
2136
2137static void
2138micro_uadd(union tgsi_exec_channel *dst,
2139           const union tgsi_exec_channel *src)
2140{
2141   dst->u[0] = src[0].u[0] + src[1].u[0];
2142   dst->u[1] = src[0].u[1] + src[1].u[1];
2143   dst->u[2] = src[0].u[2] + src[1].u[2];
2144   dst->u[3] = src[0].u[3] + src[1].u[3];
2145}
2146
2147static void
2148micro_udiv(union tgsi_exec_channel *dst,
2149           const union tgsi_exec_channel *src)
2150{
2151   dst->u[0] = src[0].u[0] / src[1].u[0];
2152   dst->u[1] = src[0].u[1] / src[1].u[1];
2153   dst->u[2] = src[0].u[2] / src[1].u[2];
2154   dst->u[3] = src[0].u[3] / src[1].u[3];
2155}
2156
2157static void
2158micro_umad(union tgsi_exec_channel *dst,
2159           const union tgsi_exec_channel *src)
2160{
2161   dst->u[0] = src[0].u[0] * src[1].u[0] + src[2].u[0];
2162   dst->u[1] = src[0].u[1] * src[1].u[1] + src[2].u[1];
2163   dst->u[2] = src[0].u[2] * src[1].u[2] + src[2].u[2];
2164   dst->u[3] = src[0].u[3] * src[1].u[3] + src[2].u[3];
2165}
2166
2167static void
2168micro_umax(union tgsi_exec_channel *dst,
2169           const union tgsi_exec_channel *src)
2170{
2171   dst->u[0] = src[0].u[0] > src[1].u[0] ? src[0].u[0] : src[1].u[0];
2172   dst->u[1] = src[0].u[1] > src[1].u[1] ? src[0].u[1] : src[1].u[1];
2173   dst->u[2] = src[0].u[2] > src[1].u[2] ? src[0].u[2] : src[1].u[2];
2174   dst->u[3] = src[0].u[3] > src[1].u[3] ? src[0].u[3] : src[1].u[3];
2175}
2176
2177static void
2178micro_umin(union tgsi_exec_channel *dst,
2179           const union tgsi_exec_channel *src)
2180{
2181   dst->u[0] = src[0].u[0] < src[1].u[0] ? src[0].u[0] : src[1].u[0];
2182   dst->u[1] = src[0].u[1] < src[1].u[1] ? src[0].u[1] : src[1].u[1];
2183   dst->u[2] = src[0].u[2] < src[1].u[2] ? src[0].u[2] : src[1].u[2];
2184   dst->u[3] = src[0].u[3] < src[1].u[3] ? src[0].u[3] : src[1].u[3];
2185}
2186
2187static void
2188micro_umod(union tgsi_exec_channel *dst,
2189           const union tgsi_exec_channel *src)
2190{
2191   dst->u[0] = src[0].u[0] % src[1].u[0];
2192   dst->u[1] = src[0].u[1] % src[1].u[1];
2193   dst->u[2] = src[0].u[2] % src[1].u[2];
2194   dst->u[3] = src[0].u[3] % src[1].u[3];
2195}
2196
2197static void
2198micro_umul(union tgsi_exec_channel *dst,
2199           const union tgsi_exec_channel *src)
2200{
2201   dst->u[0] = src[0].u[0] * src[1].u[0];
2202   dst->u[1] = src[0].u[1] * src[1].u[1];
2203   dst->u[2] = src[0].u[2] * src[1].u[2];
2204   dst->u[3] = src[0].u[3] * src[1].u[3];
2205}
2206
2207static void
2208micro_useq(union tgsi_exec_channel *dst,
2209           const union tgsi_exec_channel *src)
2210{
2211   dst->u[0] = src[0].u[0] == src[1].u[0] ? ~0 : 0;
2212   dst->u[1] = src[0].u[1] == src[1].u[1] ? ~0 : 0;
2213   dst->u[2] = src[0].u[2] == src[1].u[2] ? ~0 : 0;
2214   dst->u[3] = src[0].u[3] == src[1].u[3] ? ~0 : 0;
2215}
2216
2217static void
2218micro_usge(union tgsi_exec_channel *dst,
2219           const union tgsi_exec_channel *src)
2220{
2221   dst->u[0] = src[0].u[0] >= src[1].u[0] ? ~0 : 0;
2222   dst->u[1] = src[0].u[1] >= src[1].u[1] ? ~0 : 0;
2223   dst->u[2] = src[0].u[2] >= src[1].u[2] ? ~0 : 0;
2224   dst->u[3] = src[0].u[3] >= src[1].u[3] ? ~0 : 0;
2225}
2226
2227static void
2228micro_ushr(union tgsi_exec_channel *dst,
2229           const union tgsi_exec_channel *src)
2230{
2231   dst->u[0] = src[0].u[0] >> src[1].u[0];
2232   dst->u[1] = src[0].u[1] >> src[1].u[1];
2233   dst->u[2] = src[0].u[2] >> src[1].u[2];
2234   dst->u[3] = src[0].u[3] >> src[1].u[3];
2235}
2236
2237static void
2238micro_uslt(union tgsi_exec_channel *dst,
2239           const union tgsi_exec_channel *src)
2240{
2241   dst->u[0] = src[0].u[0] < src[1].u[0] ? ~0 : 0;
2242   dst->u[1] = src[0].u[1] < src[1].u[1] ? ~0 : 0;
2243   dst->u[2] = src[0].u[2] < src[1].u[2] ? ~0 : 0;
2244   dst->u[3] = src[0].u[3] < src[1].u[3] ? ~0 : 0;
2245}
2246
2247static void
2248micro_usne(union tgsi_exec_channel *dst,
2249           const union tgsi_exec_channel *src)
2250{
2251   dst->u[0] = src[0].u[0] != src[1].u[0] ? ~0 : 0;
2252   dst->u[1] = src[0].u[1] != src[1].u[1] ? ~0 : 0;
2253   dst->u[2] = src[0].u[2] != src[1].u[2] ? ~0 : 0;
2254   dst->u[3] = src[0].u[3] != src[1].u[3] ? ~0 : 0;
2255}
2256
2257static void
2258exec_instruction(
2259   struct tgsi_exec_machine *mach,
2260   const struct tgsi_full_instruction *inst,
2261   int *pc )
2262{
2263   uint chan_index;
2264   union tgsi_exec_channel r[10];
2265   union tgsi_exec_channel d[8];
2266
2267   (*pc)++;
2268
2269   switch (inst->Instruction.Opcode) {
2270   case TGSI_OPCODE_ARL:
2271      exec_vector_unary(mach, inst, micro_arl, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_FLOAT);
2272      break;
2273
2274   case TGSI_OPCODE_MOV:
2275      exec_vector_unary(mach, inst, micro_mov, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_FLOAT);
2276      break;
2277
2278   case TGSI_OPCODE_LIT:
2279      if (IS_CHANNEL_ENABLED( *inst, CHAN_Y ) || IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
2280         FETCH( &r[0], 0, CHAN_X );
2281         if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
2282            micro_max(&d[CHAN_Y], &r[0], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C]);
2283         }
2284
2285         if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
2286            FETCH( &r[1], 0, CHAN_Y );
2287            micro_max( &r[1], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] );
2288
2289            FETCH( &r[2], 0, CHAN_W );
2290            micro_min( &r[2], &r[2], &mach->Temps[TEMP_128_I].xyzw[TEMP_128_C] );
2291            micro_max( &r[2], &r[2], &mach->Temps[TEMP_M128_I].xyzw[TEMP_M128_C] );
2292            micro_pow( &r[1], &r[1], &r[2] );
2293            micro_lt(&d[CHAN_Z], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &r[0], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C]);
2294         }
2295
2296         if (IS_CHANNEL_ENABLED(*inst, CHAN_Y)) {
2297            STORE(&d[CHAN_Y], 0, CHAN_Y);
2298         }
2299         if (IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2300            STORE(&d[CHAN_Z], 0, CHAN_Z);
2301         }
2302      }
2303      if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) {
2304         STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_X );
2305      }
2306      if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
2307         STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
2308      }
2309      break;
2310
2311   case TGSI_OPCODE_RCP:
2312      exec_scalar_unary(mach, inst, micro_rcp, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2313      break;
2314
2315   case TGSI_OPCODE_RSQ:
2316      exec_scalar_unary(mach, inst, micro_rsq, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2317      break;
2318
2319   case TGSI_OPCODE_EXP:
2320      FETCH( &r[0], 0, CHAN_X );
2321      micro_flr( &r[1], &r[0] );  /* r1 = floor(r0) */
2322      if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) {
2323         micro_exp2( &r[2], &r[1] );       /* r2 = 2 ^ r1 */
2324         STORE( &r[2], 0, CHAN_X );        /* store r2 */
2325      }
2326      if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
2327         micro_sub( &r[2], &r[0], &r[1] ); /* r2 = r0 - r1 */
2328         STORE( &r[2], 0, CHAN_Y );        /* store r2 */
2329      }
2330      if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
2331         micro_exp2( &r[2], &r[0] );       /* r2 = 2 ^ r0 */
2332         STORE( &r[2], 0, CHAN_Z );        /* store r2 */
2333      }
2334      if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
2335         STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
2336      }
2337      break;
2338
2339   case TGSI_OPCODE_LOG:
2340      FETCH( &r[0], 0, CHAN_X );
2341      micro_abs( &r[2], &r[0] );  /* r2 = abs(r0) */
2342      micro_lg2( &r[1], &r[2] );  /* r1 = lg2(r2) */
2343      micro_flr( &r[0], &r[1] );  /* r0 = floor(r1) */
2344      if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) {
2345         STORE( &r[0], 0, CHAN_X );
2346      }
2347      if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
2348         micro_exp2( &r[0], &r[0] );       /* r0 = 2 ^ r0 */
2349         micro_div( &r[0], &r[2], &r[0] ); /* r0 = r2 / r0 */
2350         STORE( &r[0], 0, CHAN_Y );
2351      }
2352      if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
2353         STORE( &r[1], 0, CHAN_Z );
2354      }
2355      if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
2356         STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
2357      }
2358      break;
2359
2360   case TGSI_OPCODE_MUL:
2361      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2362         FETCH(&r[0], 0, chan_index);
2363         FETCH(&r[1], 1, chan_index);
2364         micro_mul(&d[chan_index], &r[0], &r[1]);
2365      }
2366      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2367         STORE(&d[chan_index], 0, chan_index);
2368      }
2369      break;
2370
2371   case TGSI_OPCODE_ADD:
2372      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2373         FETCH( &r[0], 0, chan_index );
2374         FETCH( &r[1], 1, chan_index );
2375         micro_add(&d[chan_index], &r[0], &r[1]);
2376      }
2377      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2378         STORE(&d[chan_index], 0, chan_index);
2379      }
2380      break;
2381
2382   case TGSI_OPCODE_DP3:
2383   /* TGSI_OPCODE_DOT3 */
2384      FETCH( &r[0], 0, CHAN_X );
2385      FETCH( &r[1], 1, CHAN_X );
2386      micro_mul( &r[0], &r[0], &r[1] );
2387
2388      FETCH( &r[1], 0, CHAN_Y );
2389      FETCH( &r[2], 1, CHAN_Y );
2390      micro_mul( &r[1], &r[1], &r[2] );
2391      micro_add( &r[0], &r[0], &r[1] );
2392
2393      FETCH( &r[1], 0, CHAN_Z );
2394      FETCH( &r[2], 1, CHAN_Z );
2395      micro_mul( &r[1], &r[1], &r[2] );
2396      micro_add( &r[0], &r[0], &r[1] );
2397
2398      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2399         STORE( &r[0], 0, chan_index );
2400      }
2401      break;
2402
2403    case TGSI_OPCODE_DP4:
2404    /* TGSI_OPCODE_DOT4 */
2405       FETCH(&r[0], 0, CHAN_X);
2406       FETCH(&r[1], 1, CHAN_X);
2407
2408       micro_mul( &r[0], &r[0], &r[1] );
2409
2410       FETCH(&r[1], 0, CHAN_Y);
2411       FETCH(&r[2], 1, CHAN_Y);
2412
2413       micro_mul( &r[1], &r[1], &r[2] );
2414       micro_add( &r[0], &r[0], &r[1] );
2415
2416       FETCH(&r[1], 0, CHAN_Z);
2417       FETCH(&r[2], 1, CHAN_Z);
2418
2419       micro_mul( &r[1], &r[1], &r[2] );
2420       micro_add( &r[0], &r[0], &r[1] );
2421
2422       FETCH(&r[1], 0, CHAN_W);
2423       FETCH(&r[2], 1, CHAN_W);
2424
2425       micro_mul( &r[1], &r[1], &r[2] );
2426       micro_add( &r[0], &r[0], &r[1] );
2427
2428      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2429         STORE( &r[0], 0, chan_index );
2430      }
2431      break;
2432
2433   case TGSI_OPCODE_DST:
2434      if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
2435         FETCH( &r[0], 0, CHAN_Y );
2436         FETCH( &r[1], 1, CHAN_Y);
2437         micro_mul(&d[CHAN_Y], &r[0], &r[1]);
2438      }
2439      if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
2440         FETCH(&d[CHAN_Z], 0, CHAN_Z);
2441      }
2442      if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
2443         FETCH(&d[CHAN_W], 1, CHAN_W);
2444      }
2445
2446      if (IS_CHANNEL_ENABLED(*inst, CHAN_X)) {
2447         STORE(&mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_X);
2448      }
2449      if (IS_CHANNEL_ENABLED(*inst, CHAN_Y)) {
2450         STORE(&d[CHAN_Y], 0, CHAN_Y);
2451      }
2452      if (IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2453         STORE(&d[CHAN_Z], 0, CHAN_Z);
2454      }
2455      if (IS_CHANNEL_ENABLED(*inst, CHAN_W)) {
2456         STORE(&d[CHAN_W], 0, CHAN_W);
2457      }
2458      break;
2459
2460   case TGSI_OPCODE_MIN:
2461      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2462         FETCH(&r[0], 0, chan_index);
2463         FETCH(&r[1], 1, chan_index);
2464
2465         /* XXX use micro_min()?? */
2466         micro_lt(&d[chan_index], &r[0], &r[1], &r[0], &r[1]);
2467      }
2468      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2469         STORE(&d[chan_index], 0, chan_index);
2470      }
2471      break;
2472
2473   case TGSI_OPCODE_MAX:
2474      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2475         FETCH(&r[0], 0, chan_index);
2476         FETCH(&r[1], 1, chan_index);
2477
2478         /* XXX use micro_max()?? */
2479         micro_lt(&d[chan_index], &r[0], &r[1], &r[1], &r[0] );
2480      }
2481      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2482         STORE(&d[chan_index], 0, chan_index);
2483      }
2484      break;
2485
2486   case TGSI_OPCODE_SLT:
2487      exec_vector_binary(mach, inst, micro_slt, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2488      break;
2489
2490   case TGSI_OPCODE_SGE:
2491      exec_vector_binary(mach, inst, micro_sge, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2492      break;
2493
2494   case TGSI_OPCODE_MAD:
2495      exec_vector_trinary(mach, inst, micro_mad, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2496      break;
2497
2498   case TGSI_OPCODE_SUB:
2499      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2500         FETCH(&r[0], 0, chan_index);
2501         FETCH(&r[1], 1, chan_index);
2502         micro_sub(&d[chan_index], &r[0], &r[1]);
2503      }
2504      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2505         STORE(&d[chan_index], 0, chan_index);
2506      }
2507      break;
2508
2509   case TGSI_OPCODE_LRP:
2510      exec_vector_trinary(mach, inst, micro_lrp, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2511      break;
2512
2513   case TGSI_OPCODE_CND:
2514      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2515         FETCH(&r[0], 0, chan_index);
2516         FETCH(&r[1], 1, chan_index);
2517         FETCH(&r[2], 2, chan_index);
2518         micro_lt(&d[chan_index], &mach->Temps[TEMP_HALF_I].xyzw[TEMP_HALF_C], &r[2], &r[0], &r[1]);
2519      }
2520      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2521         STORE(&d[chan_index], 0, chan_index);
2522      }
2523      break;
2524
2525   case TGSI_OPCODE_DP2A:
2526      FETCH( &r[0], 0, CHAN_X );
2527      FETCH( &r[1], 1, CHAN_X );
2528      micro_mul( &r[0], &r[0], &r[1] );
2529
2530      FETCH( &r[1], 0, CHAN_Y );
2531      FETCH( &r[2], 1, CHAN_Y );
2532      micro_mul( &r[1], &r[1], &r[2] );
2533      micro_add( &r[0], &r[0], &r[1] );
2534
2535      FETCH( &r[2], 2, CHAN_X );
2536      micro_add( &r[0], &r[0], &r[2] );
2537
2538      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2539         STORE( &r[0], 0, chan_index );
2540      }
2541      break;
2542
2543   case TGSI_OPCODE_FRC:
2544      exec_vector_unary(mach, inst, micro_frc, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2545      break;
2546
2547   case TGSI_OPCODE_CLAMP:
2548      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2549         FETCH(&r[0], 0, chan_index);
2550         FETCH(&r[1], 1, chan_index);
2551         micro_max(&r[0], &r[0], &r[1]);
2552         FETCH(&r[1], 2, chan_index);
2553         micro_min(&d[chan_index], &r[0], &r[1]);
2554      }
2555      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2556         STORE(&d[chan_index], 0, chan_index);
2557      }
2558      break;
2559
2560   case TGSI_OPCODE_FLR:
2561      exec_vector_unary(mach, inst, micro_flr, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2562      break;
2563
2564   case TGSI_OPCODE_ROUND:
2565      exec_vector_unary(mach, inst, micro_rnd, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2566      break;
2567
2568   case TGSI_OPCODE_EX2:
2569      exec_scalar_unary(mach, inst, micro_exp2, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2570      break;
2571
2572   case TGSI_OPCODE_LG2:
2573      exec_scalar_unary(mach, inst, micro_lg2, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2574      break;
2575
2576   case TGSI_OPCODE_POW:
2577      FETCH(&r[0], 0, CHAN_X);
2578      FETCH(&r[1], 1, CHAN_X);
2579
2580      micro_pow( &r[0], &r[0], &r[1] );
2581
2582      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2583         STORE( &r[0], 0, chan_index );
2584      }
2585      break;
2586
2587   case TGSI_OPCODE_XPD:
2588      FETCH(&r[0], 0, CHAN_Y);
2589      FETCH(&r[1], 1, CHAN_Z);
2590
2591      micro_mul( &r[2], &r[0], &r[1] );
2592
2593      FETCH(&r[3], 0, CHAN_Z);
2594      FETCH(&r[4], 1, CHAN_Y);
2595
2596      micro_mul( &r[5], &r[3], &r[4] );
2597      micro_sub(&d[CHAN_X], &r[2], &r[5]);
2598
2599      FETCH(&r[2], 1, CHAN_X);
2600
2601      micro_mul( &r[3], &r[3], &r[2] );
2602
2603      FETCH(&r[5], 0, CHAN_X);
2604
2605      micro_mul( &r[1], &r[1], &r[5] );
2606      micro_sub(&d[CHAN_Y], &r[3], &r[1]);
2607
2608      micro_mul( &r[5], &r[5], &r[4] );
2609      micro_mul( &r[0], &r[0], &r[2] );
2610      micro_sub(&d[CHAN_Z], &r[5], &r[0]);
2611
2612      if (IS_CHANNEL_ENABLED(*inst, CHAN_X)) {
2613         STORE(&d[CHAN_X], 0, CHAN_X);
2614      }
2615      if (IS_CHANNEL_ENABLED(*inst, CHAN_Y)) {
2616         STORE(&d[CHAN_Y], 0, CHAN_Y);
2617      }
2618      if (IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2619         STORE(&d[CHAN_Z], 0, CHAN_Z);
2620      }
2621      if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
2622         STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
2623      }
2624      break;
2625
2626   case TGSI_OPCODE_ABS:
2627      exec_vector_unary(mach, inst, micro_abs, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2628      break;
2629
2630   case TGSI_OPCODE_RCC:
2631      FETCH(&r[0], 0, CHAN_X);
2632      micro_div(&r[0], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &r[0]);
2633      micro_float_clamp(&r[0], &r[0]);
2634      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2635         STORE(&r[0], 0, chan_index);
2636      }
2637      break;
2638
2639   case TGSI_OPCODE_DPH:
2640      FETCH(&r[0], 0, CHAN_X);
2641      FETCH(&r[1], 1, CHAN_X);
2642
2643      micro_mul( &r[0], &r[0], &r[1] );
2644
2645      FETCH(&r[1], 0, CHAN_Y);
2646      FETCH(&r[2], 1, CHAN_Y);
2647
2648      micro_mul( &r[1], &r[1], &r[2] );
2649      micro_add( &r[0], &r[0], &r[1] );
2650
2651      FETCH(&r[1], 0, CHAN_Z);
2652      FETCH(&r[2], 1, CHAN_Z);
2653
2654      micro_mul( &r[1], &r[1], &r[2] );
2655      micro_add( &r[0], &r[0], &r[1] );
2656
2657      FETCH(&r[1], 1, CHAN_W);
2658
2659      micro_add( &r[0], &r[0], &r[1] );
2660
2661      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2662         STORE( &r[0], 0, chan_index );
2663      }
2664      break;
2665
2666   case TGSI_OPCODE_COS:
2667      exec_scalar_unary(mach, inst, micro_cos, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2668      break;
2669
2670   case TGSI_OPCODE_DDX:
2671      exec_vector_unary(mach, inst, micro_ddx, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2672      break;
2673
2674   case TGSI_OPCODE_DDY:
2675      exec_vector_unary(mach, inst, micro_ddy, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2676      break;
2677
2678   case TGSI_OPCODE_KILP:
2679      exec_kilp (mach, inst);
2680      break;
2681
2682   case TGSI_OPCODE_KIL:
2683      exec_kil (mach, inst);
2684      break;
2685
2686   case TGSI_OPCODE_PK2H:
2687      assert (0);
2688      break;
2689
2690   case TGSI_OPCODE_PK2US:
2691      assert (0);
2692      break;
2693
2694   case TGSI_OPCODE_PK4B:
2695      assert (0);
2696      break;
2697
2698   case TGSI_OPCODE_PK4UB:
2699      assert (0);
2700      break;
2701
2702   case TGSI_OPCODE_RFL:
2703      if (IS_CHANNEL_ENABLED(*inst, CHAN_X) ||
2704          IS_CHANNEL_ENABLED(*inst, CHAN_Y) ||
2705          IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2706         /* r0 = dp3(src0, src0) */
2707         FETCH(&r[2], 0, CHAN_X);
2708         micro_mul(&r[0], &r[2], &r[2]);
2709         FETCH(&r[4], 0, CHAN_Y);
2710         micro_mul(&r[8], &r[4], &r[4]);
2711         micro_add(&r[0], &r[0], &r[8]);
2712         FETCH(&r[6], 0, CHAN_Z);
2713         micro_mul(&r[8], &r[6], &r[6]);
2714         micro_add(&r[0], &r[0], &r[8]);
2715
2716         /* r1 = dp3(src0, src1) */
2717         FETCH(&r[3], 1, CHAN_X);
2718         micro_mul(&r[1], &r[2], &r[3]);
2719         FETCH(&r[5], 1, CHAN_Y);
2720         micro_mul(&r[8], &r[4], &r[5]);
2721         micro_add(&r[1], &r[1], &r[8]);
2722         FETCH(&r[7], 1, CHAN_Z);
2723         micro_mul(&r[8], &r[6], &r[7]);
2724         micro_add(&r[1], &r[1], &r[8]);
2725
2726         /* r1 = 2 * r1 / r0 */
2727         micro_add(&r[1], &r[1], &r[1]);
2728         micro_div(&r[1], &r[1], &r[0]);
2729
2730         if (IS_CHANNEL_ENABLED(*inst, CHAN_X)) {
2731            micro_mul(&r[2], &r[2], &r[1]);
2732            micro_sub(&r[2], &r[2], &r[3]);
2733            STORE(&r[2], 0, CHAN_X);
2734         }
2735         if (IS_CHANNEL_ENABLED(*inst, CHAN_Y)) {
2736            micro_mul(&r[4], &r[4], &r[1]);
2737            micro_sub(&r[4], &r[4], &r[5]);
2738            STORE(&r[4], 0, CHAN_Y);
2739         }
2740         if (IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2741            micro_mul(&r[6], &r[6], &r[1]);
2742            micro_sub(&r[6], &r[6], &r[7]);
2743            STORE(&r[6], 0, CHAN_Z);
2744         }
2745      }
2746      if (IS_CHANNEL_ENABLED(*inst, CHAN_W)) {
2747         STORE(&mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W);
2748      }
2749      break;
2750
2751   case TGSI_OPCODE_SEQ:
2752      exec_vector_binary(mach, inst, micro_seq, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2753      break;
2754
2755   case TGSI_OPCODE_SFL:
2756      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2757         STORE(&mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], 0, chan_index);
2758      }
2759      break;
2760
2761   case TGSI_OPCODE_SGT:
2762      exec_vector_binary(mach, inst, micro_sgt, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2763      break;
2764
2765   case TGSI_OPCODE_SIN:
2766      exec_scalar_unary(mach, inst, micro_sin, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2767      break;
2768
2769   case TGSI_OPCODE_SLE:
2770      exec_vector_binary(mach, inst, micro_sle, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2771      break;
2772
2773   case TGSI_OPCODE_SNE:
2774      exec_vector_binary(mach, inst, micro_sne, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2775      break;
2776
2777   case TGSI_OPCODE_STR:
2778      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2779         STORE(&mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, chan_index);
2780      }
2781      break;
2782
2783   case TGSI_OPCODE_TEX:
2784      /* simple texture lookup */
2785      /* src[0] = texcoord */
2786      /* src[1] = sampler unit */
2787      exec_tex(mach, inst, TEX_MODIFIER_NONE);
2788      break;
2789
2790   case TGSI_OPCODE_TXB:
2791      /* Texture lookup with lod bias */
2792      /* src[0] = texcoord (src[0].w = LOD bias) */
2793      /* src[1] = sampler unit */
2794      exec_tex(mach, inst, TEX_MODIFIER_LOD_BIAS);
2795      break;
2796
2797   case TGSI_OPCODE_TXD:
2798      /* Texture lookup with explict partial derivatives */
2799      /* src[0] = texcoord */
2800      /* src[1] = d[strq]/dx */
2801      /* src[2] = d[strq]/dy */
2802      /* src[3] = sampler unit */
2803      exec_txd(mach, inst);
2804      break;
2805
2806   case TGSI_OPCODE_TXL:
2807      /* Texture lookup with explit LOD */
2808      /* src[0] = texcoord (src[0].w = LOD) */
2809      /* src[1] = sampler unit */
2810      exec_tex(mach, inst, TEX_MODIFIER_EXPLICIT_LOD);
2811      break;
2812
2813   case TGSI_OPCODE_TXP:
2814      /* Texture lookup with projection */
2815      /* src[0] = texcoord (src[0].w = projection) */
2816      /* src[1] = sampler unit */
2817      exec_tex(mach, inst, TEX_MODIFIER_PROJECTED);
2818      break;
2819
2820   case TGSI_OPCODE_UP2H:
2821      assert (0);
2822      break;
2823
2824   case TGSI_OPCODE_UP2US:
2825      assert (0);
2826      break;
2827
2828   case TGSI_OPCODE_UP4B:
2829      assert (0);
2830      break;
2831
2832   case TGSI_OPCODE_UP4UB:
2833      assert (0);
2834      break;
2835
2836   case TGSI_OPCODE_X2D:
2837      FETCH(&r[0], 1, CHAN_X);
2838      FETCH(&r[1], 1, CHAN_Y);
2839      if (IS_CHANNEL_ENABLED(*inst, CHAN_X) ||
2840          IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2841         FETCH(&r[2], 2, CHAN_X);
2842         micro_mul(&r[2], &r[2], &r[0]);
2843         FETCH(&r[3], 2, CHAN_Y);
2844         micro_mul(&r[3], &r[3], &r[1]);
2845         micro_add(&r[2], &r[2], &r[3]);
2846         FETCH(&r[3], 0, CHAN_X);
2847         micro_add(&d[CHAN_X], &r[2], &r[3]);
2848
2849      }
2850      if (IS_CHANNEL_ENABLED(*inst, CHAN_Y) ||
2851          IS_CHANNEL_ENABLED(*inst, CHAN_W)) {
2852         FETCH(&r[2], 2, CHAN_Z);
2853         micro_mul(&r[2], &r[2], &r[0]);
2854         FETCH(&r[3], 2, CHAN_W);
2855         micro_mul(&r[3], &r[3], &r[1]);
2856         micro_add(&r[2], &r[2], &r[3]);
2857         FETCH(&r[3], 0, CHAN_Y);
2858         micro_add(&d[CHAN_Y], &r[2], &r[3]);
2859
2860      }
2861      if (IS_CHANNEL_ENABLED(*inst, CHAN_X)) {
2862         STORE(&d[CHAN_X], 0, CHAN_X);
2863      }
2864      if (IS_CHANNEL_ENABLED(*inst, CHAN_Y)) {
2865         STORE(&d[CHAN_Y], 0, CHAN_Y);
2866      }
2867      if (IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2868         STORE(&d[CHAN_X], 0, CHAN_Z);
2869      }
2870      if (IS_CHANNEL_ENABLED(*inst, CHAN_W)) {
2871         STORE(&d[CHAN_Y], 0, CHAN_W);
2872      }
2873      break;
2874
2875   case TGSI_OPCODE_ARA:
2876      assert (0);
2877      break;
2878
2879   case TGSI_OPCODE_ARR:
2880      exec_vector_unary(mach, inst, micro_arr, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_FLOAT);
2881      break;
2882
2883   case TGSI_OPCODE_BRA:
2884      assert (0);
2885      break;
2886
2887   case TGSI_OPCODE_CAL:
2888      /* skip the call if no execution channels are enabled */
2889      if (mach->ExecMask) {
2890         /* do the call */
2891
2892         /* First, record the depths of the execution stacks.
2893          * This is important for deeply nested/looped return statements.
2894          * We have to unwind the stacks by the correct amount.  For a
2895          * real code generator, we could determine the number of entries
2896          * to pop off each stack with simple static analysis and avoid
2897          * implementing this data structure at run time.
2898          */
2899         mach->CallStack[mach->CallStackTop].CondStackTop = mach->CondStackTop;
2900         mach->CallStack[mach->CallStackTop].LoopStackTop = mach->LoopStackTop;
2901         mach->CallStack[mach->CallStackTop].ContStackTop = mach->ContStackTop;
2902         mach->CallStack[mach->CallStackTop].SwitchStackTop = mach->SwitchStackTop;
2903         mach->CallStack[mach->CallStackTop].BreakStackTop = mach->BreakStackTop;
2904         /* note that PC was already incremented above */
2905         mach->CallStack[mach->CallStackTop].ReturnAddr = *pc;
2906
2907         mach->CallStackTop++;
2908
2909         /* Second, push the Cond, Loop, Cont, Func stacks */
2910         assert(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING);
2911         assert(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
2912         assert(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
2913         assert(mach->SwitchStackTop < TGSI_EXEC_MAX_SWITCH_NESTING);
2914         assert(mach->BreakStackTop < TGSI_EXEC_MAX_BREAK_STACK);
2915         assert(mach->FuncStackTop < TGSI_EXEC_MAX_CALL_NESTING);
2916
2917         mach->CondStack[mach->CondStackTop++] = mach->CondMask;
2918         mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask;
2919         mach->ContStack[mach->ContStackTop++] = mach->ContMask;
2920         mach->SwitchStack[mach->SwitchStackTop++] = mach->Switch;
2921         mach->BreakStack[mach->BreakStackTop++] = mach->BreakType;
2922         mach->FuncStack[mach->FuncStackTop++] = mach->FuncMask;
2923
2924         /* Finally, jump to the subroutine */
2925         *pc = inst->Label.Label;
2926      }
2927      break;
2928
2929   case TGSI_OPCODE_RET:
2930      mach->FuncMask &= ~mach->ExecMask;
2931      UPDATE_EXEC_MASK(mach);
2932
2933      if (mach->FuncMask == 0x0) {
2934         /* really return now (otherwise, keep executing */
2935
2936         if (mach->CallStackTop == 0) {
2937            /* returning from main() */
2938            *pc = -1;
2939            return;
2940         }
2941
2942         assert(mach->CallStackTop > 0);
2943         mach->CallStackTop--;
2944
2945         mach->CondStackTop = mach->CallStack[mach->CallStackTop].CondStackTop;
2946         mach->CondMask = mach->CondStack[mach->CondStackTop];
2947
2948         mach->LoopStackTop = mach->CallStack[mach->CallStackTop].LoopStackTop;
2949         mach->LoopMask = mach->LoopStack[mach->LoopStackTop];
2950
2951         mach->ContStackTop = mach->CallStack[mach->CallStackTop].ContStackTop;
2952         mach->ContMask = mach->ContStack[mach->ContStackTop];
2953
2954         mach->SwitchStackTop = mach->CallStack[mach->CallStackTop].SwitchStackTop;
2955         mach->Switch = mach->SwitchStack[mach->SwitchStackTop];
2956
2957         mach->BreakStackTop = mach->CallStack[mach->CallStackTop].BreakStackTop;
2958         mach->BreakType = mach->BreakStack[mach->BreakStackTop];
2959
2960         assert(mach->FuncStackTop > 0);
2961         mach->FuncMask = mach->FuncStack[--mach->FuncStackTop];
2962
2963         *pc = mach->CallStack[mach->CallStackTop].ReturnAddr;
2964
2965         UPDATE_EXEC_MASK(mach);
2966      }
2967      break;
2968
2969   case TGSI_OPCODE_SSG:
2970      exec_vector_unary(mach, inst, micro_sgn, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2971      break;
2972
2973   case TGSI_OPCODE_CMP:
2974      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2975         FETCH(&r[0], 0, chan_index);
2976         FETCH(&r[1], 1, chan_index);
2977         FETCH(&r[2], 2, chan_index);
2978         micro_lt(&d[chan_index], &r[0], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &r[1], &r[2]);
2979      }
2980      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2981         STORE(&d[chan_index], 0, chan_index);
2982      }
2983      break;
2984
2985   case TGSI_OPCODE_SCS:
2986      if( IS_CHANNEL_ENABLED( *inst, CHAN_X ) || IS_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
2987         FETCH( &r[0], 0, CHAN_X );
2988         if (IS_CHANNEL_ENABLED(*inst, CHAN_X)) {
2989            micro_cos(&r[1], &r[0]);
2990            STORE(&r[1], 0, CHAN_X);
2991         }
2992         if (IS_CHANNEL_ENABLED(*inst, CHAN_Y)) {
2993            micro_sin(&r[1], &r[0]);
2994            STORE(&r[1], 0, CHAN_Y);
2995         }
2996      }
2997      if( IS_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
2998         STORE( &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], 0, CHAN_Z );
2999      }
3000      if( IS_CHANNEL_ENABLED( *inst, CHAN_W ) ) {
3001         STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
3002      }
3003      break;
3004
3005   case TGSI_OPCODE_NRM:
3006      /* 3-component vector normalize */
3007      if(IS_CHANNEL_ENABLED(*inst, CHAN_X) ||
3008         IS_CHANNEL_ENABLED(*inst, CHAN_Y) ||
3009         IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
3010         /* r3 = sqrt(dp3(src0, src0)) */
3011         FETCH(&r[0], 0, CHAN_X);
3012         micro_mul(&r[3], &r[0], &r[0]);
3013         FETCH(&r[1], 0, CHAN_Y);
3014         micro_mul(&r[4], &r[1], &r[1]);
3015         micro_add(&r[3], &r[3], &r[4]);
3016         FETCH(&r[2], 0, CHAN_Z);
3017         micro_mul(&r[4], &r[2], &r[2]);
3018         micro_add(&r[3], &r[3], &r[4]);
3019         micro_sqrt(&r[3], &r[3]);
3020
3021         if (IS_CHANNEL_ENABLED(*inst, CHAN_X)) {
3022            micro_div(&r[0], &r[0], &r[3]);
3023            STORE(&r[0], 0, CHAN_X);
3024         }
3025         if (IS_CHANNEL_ENABLED(*inst, CHAN_Y)) {
3026            micro_div(&r[1], &r[1], &r[3]);
3027            STORE(&r[1], 0, CHAN_Y);
3028         }
3029         if (IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
3030            micro_div(&r[2], &r[2], &r[3]);
3031            STORE(&r[2], 0, CHAN_Z);
3032         }
3033      }
3034      if (IS_CHANNEL_ENABLED(*inst, CHAN_W)) {
3035         STORE(&mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W);
3036      }
3037      break;
3038
3039   case TGSI_OPCODE_NRM4:
3040      /* 4-component vector normalize */
3041      {
3042         union tgsi_exec_channel tmp, dot;
3043
3044         /* tmp = dp4(src0, src0): */
3045         FETCH( &r[0], 0, CHAN_X );
3046         micro_mul( &tmp, &r[0], &r[0] );
3047
3048         FETCH( &r[1], 0, CHAN_Y );
3049         micro_mul( &dot, &r[1], &r[1] );
3050         micro_add( &tmp, &tmp, &dot );
3051
3052         FETCH( &r[2], 0, CHAN_Z );
3053         micro_mul( &dot, &r[2], &r[2] );
3054         micro_add( &tmp, &tmp, &dot );
3055
3056         FETCH( &r[3], 0, CHAN_W );
3057         micro_mul( &dot, &r[3], &r[3] );
3058         micro_add( &tmp, &tmp, &dot );
3059
3060         /* tmp = 1 / sqrt(tmp) */
3061         micro_sqrt( &tmp, &tmp );
3062         micro_div( &tmp, &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &tmp );
3063
3064         FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
3065            /* chan = chan * tmp */
3066            micro_mul( &r[chan_index], &tmp, &r[chan_index] );
3067            STORE( &r[chan_index], 0, chan_index );
3068         }
3069      }
3070      break;
3071
3072   case TGSI_OPCODE_DIV:
3073      assert( 0 );
3074      break;
3075
3076   case TGSI_OPCODE_DP2:
3077      FETCH( &r[0], 0, CHAN_X );
3078      FETCH( &r[1], 1, CHAN_X );
3079      micro_mul( &r[0], &r[0], &r[1] );
3080
3081      FETCH( &r[1], 0, CHAN_Y );
3082      FETCH( &r[2], 1, CHAN_Y );
3083      micro_mul( &r[1], &r[1], &r[2] );
3084      micro_add( &r[0], &r[0], &r[1] );
3085
3086      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
3087         STORE( &r[0], 0, chan_index );
3088      }
3089      break;
3090
3091   case TGSI_OPCODE_IF:
3092      /* push CondMask */
3093      assert(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING);
3094      mach->CondStack[mach->CondStackTop++] = mach->CondMask;
3095      FETCH( &r[0], 0, CHAN_X );
3096      /* update CondMask */
3097      if( ! r[0].u[0] ) {
3098         mach->CondMask &= ~0x1;
3099      }
3100      if( ! r[0].u[1] ) {
3101         mach->CondMask &= ~0x2;
3102      }
3103      if( ! r[0].u[2] ) {
3104         mach->CondMask &= ~0x4;
3105      }
3106      if( ! r[0].u[3] ) {
3107         mach->CondMask &= ~0x8;
3108      }
3109      UPDATE_EXEC_MASK(mach);
3110      /* Todo: If CondMask==0, jump to ELSE */
3111      break;
3112
3113   case TGSI_OPCODE_ELSE:
3114      /* invert CondMask wrt previous mask */
3115      {
3116         uint prevMask;
3117         assert(mach->CondStackTop > 0);
3118         prevMask = mach->CondStack[mach->CondStackTop - 1];
3119         mach->CondMask = ~mach->CondMask & prevMask;
3120         UPDATE_EXEC_MASK(mach);
3121         /* Todo: If CondMask==0, jump to ENDIF */
3122      }
3123      break;
3124
3125   case TGSI_OPCODE_ENDIF:
3126      /* pop CondMask */
3127      assert(mach->CondStackTop > 0);
3128      mach->CondMask = mach->CondStack[--mach->CondStackTop];
3129      UPDATE_EXEC_MASK(mach);
3130      break;
3131
3132   case TGSI_OPCODE_END:
3133      /* halt execution */
3134      *pc = -1;
3135      break;
3136
3137   case TGSI_OPCODE_REP:
3138      assert (0);
3139      break;
3140
3141   case TGSI_OPCODE_ENDREP:
3142       assert (0);
3143       break;
3144
3145   case TGSI_OPCODE_PUSHA:
3146      assert (0);
3147      break;
3148
3149   case TGSI_OPCODE_POPA:
3150      assert (0);
3151      break;
3152
3153   case TGSI_OPCODE_CEIL:
3154      exec_vector_unary(mach, inst, micro_ceil, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3155      break;
3156
3157   case TGSI_OPCODE_I2F:
3158      exec_vector_unary(mach, inst, micro_i2f, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_INT);
3159      break;
3160
3161   case TGSI_OPCODE_NOT:
3162      exec_vector_unary(mach, inst, micro_not, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3163      break;
3164
3165   case TGSI_OPCODE_TRUNC:
3166      exec_vector_unary(mach, inst, micro_trunc, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3167      break;
3168
3169   case TGSI_OPCODE_SHL:
3170      exec_vector_binary(mach, inst, micro_shl, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3171      break;
3172
3173   case TGSI_OPCODE_AND:
3174      exec_vector_binary(mach, inst, micro_and, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3175      break;
3176
3177   case TGSI_OPCODE_OR:
3178      exec_vector_binary(mach, inst, micro_or, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3179      break;
3180
3181   case TGSI_OPCODE_MOD:
3182      assert (0);
3183      break;
3184
3185   case TGSI_OPCODE_XOR:
3186      exec_vector_binary(mach, inst, micro_xor, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3187      break;
3188
3189   case TGSI_OPCODE_SAD:
3190      assert (0);
3191      break;
3192
3193   case TGSI_OPCODE_TXF:
3194      assert (0);
3195      break;
3196
3197   case TGSI_OPCODE_TXQ:
3198      assert (0);
3199      break;
3200
3201   case TGSI_OPCODE_EMIT:
3202      emit_vertex(mach);
3203      break;
3204
3205   case TGSI_OPCODE_ENDPRIM:
3206      emit_primitive(mach);
3207      break;
3208
3209   case TGSI_OPCODE_BGNFOR:
3210      assert(mach->LoopCounterStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
3211      for (chan_index = 0; chan_index < 3; chan_index++) {
3212         FETCH( &mach->LoopCounterStack[mach->LoopCounterStackTop].xyzw[chan_index], 0, chan_index );
3213      }
3214      ++mach->LoopCounterStackTop;
3215      STORE(&mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_X], 0, CHAN_X);
3216      /* update LoopMask */
3217      if (mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y].f[0] <= 0.0f) {
3218         mach->LoopMask &= ~0x1;
3219      }
3220      if (mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y].f[1] <= 0.0f) {
3221         mach->LoopMask &= ~0x2;
3222      }
3223      if (mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y].f[2] <= 0.0f) {
3224         mach->LoopMask &= ~0x4;
3225      }
3226      if (mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y].f[3] <= 0.0f) {
3227         mach->LoopMask &= ~0x8;
3228      }
3229      /* TODO: if mach->LoopMask == 0, jump to end of loop */
3230      UPDATE_EXEC_MASK(mach);
3231      /* fall-through (for now) */
3232   case TGSI_OPCODE_BGNLOOP:
3233      /* push LoopMask and ContMasks */
3234      assert(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
3235      assert(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
3236      assert(mach->LoopLabelStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
3237      assert(mach->BreakStackTop < TGSI_EXEC_MAX_BREAK_STACK);
3238
3239      mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask;
3240      mach->ContStack[mach->ContStackTop++] = mach->ContMask;
3241      mach->LoopLabelStack[mach->LoopLabelStackTop++] = *pc - 1;
3242      mach->BreakStack[mach->BreakStackTop++] = mach->BreakType;
3243      mach->BreakType = TGSI_EXEC_BREAK_INSIDE_LOOP;
3244      break;
3245
3246   case TGSI_OPCODE_ENDFOR:
3247      assert(mach->LoopCounterStackTop > 0);
3248      micro_sub(&mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y],
3249                &mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y],
3250                &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C]);
3251      /* update LoopMask */
3252      if (mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y].f[0] <= 0.0f) {
3253         mach->LoopMask &= ~0x1;
3254      }
3255      if (mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y].f[1] <= 0.0f) {
3256         mach->LoopMask &= ~0x2;
3257      }
3258      if (mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y].f[2] <= 0.0f) {
3259         mach->LoopMask &= ~0x4;
3260      }
3261      if (mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y].f[3] <= 0.0f) {
3262         mach->LoopMask &= ~0x8;
3263      }
3264      micro_add(&mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_X],
3265                &mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_X],
3266                &mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Z]);
3267      assert(mach->LoopLabelStackTop > 0);
3268      inst = mach->Instructions + mach->LoopLabelStack[mach->LoopLabelStackTop - 1];
3269      STORE(&mach->LoopCounterStack[mach->LoopCounterStackTop].xyzw[CHAN_X], 0, CHAN_X);
3270      /* Restore ContMask, but don't pop */
3271      assert(mach->ContStackTop > 0);
3272      mach->ContMask = mach->ContStack[mach->ContStackTop - 1];
3273      UPDATE_EXEC_MASK(mach);
3274      if (mach->ExecMask) {
3275         /* repeat loop: jump to instruction just past BGNLOOP */
3276         assert(mach->LoopLabelStackTop > 0);
3277         *pc = mach->LoopLabelStack[mach->LoopLabelStackTop - 1] + 1;
3278      }
3279      else {
3280         /* exit loop: pop LoopMask */
3281         assert(mach->LoopStackTop > 0);
3282         mach->LoopMask = mach->LoopStack[--mach->LoopStackTop];
3283         /* pop ContMask */
3284         assert(mach->ContStackTop > 0);
3285         mach->ContMask = mach->ContStack[--mach->ContStackTop];
3286         assert(mach->LoopLabelStackTop > 0);
3287         --mach->LoopLabelStackTop;
3288         assert(mach->LoopCounterStackTop > 0);
3289         --mach->LoopCounterStackTop;
3290
3291         mach->BreakType = mach->BreakStack[--mach->BreakStackTop];
3292      }
3293      UPDATE_EXEC_MASK(mach);
3294      break;
3295
3296   case TGSI_OPCODE_ENDLOOP:
3297      /* Restore ContMask, but don't pop */
3298      assert(mach->ContStackTop > 0);
3299      mach->ContMask = mach->ContStack[mach->ContStackTop - 1];
3300      UPDATE_EXEC_MASK(mach);
3301      if (mach->ExecMask) {
3302         /* repeat loop: jump to instruction just past BGNLOOP */
3303         assert(mach->LoopLabelStackTop > 0);
3304         *pc = mach->LoopLabelStack[mach->LoopLabelStackTop - 1] + 1;
3305      }
3306      else {
3307         /* exit loop: pop LoopMask */
3308         assert(mach->LoopStackTop > 0);
3309         mach->LoopMask = mach->LoopStack[--mach->LoopStackTop];
3310         /* pop ContMask */
3311         assert(mach->ContStackTop > 0);
3312         mach->ContMask = mach->ContStack[--mach->ContStackTop];
3313         assert(mach->LoopLabelStackTop > 0);
3314         --mach->LoopLabelStackTop;
3315
3316         mach->BreakType = mach->BreakStack[--mach->BreakStackTop];
3317      }
3318      UPDATE_EXEC_MASK(mach);
3319      break;
3320
3321   case TGSI_OPCODE_BRK:
3322      exec_break(mach);
3323      break;
3324
3325   case TGSI_OPCODE_CONT:
3326      /* turn off cont channels for each enabled exec channel */
3327      mach->ContMask &= ~mach->ExecMask;
3328      /* Todo: if mach->LoopMask == 0, jump to end of loop */
3329      UPDATE_EXEC_MASK(mach);
3330      break;
3331
3332   case TGSI_OPCODE_BGNSUB:
3333      /* no-op */
3334      break;
3335
3336   case TGSI_OPCODE_ENDSUB:
3337      /*
3338       * XXX: This really should be a no-op. We should never reach this opcode.
3339       */
3340
3341      assert(mach->CallStackTop > 0);
3342      mach->CallStackTop--;
3343
3344      mach->CondStackTop = mach->CallStack[mach->CallStackTop].CondStackTop;
3345      mach->CondMask = mach->CondStack[mach->CondStackTop];
3346
3347      mach->LoopStackTop = mach->CallStack[mach->CallStackTop].LoopStackTop;
3348      mach->LoopMask = mach->LoopStack[mach->LoopStackTop];
3349
3350      mach->ContStackTop = mach->CallStack[mach->CallStackTop].ContStackTop;
3351      mach->ContMask = mach->ContStack[mach->ContStackTop];
3352
3353      mach->SwitchStackTop = mach->CallStack[mach->CallStackTop].SwitchStackTop;
3354      mach->Switch = mach->SwitchStack[mach->SwitchStackTop];
3355
3356      mach->BreakStackTop = mach->CallStack[mach->CallStackTop].BreakStackTop;
3357      mach->BreakType = mach->BreakStack[mach->BreakStackTop];
3358
3359      assert(mach->FuncStackTop > 0);
3360      mach->FuncMask = mach->FuncStack[--mach->FuncStackTop];
3361
3362      *pc = mach->CallStack[mach->CallStackTop].ReturnAddr;
3363
3364      UPDATE_EXEC_MASK(mach);
3365      break;
3366
3367   case TGSI_OPCODE_NOP:
3368      break;
3369
3370   case TGSI_OPCODE_BREAKC:
3371      FETCH(&r[0], 0, CHAN_X);
3372      /* update CondMask */
3373      if (r[0].u[0] && (mach->ExecMask & 0x1)) {
3374         mach->LoopMask &= ~0x1;
3375      }
3376      if (r[0].u[1] && (mach->ExecMask & 0x2)) {
3377         mach->LoopMask &= ~0x2;
3378      }
3379      if (r[0].u[2] && (mach->ExecMask & 0x4)) {
3380         mach->LoopMask &= ~0x4;
3381      }
3382      if (r[0].u[3] && (mach->ExecMask & 0x8)) {
3383         mach->LoopMask &= ~0x8;
3384      }
3385      /* Todo: if mach->LoopMask == 0, jump to end of loop */
3386      UPDATE_EXEC_MASK(mach);
3387      break;
3388
3389   case TGSI_OPCODE_F2I:
3390      exec_vector_unary(mach, inst, micro_f2i, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_FLOAT);
3391      break;
3392
3393   case TGSI_OPCODE_IDIV:
3394      exec_vector_binary(mach, inst, micro_idiv, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
3395      break;
3396
3397   case TGSI_OPCODE_IMAX:
3398      exec_vector_binary(mach, inst, micro_imax, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
3399      break;
3400
3401   case TGSI_OPCODE_IMIN:
3402      exec_vector_binary(mach, inst, micro_imin, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
3403      break;
3404
3405   case TGSI_OPCODE_INEG:
3406      exec_vector_unary(mach, inst, micro_ineg, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
3407      break;
3408
3409   case TGSI_OPCODE_ISGE:
3410      exec_vector_binary(mach, inst, micro_isge, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
3411      break;
3412
3413   case TGSI_OPCODE_ISHR:
3414      exec_vector_binary(mach, inst, micro_ishr, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
3415      break;
3416
3417   case TGSI_OPCODE_ISLT:
3418      exec_vector_binary(mach, inst, micro_islt, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
3419      break;
3420
3421   case TGSI_OPCODE_F2U:
3422      exec_vector_unary(mach, inst, micro_f2u, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_FLOAT);
3423      break;
3424
3425   case TGSI_OPCODE_U2F:
3426      exec_vector_unary(mach, inst, micro_u2f, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_UINT);
3427      break;
3428
3429   case TGSI_OPCODE_UADD:
3430      exec_vector_binary(mach, inst, micro_uadd, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3431      break;
3432
3433   case TGSI_OPCODE_UDIV:
3434      exec_vector_binary(mach, inst, micro_udiv, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3435      break;
3436
3437   case TGSI_OPCODE_UMAD:
3438      exec_vector_trinary(mach, inst, micro_umad, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3439      break;
3440
3441   case TGSI_OPCODE_UMAX:
3442      exec_vector_binary(mach, inst, micro_umax, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3443      break;
3444
3445   case TGSI_OPCODE_UMIN:
3446      exec_vector_binary(mach, inst, micro_umin, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3447      break;
3448
3449   case TGSI_OPCODE_UMOD:
3450      exec_vector_binary(mach, inst, micro_umod, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3451      break;
3452
3453   case TGSI_OPCODE_UMUL:
3454      exec_vector_binary(mach, inst, micro_umul, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3455      break;
3456
3457   case TGSI_OPCODE_USEQ:
3458      exec_vector_binary(mach, inst, micro_useq, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3459      break;
3460
3461   case TGSI_OPCODE_USGE:
3462      exec_vector_binary(mach, inst, micro_usge, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3463      break;
3464
3465   case TGSI_OPCODE_USHR:
3466      exec_vector_binary(mach, inst, micro_ushr, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3467      break;
3468
3469   case TGSI_OPCODE_USLT:
3470      exec_vector_binary(mach, inst, micro_uslt, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3471      break;
3472
3473   case TGSI_OPCODE_USNE:
3474      exec_vector_binary(mach, inst, micro_usne, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3475      break;
3476
3477   case TGSI_OPCODE_SWITCH:
3478      exec_switch(mach, inst);
3479      break;
3480
3481   case TGSI_OPCODE_CASE:
3482      exec_case(mach, inst);
3483      break;
3484
3485   case TGSI_OPCODE_DEFAULT:
3486      exec_default(mach);
3487      break;
3488
3489   case TGSI_OPCODE_ENDSWITCH:
3490      exec_endswitch(mach);
3491      break;
3492
3493   default:
3494      assert( 0 );
3495   }
3496}
3497
3498
3499#define DEBUG_EXECUTION 0
3500
3501
3502/**
3503 * Run TGSI interpreter.
3504 * \return bitmask of "alive" quad components
3505 */
3506uint
3507tgsi_exec_machine_run( struct tgsi_exec_machine *mach )
3508{
3509   uint i;
3510   int pc = 0;
3511
3512   mach->CondMask = 0xf;
3513   mach->LoopMask = 0xf;
3514   mach->ContMask = 0xf;
3515   mach->FuncMask = 0xf;
3516   mach->ExecMask = 0xf;
3517
3518   mach->Switch.mask = 0xf;
3519
3520   assert(mach->CondStackTop == 0);
3521   assert(mach->LoopStackTop == 0);
3522   assert(mach->ContStackTop == 0);
3523   assert(mach->SwitchStackTop == 0);
3524   assert(mach->BreakStackTop == 0);
3525   assert(mach->CallStackTop == 0);
3526
3527   mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] = 0;
3528   mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] = 0;
3529
3530   if( mach->Processor == TGSI_PROCESSOR_GEOMETRY ) {
3531      mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0] = 0;
3532      mach->Primitives[0] = 0;
3533   }
3534
3535   for (i = 0; i < QUAD_SIZE; i++) {
3536      mach->Temps[TEMP_CC_I].xyzw[TEMP_CC_C].u[i] =
3537         (TGSI_EXEC_CC_EQ << TGSI_EXEC_CC_X_SHIFT) |
3538         (TGSI_EXEC_CC_EQ << TGSI_EXEC_CC_Y_SHIFT) |
3539         (TGSI_EXEC_CC_EQ << TGSI_EXEC_CC_Z_SHIFT) |
3540         (TGSI_EXEC_CC_EQ << TGSI_EXEC_CC_W_SHIFT);
3541   }
3542
3543   /* execute declarations (interpolants) */
3544   for (i = 0; i < mach->NumDeclarations; i++) {
3545      exec_declaration( mach, mach->Declarations+i );
3546   }
3547
3548   {
3549#if DEBUG_EXECUTION
3550      struct tgsi_exec_vector temps[TGSI_EXEC_NUM_TEMPS + TGSI_EXEC_NUM_TEMP_EXTRAS];
3551      struct tgsi_exec_vector outputs[PIPE_MAX_ATTRIBS];
3552      uint inst = 1;
3553
3554      memcpy(temps, mach->Temps, sizeof(temps));
3555      memcpy(outputs, mach->Outputs, sizeof(outputs));
3556#endif
3557
3558      /* execute instructions, until pc is set to -1 */
3559      while (pc != -1) {
3560
3561#if DEBUG_EXECUTION
3562         uint i;
3563
3564         tgsi_dump_instruction(&mach->Instructions[pc], inst++);
3565#endif
3566
3567         assert(pc < (int) mach->NumInstructions);
3568         exec_instruction(mach, mach->Instructions + pc, &pc);
3569
3570#if DEBUG_EXECUTION
3571         for (i = 0; i < TGSI_EXEC_NUM_TEMPS + TGSI_EXEC_NUM_TEMP_EXTRAS; i++) {
3572            if (memcmp(&temps[i], &mach->Temps[i], sizeof(temps[i]))) {
3573               uint j;
3574
3575               memcpy(&temps[i], &mach->Temps[i], sizeof(temps[i]));
3576               debug_printf("TEMP[%2u] = ", i);
3577               for (j = 0; j < 4; j++) {
3578                  if (j > 0) {
3579                     debug_printf("           ");
3580                  }
3581                  debug_printf("(%6f %u, %6f %u, %6f %u, %6f %u)\n",
3582                               temps[i].xyzw[0].f[j], temps[i].xyzw[0].u[j],
3583                               temps[i].xyzw[1].f[j], temps[i].xyzw[1].u[j],
3584                               temps[i].xyzw[2].f[j], temps[i].xyzw[2].u[j],
3585                               temps[i].xyzw[3].f[j], temps[i].xyzw[3].u[j]);
3586               }
3587            }
3588         }
3589         for (i = 0; i < PIPE_MAX_ATTRIBS; i++) {
3590            if (memcmp(&outputs[i], &mach->Outputs[i], sizeof(outputs[i]))) {
3591               uint j;
3592
3593               memcpy(&outputs[i], &mach->Outputs[i], sizeof(outputs[i]));
3594               debug_printf("OUT[%2u] =  ", i);
3595               for (j = 0; j < 4; j++) {
3596                  if (j > 0) {
3597                     debug_printf("           ");
3598                  }
3599                  debug_printf("(%6f %u, %6f %u, %6f %u, %6f %u)\n",
3600                               outputs[i].xyzw[0].f[j], outputs[i].xyzw[0].u[j],
3601                               outputs[i].xyzw[1].f[j], outputs[i].xyzw[1].u[j],
3602                               outputs[i].xyzw[2].f[j], outputs[i].xyzw[2].u[j],
3603                               outputs[i].xyzw[3].f[j], outputs[i].xyzw[3].u[j]);
3604               }
3605            }
3606         }
3607#endif
3608      }
3609   }
3610
3611#if 0
3612   /* we scale from floats in [0,1] to Zbuffer ints in sp_quad_depth_test.c */
3613   if (mach->Processor == TGSI_PROCESSOR_FRAGMENT) {
3614      /*
3615       * Scale back depth component.
3616       */
3617      for (i = 0; i < 4; i++)
3618         mach->Outputs[0].xyzw[2].f[i] *= ctx->DrawBuffer->_DepthMaxF;
3619   }
3620#endif
3621
3622   assert(mach->CondStackTop == 0);
3623   assert(mach->LoopStackTop == 0);
3624   assert(mach->ContStackTop == 0);
3625   assert(mach->SwitchStackTop == 0);
3626   assert(mach->BreakStackTop == 0);
3627   assert(mach->CallStackTop == 0);
3628
3629   return ~mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
3630}
3631