tgsi_exec.c revision 7c5f255201f42303188137f56ea8acc030444f0e
1/**************************************************************************
2 *
3 * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas.
4 * All Rights Reserved.
5 * Copyright 2009-2010 VMware, Inc.  All rights Reserved.
6 *
7 * Permission is hereby granted, free of charge, to any person obtaining a
8 * copy of this software and associated documentation files (the
9 * "Software"), to deal in the Software without restriction, including
10 * without limitation the rights to use, copy, modify, merge, publish,
11 * distribute, sub license, and/or sell copies of the Software, and to
12 * permit persons to whom the Software is furnished to do so, subject to
13 * the following conditions:
14 *
15 * The above copyright notice and this permission notice (including the
16 * next paragraph) shall be included in all copies or substantial portions
17 * of the Software.
18 *
19 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
20 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
21 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
22 * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
23 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
24 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
25 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 *
27 **************************************************************************/
28
29/**
30 * TGSI interpreter/executor.
31 *
32 * Flow control information:
33 *
34 * Since we operate on 'quads' (4 pixels or 4 vertices in parallel)
35 * flow control statements (IF/ELSE/ENDIF, LOOP/ENDLOOP) require special
36 * care since a condition may be true for some quad components but false
37 * for other components.
38 *
39 * We basically execute all statements (even if they're in the part of
40 * an IF/ELSE clause that's "not taken") and use a special mask to
41 * control writing to destination registers.  This is the ExecMask.
42 * See store_dest().
43 *
 * The ExecMask is computed from several other masks (CondMask, LoopMask,
 * ContMask, Switch.mask and FuncMask; see UPDATE_EXEC_MASK).  The first
 * three are controlled by the flow control instructions (namely:
 * IF/ELSE/ENDIF, LOOP/ENDLOOP and CONT).
47 *
48 *
49 * Authors:
50 *   Michal Krol
51 *   Brian Paul
52 */
53
54#include "pipe/p_compiler.h"
55#include "pipe/p_state.h"
56#include "pipe/p_shader_tokens.h"
57#include "tgsi/tgsi_dump.h"
58#include "tgsi/tgsi_parse.h"
59#include "tgsi/tgsi_util.h"
60#include "tgsi_exec.h"
61#include "util/u_memory.h"
62#include "util/u_math.h"
63
64
/*
 * When non-zero, micro_exp2(), micro_lg2() and micro_pow() use the
 * util_fast_* helpers instead of the libm functions.
 */
#define FAST_MATH 1

/*
 * Lane indices within a 4-element channel, named after the position of
 * each element in a 2x2 quad (used by micro_ddx() and micro_ddy()).
 */
#define TILE_TOP_LEFT     0
#define TILE_TOP_RIGHT    1
#define TILE_BOTTOM_LEFT  2
#define TILE_BOTTOM_RIGHT 3
71
72static void
73micro_abs(union tgsi_exec_channel *dst,
74          const union tgsi_exec_channel *src)
75{
76   dst->f[0] = fabsf(src->f[0]);
77   dst->f[1] = fabsf(src->f[1]);
78   dst->f[2] = fabsf(src->f[2]);
79   dst->f[3] = fabsf(src->f[3]);
80}
81
82static void
83micro_arl(union tgsi_exec_channel *dst,
84          const union tgsi_exec_channel *src)
85{
86   dst->i[0] = (int)floorf(src->f[0]);
87   dst->i[1] = (int)floorf(src->f[1]);
88   dst->i[2] = (int)floorf(src->f[2]);
89   dst->i[3] = (int)floorf(src->f[3]);
90}
91
92static void
93micro_arr(union tgsi_exec_channel *dst,
94          const union tgsi_exec_channel *src)
95{
96   dst->i[0] = (int)floorf(src->f[0] + 0.5f);
97   dst->i[1] = (int)floorf(src->f[1] + 0.5f);
98   dst->i[2] = (int)floorf(src->f[2] + 0.5f);
99   dst->i[3] = (int)floorf(src->f[3] + 0.5f);
100}
101
102static void
103micro_ceil(union tgsi_exec_channel *dst,
104           const union tgsi_exec_channel *src)
105{
106   dst->f[0] = ceilf(src->f[0]);
107   dst->f[1] = ceilf(src->f[1]);
108   dst->f[2] = ceilf(src->f[2]);
109   dst->f[3] = ceilf(src->f[3]);
110}
111
112static void
113micro_cos(union tgsi_exec_channel *dst,
114          const union tgsi_exec_channel *src)
115{
116   dst->f[0] = cosf(src->f[0]);
117   dst->f[1] = cosf(src->f[1]);
118   dst->f[2] = cosf(src->f[2]);
119   dst->f[3] = cosf(src->f[3]);
120}
121
122static void
123micro_ddx(union tgsi_exec_channel *dst,
124          const union tgsi_exec_channel *src)
125{
126   dst->f[0] =
127   dst->f[1] =
128   dst->f[2] =
129   dst->f[3] = src->f[TILE_BOTTOM_RIGHT] - src->f[TILE_BOTTOM_LEFT];
130}
131
132static void
133micro_ddy(union tgsi_exec_channel *dst,
134          const union tgsi_exec_channel *src)
135{
136   dst->f[0] =
137   dst->f[1] =
138   dst->f[2] =
139   dst->f[3] = src->f[TILE_BOTTOM_LEFT] - src->f[TILE_TOP_LEFT];
140}
141
142static void
143micro_exp2(union tgsi_exec_channel *dst,
144           const union tgsi_exec_channel *src)
145{
146#if FAST_MATH
147   dst->f[0] = util_fast_exp2(src->f[0]);
148   dst->f[1] = util_fast_exp2(src->f[1]);
149   dst->f[2] = util_fast_exp2(src->f[2]);
150   dst->f[3] = util_fast_exp2(src->f[3]);
151#else
152#if DEBUG
153   /* Inf is okay for this instruction, so clamp it to silence assertions. */
154   uint i;
155   union tgsi_exec_channel clamped;
156
157   for (i = 0; i < 4; i++) {
158      if (src->f[i] > 127.99999f) {
159         clamped.f[i] = 127.99999f;
160      } else if (src->f[i] < -126.99999f) {
161         clamped.f[i] = -126.99999f;
162      } else {
163         clamped.f[i] = src->f[i];
164      }
165   }
166   src = &clamped;
167#endif /* DEBUG */
168
169   dst->f[0] = powf(2.0f, src->f[0]);
170   dst->f[1] = powf(2.0f, src->f[1]);
171   dst->f[2] = powf(2.0f, src->f[2]);
172   dst->f[3] = powf(2.0f, src->f[3]);
173#endif /* FAST_MATH */
174}
175
176static void
177micro_flr(union tgsi_exec_channel *dst,
178          const union tgsi_exec_channel *src)
179{
180   dst->f[0] = floorf(src->f[0]);
181   dst->f[1] = floorf(src->f[1]);
182   dst->f[2] = floorf(src->f[2]);
183   dst->f[3] = floorf(src->f[3]);
184}
185
186static void
187micro_frc(union tgsi_exec_channel *dst,
188          const union tgsi_exec_channel *src)
189{
190   dst->f[0] = src->f[0] - floorf(src->f[0]);
191   dst->f[1] = src->f[1] - floorf(src->f[1]);
192   dst->f[2] = src->f[2] - floorf(src->f[2]);
193   dst->f[3] = src->f[3] - floorf(src->f[3]);
194}
195
196static void
197micro_iabs(union tgsi_exec_channel *dst,
198           const union tgsi_exec_channel *src)
199{
200   dst->i[0] = src->i[0] >= 0 ? src->i[0] : -src->i[0];
201   dst->i[1] = src->i[1] >= 0 ? src->i[1] : -src->i[1];
202   dst->i[2] = src->i[2] >= 0 ? src->i[2] : -src->i[2];
203   dst->i[3] = src->i[3] >= 0 ? src->i[3] : -src->i[3];
204}
205
206static void
207micro_ineg(union tgsi_exec_channel *dst,
208           const union tgsi_exec_channel *src)
209{
210   dst->i[0] = -src->i[0];
211   dst->i[1] = -src->i[1];
212   dst->i[2] = -src->i[2];
213   dst->i[3] = -src->i[3];
214}
215
216static void
217micro_lg2(union tgsi_exec_channel *dst,
218          const union tgsi_exec_channel *src)
219{
220#if FAST_MATH
221   dst->f[0] = util_fast_log2(src->f[0]);
222   dst->f[1] = util_fast_log2(src->f[1]);
223   dst->f[2] = util_fast_log2(src->f[2]);
224   dst->f[3] = util_fast_log2(src->f[3]);
225#else
226   dst->f[0] = logf(src->f[0]) * 1.442695f;
227   dst->f[1] = logf(src->f[1]) * 1.442695f;
228   dst->f[2] = logf(src->f[2]) * 1.442695f;
229   dst->f[3] = logf(src->f[3]) * 1.442695f;
230#endif
231}
232
233static void
234micro_lrp(union tgsi_exec_channel *dst,
235          const union tgsi_exec_channel *src)
236{
237   dst->f[0] = src[0].f[0] * (src[1].f[0] - src[2].f[0]) + src[2].f[0];
238   dst->f[1] = src[0].f[1] * (src[1].f[1] - src[2].f[1]) + src[2].f[1];
239   dst->f[2] = src[0].f[2] * (src[1].f[2] - src[2].f[2]) + src[2].f[2];
240   dst->f[3] = src[0].f[3] * (src[1].f[3] - src[2].f[3]) + src[2].f[3];
241}
242
243static void
244micro_mad(union tgsi_exec_channel *dst,
245          const union tgsi_exec_channel *src)
246{
247   dst->f[0] = src[0].f[0] * src[1].f[0] + src[2].f[0];
248   dst->f[1] = src[0].f[1] * src[1].f[1] + src[2].f[1];
249   dst->f[2] = src[0].f[2] * src[1].f[2] + src[2].f[2];
250   dst->f[3] = src[0].f[3] * src[1].f[3] + src[2].f[3];
251}
252
253static void
254micro_mov(union tgsi_exec_channel *dst,
255          const union tgsi_exec_channel *src)
256{
257   dst->u[0] = src->u[0];
258   dst->u[1] = src->u[1];
259   dst->u[2] = src->u[2];
260   dst->u[3] = src->u[3];
261}
262
263static void
264micro_rcp(union tgsi_exec_channel *dst,
265          const union tgsi_exec_channel *src)
266{
267   dst->f[0] = 1.0f / src->f[0];
268   dst->f[1] = 1.0f / src->f[1];
269   dst->f[2] = 1.0f / src->f[2];
270   dst->f[3] = 1.0f / src->f[3];
271}
272
273static void
274micro_rnd(union tgsi_exec_channel *dst,
275          const union tgsi_exec_channel *src)
276{
277   dst->f[0] = floorf(src->f[0] + 0.5f);
278   dst->f[1] = floorf(src->f[1] + 0.5f);
279   dst->f[2] = floorf(src->f[2] + 0.5f);
280   dst->f[3] = floorf(src->f[3] + 0.5f);
281}
282
283static void
284micro_rsq(union tgsi_exec_channel *dst,
285          const union tgsi_exec_channel *src)
286{
287   dst->f[0] = 1.0f / sqrtf(fabsf(src->f[0]));
288   dst->f[1] = 1.0f / sqrtf(fabsf(src->f[1]));
289   dst->f[2] = 1.0f / sqrtf(fabsf(src->f[2]));
290   dst->f[3] = 1.0f / sqrtf(fabsf(src->f[3]));
291}
292
293static void
294micro_seq(union tgsi_exec_channel *dst,
295          const union tgsi_exec_channel *src)
296{
297   dst->f[0] = src[0].f[0] == src[1].f[0] ? 1.0f : 0.0f;
298   dst->f[1] = src[0].f[1] == src[1].f[1] ? 1.0f : 0.0f;
299   dst->f[2] = src[0].f[2] == src[1].f[2] ? 1.0f : 0.0f;
300   dst->f[3] = src[0].f[3] == src[1].f[3] ? 1.0f : 0.0f;
301}
302
303static void
304micro_sge(union tgsi_exec_channel *dst,
305          const union tgsi_exec_channel *src)
306{
307   dst->f[0] = src[0].f[0] >= src[1].f[0] ? 1.0f : 0.0f;
308   dst->f[1] = src[0].f[1] >= src[1].f[1] ? 1.0f : 0.0f;
309   dst->f[2] = src[0].f[2] >= src[1].f[2] ? 1.0f : 0.0f;
310   dst->f[3] = src[0].f[3] >= src[1].f[3] ? 1.0f : 0.0f;
311}
312
313static void
314micro_sgn(union tgsi_exec_channel *dst,
315          const union tgsi_exec_channel *src)
316{
317   dst->f[0] = src->f[0] < 0.0f ? -1.0f : src->f[0] > 0.0f ? 1.0f : 0.0f;
318   dst->f[1] = src->f[1] < 0.0f ? -1.0f : src->f[1] > 0.0f ? 1.0f : 0.0f;
319   dst->f[2] = src->f[2] < 0.0f ? -1.0f : src->f[2] > 0.0f ? 1.0f : 0.0f;
320   dst->f[3] = src->f[3] < 0.0f ? -1.0f : src->f[3] > 0.0f ? 1.0f : 0.0f;
321}
322
323static void
324micro_sgt(union tgsi_exec_channel *dst,
325          const union tgsi_exec_channel *src)
326{
327   dst->f[0] = src[0].f[0] > src[1].f[0] ? 1.0f : 0.0f;
328   dst->f[1] = src[0].f[1] > src[1].f[1] ? 1.0f : 0.0f;
329   dst->f[2] = src[0].f[2] > src[1].f[2] ? 1.0f : 0.0f;
330   dst->f[3] = src[0].f[3] > src[1].f[3] ? 1.0f : 0.0f;
331}
332
333static void
334micro_sin(union tgsi_exec_channel *dst,
335          const union tgsi_exec_channel *src)
336{
337   dst->f[0] = sinf(src->f[0]);
338   dst->f[1] = sinf(src->f[1]);
339   dst->f[2] = sinf(src->f[2]);
340   dst->f[3] = sinf(src->f[3]);
341}
342
343static void
344micro_sle(union tgsi_exec_channel *dst,
345          const union tgsi_exec_channel *src)
346{
347   dst->f[0] = src[0].f[0] <= src[1].f[0] ? 1.0f : 0.0f;
348   dst->f[1] = src[0].f[1] <= src[1].f[1] ? 1.0f : 0.0f;
349   dst->f[2] = src[0].f[2] <= src[1].f[2] ? 1.0f : 0.0f;
350   dst->f[3] = src[0].f[3] <= src[1].f[3] ? 1.0f : 0.0f;
351}
352
353static void
354micro_slt(union tgsi_exec_channel *dst,
355          const union tgsi_exec_channel *src)
356{
357   dst->f[0] = src[0].f[0] < src[1].f[0] ? 1.0f : 0.0f;
358   dst->f[1] = src[0].f[1] < src[1].f[1] ? 1.0f : 0.0f;
359   dst->f[2] = src[0].f[2] < src[1].f[2] ? 1.0f : 0.0f;
360   dst->f[3] = src[0].f[3] < src[1].f[3] ? 1.0f : 0.0f;
361}
362
363static void
364micro_sne(union tgsi_exec_channel *dst,
365          const union tgsi_exec_channel *src)
366{
367   dst->f[0] = src[0].f[0] != src[1].f[0] ? 1.0f : 0.0f;
368   dst->f[1] = src[0].f[1] != src[1].f[1] ? 1.0f : 0.0f;
369   dst->f[2] = src[0].f[2] != src[1].f[2] ? 1.0f : 0.0f;
370   dst->f[3] = src[0].f[3] != src[1].f[3] ? 1.0f : 0.0f;
371}
372
373static void
374micro_trunc(union tgsi_exec_channel *dst,
375            const union tgsi_exec_channel *src)
376{
377   dst->f[0] = (float)(int)src->f[0];
378   dst->f[1] = (float)(int)src->f[1];
379   dst->f[2] = (float)(int)src->f[2];
380   dst->f[3] = (float)(int)src->f[3];
381}
382
383
/* Numeric indices of the X, Y, Z and W channels (presumably used to index
 * the xyzw[] array of a register vector -- confirm against the callers).
 */
#define CHAN_X  0
#define CHAN_Y  1
#define CHAN_Z  2
#define CHAN_W  3
388
/**
 * Data type tag passed to helpers such as fetch_source() (its
 * src_datatype parameter).  Judging by the names, it selects whether a
 * channel's lanes are treated as floats, signed or unsigned integers.
 */
enum tgsi_exec_datatype {
   TGSI_EXEC_DATA_FLOAT,
   TGSI_EXEC_DATA_INT,
   TGSI_EXEC_DATA_UINT
};
394
/*
 * Shorthand locations of various utility registers (_I = Index, _C = Channel).
 *
 * An _I value indexes mach->Temps[] and the matching _C value selects the
 * channel inside that register, e.g.
 *    mach->Temps[TEMP_1_I].xyzw[TEMP_1_C]
 * as done when the constants are set up in tgsi_exec_machine_create().
 */
#define TEMP_0_I           TGSI_EXEC_TEMP_00000000_I
#define TEMP_0_C           TGSI_EXEC_TEMP_00000000_C
#define TEMP_7F_I          TGSI_EXEC_TEMP_7FFFFFFF_I
#define TEMP_7F_C          TGSI_EXEC_TEMP_7FFFFFFF_C
#define TEMP_80_I          TGSI_EXEC_TEMP_80000000_I
#define TEMP_80_C          TGSI_EXEC_TEMP_80000000_C
#define TEMP_FF_I          TGSI_EXEC_TEMP_FFFFFFFF_I
#define TEMP_FF_C          TGSI_EXEC_TEMP_FFFFFFFF_C
#define TEMP_1_I           TGSI_EXEC_TEMP_ONE_I
#define TEMP_1_C           TGSI_EXEC_TEMP_ONE_C
#define TEMP_2_I           TGSI_EXEC_TEMP_TWO_I
#define TEMP_2_C           TGSI_EXEC_TEMP_TWO_C
#define TEMP_128_I         TGSI_EXEC_TEMP_128_I
#define TEMP_128_C         TGSI_EXEC_TEMP_128_C
#define TEMP_M128_I        TGSI_EXEC_TEMP_MINUS_128_I
#define TEMP_M128_C        TGSI_EXEC_TEMP_MINUS_128_C
#define TEMP_KILMASK_I     TGSI_EXEC_TEMP_KILMASK_I
#define TEMP_KILMASK_C     TGSI_EXEC_TEMP_KILMASK_C
#define TEMP_OUTPUT_I      TGSI_EXEC_TEMP_OUTPUT_I
#define TEMP_OUTPUT_C      TGSI_EXEC_TEMP_OUTPUT_C
#define TEMP_PRIMITIVE_I   TGSI_EXEC_TEMP_PRIMITIVE_I
#define TEMP_PRIMITIVE_C   TGSI_EXEC_TEMP_PRIMITIVE_C
#define TEMP_CC_I          TGSI_EXEC_TEMP_CC_I
#define TEMP_CC_C          TGSI_EXEC_TEMP_CC_C
#define TEMP_3_I           TGSI_EXEC_TEMP_THREE_I
#define TEMP_3_C           TGSI_EXEC_TEMP_THREE_C
#define TEMP_HALF_I        TGSI_EXEC_TEMP_HALF_I
#define TEMP_HALF_C        TGSI_EXEC_TEMP_HALF_C
#define TEMP_R0            TGSI_EXEC_TEMP_R0
#define TEMP_P0            TGSI_EXEC_TEMP_P0
428
/** Non-zero if channel CHAN is set in the write mask of INST's Dst[0]. */
#define IS_CHANNEL_ENABLED(INST, CHAN)\
   ((INST).Dst[0].Register.WriteMask & (1 << (CHAN)))

/** Same as IS_CHANNEL_ENABLED, but for the second destination, Dst[1]. */
#define IS_CHANNEL_ENABLED2(INST, CHAN)\
   ((INST).Dst[1].Register.WriteMask & (1 << (CHAN)))

/**
 * Loop header that runs the following statement once for every channel
 * enabled in Dst[0]'s write mask, with CHAN set to the channel number.
 *
 * The "if (!...) { } else" form makes the macro swallow its own else, so
 * an "else" written after the loop body binds to the caller's enclosing
 * "if" rather than to the hidden test inside the macro.
 */
#define FOR_EACH_ENABLED_CHANNEL(INST, CHAN)\
   for ((CHAN) = 0; (CHAN) < NUM_CHANNELS; (CHAN)++)\
      if (!IS_CHANNEL_ENABLED( INST, CHAN )) { } else

/** Same as FOR_EACH_ENABLED_CHANNEL, but for the second destination. */
#define FOR_EACH_ENABLED_CHANNEL2(INST, CHAN)\
   for ((CHAN) = 0; (CHAN) < NUM_CHANNELS; (CHAN)++)\
      if (!IS_CHANNEL_ENABLED2( INST, CHAN )) { } else
442
443
/**
 * Recompute the execution mask as the bitwise AND of every control-flow
 * mask: CondMask, LoopMask, ContMask, Switch.mask and FuncMask.
 * MACH is evaluated more than once, so pass a side-effect-free expression.
 */
#define UPDATE_EXEC_MASK(MACH) \
      ((MACH)->ExecMask = (MACH)->CondMask & (MACH)->LoopMask & \
                          (MACH)->ContMask & (MACH)->Switch.mask & \
                          (MACH)->FuncMask)
447
448
449static const union tgsi_exec_channel ZeroVec =
450   { { 0.0, 0.0, 0.0, 0.0 } };
451
452
/**
 * Assert that none of the four float lanes of *chan is Inf or NaN.
 * Built from assert(), so it is only active when assertions are enabled.
 */
#define CHECK_INF_OR_NAN(chan) do {\
      assert(!util_is_inf_or_nan((chan)->f[0]));\
      assert(!util_is_inf_or_nan((chan)->f[1]));\
      assert(!util_is_inf_or_nan((chan)->f[2]));\
      assert(!util_is_inf_or_nan((chan)->f[3]));\
   } while (0)
459
460
#ifdef DEBUG
/**
 * Debug helper: print the four float lanes of a channel, prefixed by msg.
 */
static void
print_chan(const char *msg, const union tgsi_exec_channel *chan)
{
   const float *lanes = chan->f;

   debug_printf("%s = {%f, %f, %f, %f}\n",
                msg, lanes[0], lanes[1], lanes[2], lanes[3]);
}
#endif
469
470
#ifdef DEBUG
/**
 * Debug helper: dump temporary register 'index' of the machine, one line
 * per X/Y/Z/W channel with that channel's four float lanes.
 */
static void
print_temp(const struct tgsi_exec_machine *mach, uint index)
{
   static const char channelNames[] = "XYZW";
   const struct tgsi_exec_vector *reg = &mach->Temps[index];
   int c;

   debug_printf("Temp[%u] =\n", index);
   for (c = 0; c < 4; c++) {
      debug_printf("  %c: { %f, %f, %f, %f }\n",
                   channelNames[c],
                   reg->xyzw[c].f[0],
                   reg->xyzw[c].f[1],
                   reg->xyzw[c].f[2],
                   reg->xyzw[c].f[3]);
   }
}
#endif
488
489
490/**
491 * Check if there's a potential src/dst register data dependency when
492 * using SOA execution.
493 * Example:
494 *   MOV T, T.yxwz;
495 * This would expand into:
496 *   MOV t0, t1;
497 *   MOV t1, t0;
498 *   MOV t2, t3;
499 *   MOV t3, t2;
500 * The second instruction will have the wrong value for t0 if executed as-is.
501 */
502boolean
503tgsi_check_soa_dependencies(const struct tgsi_full_instruction *inst)
504{
505   uint i, chan;
506
507   uint writemask = inst->Dst[0].Register.WriteMask;
508   if (writemask == TGSI_WRITEMASK_X ||
509       writemask == TGSI_WRITEMASK_Y ||
510       writemask == TGSI_WRITEMASK_Z ||
511       writemask == TGSI_WRITEMASK_W ||
512       writemask == TGSI_WRITEMASK_NONE) {
513      /* no chance of data dependency */
514      return FALSE;
515   }
516
517   /* loop over src regs */
518   for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
519      if ((inst->Src[i].Register.File ==
520           inst->Dst[0].Register.File) &&
521          (inst->Src[i].Register.Index ==
522           inst->Dst[0].Register.Index)) {
523         /* loop over dest channels */
524         uint channelsWritten = 0x0;
525         FOR_EACH_ENABLED_CHANNEL(*inst, chan) {
526            /* check if we're reading a channel that's been written */
527            uint swizzle = tgsi_util_get_full_src_register_swizzle(&inst->Src[i], chan);
528            if (channelsWritten & (1 << swizzle)) {
529               return TRUE;
530            }
531
532            channelsWritten |= (1 << chan);
533         }
534      }
535   }
536   return FALSE;
537}
538
539
540/**
541 * Initialize machine state by expanding tokens to full instructions,
542 * allocating temporary storage, setting up constants, etc.
543 * After this, we can call tgsi_exec_machine_run() many times.
544 */
545void
546tgsi_exec_machine_bind_shader(
547   struct tgsi_exec_machine *mach,
548   const struct tgsi_token *tokens,
549   uint numSamplers,
550   struct tgsi_sampler **samplers)
551{
552   uint k;
553   struct tgsi_parse_context parse;
554   struct tgsi_exec_labels *labels = &mach->Labels;
555   struct tgsi_full_instruction *instructions;
556   struct tgsi_full_declaration *declarations;
557   uint maxInstructions = 10, numInstructions = 0;
558   uint maxDeclarations = 10, numDeclarations = 0;
559   uint instno = 0;
560
561#if 0
562   tgsi_dump(tokens, 0);
563#endif
564
565   util_init_math();
566
567   mach->Tokens = tokens;
568   mach->Samplers = samplers;
569
570   k = tgsi_parse_init (&parse, mach->Tokens);
571   if (k != TGSI_PARSE_OK) {
572      debug_printf( "Problem parsing!\n" );
573      return;
574   }
575
576   mach->Processor = parse.FullHeader.Processor.Processor;
577   mach->ImmLimit = 0;
578   labels->count = 0;
579
580   declarations = (struct tgsi_full_declaration *)
581      MALLOC( maxDeclarations * sizeof(struct tgsi_full_declaration) );
582
583   if (!declarations) {
584      return;
585   }
586
587   instructions = (struct tgsi_full_instruction *)
588      MALLOC( maxInstructions * sizeof(struct tgsi_full_instruction) );
589
590   if (!instructions) {
591      FREE( declarations );
592      return;
593   }
594
595   while( !tgsi_parse_end_of_tokens( &parse ) ) {
596      uint pointer = parse.Position;
597      uint i;
598
599      tgsi_parse_token( &parse );
600      switch( parse.FullToken.Token.Type ) {
601      case TGSI_TOKEN_TYPE_DECLARATION:
602         /* save expanded declaration */
603         if (numDeclarations == maxDeclarations) {
604            declarations = REALLOC(declarations,
605                                   maxDeclarations
606                                   * sizeof(struct tgsi_full_declaration),
607                                   (maxDeclarations + 10)
608                                   * sizeof(struct tgsi_full_declaration));
609            maxDeclarations += 10;
610         }
611         if (parse.FullToken.FullDeclaration.Declaration.File == TGSI_FILE_OUTPUT) {
612            unsigned reg;
613            for (reg = parse.FullToken.FullDeclaration.Range.First;
614                 reg <= parse.FullToken.FullDeclaration.Range.Last;
615                 ++reg) {
616               ++mach->NumOutputs;
617            }
618         }
619         memcpy(declarations + numDeclarations,
620                &parse.FullToken.FullDeclaration,
621                sizeof(declarations[0]));
622         numDeclarations++;
623         break;
624
625      case TGSI_TOKEN_TYPE_IMMEDIATE:
626         {
627            uint size = parse.FullToken.FullImmediate.Immediate.NrTokens - 1;
628            assert( size <= 4 );
629            assert( mach->ImmLimit + 1 <= TGSI_EXEC_NUM_IMMEDIATES );
630
631            for( i = 0; i < size; i++ ) {
632               mach->Imms[mach->ImmLimit][i] =
633		  parse.FullToken.FullImmediate.u[i].Float;
634            }
635            mach->ImmLimit += 1;
636         }
637         break;
638
639      case TGSI_TOKEN_TYPE_INSTRUCTION:
640         assert( labels->count < MAX_LABELS );
641
642         labels->labels[labels->count][0] = instno;
643         labels->labels[labels->count][1] = pointer;
644         labels->count++;
645
646         /* save expanded instruction */
647         if (numInstructions == maxInstructions) {
648            instructions = REALLOC(instructions,
649                                   maxInstructions
650                                   * sizeof(struct tgsi_full_instruction),
651                                   (maxInstructions + 10)
652                                   * sizeof(struct tgsi_full_instruction));
653            maxInstructions += 10;
654         }
655
656         memcpy(instructions + numInstructions,
657                &parse.FullToken.FullInstruction,
658                sizeof(instructions[0]));
659
660         numInstructions++;
661         break;
662
663      case TGSI_TOKEN_TYPE_PROPERTY:
664         break;
665
666      default:
667         assert( 0 );
668      }
669   }
670   tgsi_parse_free (&parse);
671
672   if (mach->Declarations) {
673      FREE( mach->Declarations );
674   }
675   mach->Declarations = declarations;
676   mach->NumDeclarations = numDeclarations;
677
678   if (mach->Instructions) {
679      FREE( mach->Instructions );
680   }
681   mach->Instructions = instructions;
682   mach->NumInstructions = numInstructions;
683}
684
685
686struct tgsi_exec_machine *
687tgsi_exec_machine_create( void )
688{
689   struct tgsi_exec_machine *mach;
690   uint i;
691
692   mach = align_malloc( sizeof *mach, 16 );
693   if (!mach)
694      goto fail;
695
696   memset(mach, 0, sizeof(*mach));
697
698   mach->Addrs = &mach->Temps[TGSI_EXEC_TEMP_ADDR];
699   mach->MaxGeometryShaderOutputs = TGSI_MAX_TOTAL_VERTICES;
700   mach->Predicates = &mach->Temps[TGSI_EXEC_TEMP_P0];
701
702   /* Setup constants. */
703   for( i = 0; i < 4; i++ ) {
704      mach->Temps[TEMP_0_I].xyzw[TEMP_0_C].u[i] = 0x00000000;
705      mach->Temps[TEMP_7F_I].xyzw[TEMP_7F_C].u[i] = 0x7FFFFFFF;
706      mach->Temps[TEMP_80_I].xyzw[TEMP_80_C].u[i] = 0x80000000;
707      mach->Temps[TEMP_FF_I].xyzw[TEMP_FF_C].u[i] = 0xFFFFFFFF;
708      mach->Temps[TEMP_1_I].xyzw[TEMP_1_C].f[i] = 1.0f;
709      mach->Temps[TEMP_2_I].xyzw[TEMP_2_C].f[i] = 2.0f;
710      mach->Temps[TEMP_128_I].xyzw[TEMP_128_C].f[i] = 128.0f;
711      mach->Temps[TEMP_M128_I].xyzw[TEMP_M128_C].f[i] = -128.0f;
712      mach->Temps[TEMP_3_I].xyzw[TEMP_3_C].f[i] = 3.0f;
713      mach->Temps[TEMP_HALF_I].xyzw[TEMP_HALF_C].f[i] = 0.5f;
714   }
715
716#ifdef DEBUG
717   /* silence warnings */
718   (void) print_chan;
719   (void) print_temp;
720#endif
721
722   return mach;
723
724fail:
725   align_free(mach);
726   return NULL;
727}
728
729
730void
731tgsi_exec_machine_destroy(struct tgsi_exec_machine *mach)
732{
733   if (mach) {
734      FREE(mach->Instructions);
735      FREE(mach->Declarations);
736   }
737
738   align_free(mach);
739}
740
741static void
742micro_add(
743   union tgsi_exec_channel *dst,
744   const union tgsi_exec_channel *src0,
745   const union tgsi_exec_channel *src1 )
746{
747   dst->f[0] = src0->f[0] + src1->f[0];
748   dst->f[1] = src0->f[1] + src1->f[1];
749   dst->f[2] = src0->f[2] + src1->f[2];
750   dst->f[3] = src0->f[3] + src1->f[3];
751}
752
753static void
754micro_div(
755   union tgsi_exec_channel *dst,
756   const union tgsi_exec_channel *src0,
757   const union tgsi_exec_channel *src1 )
758{
759   if (src1->f[0] != 0) {
760      dst->f[0] = src0->f[0] / src1->f[0];
761   }
762   if (src1->f[1] != 0) {
763      dst->f[1] = src0->f[1] / src1->f[1];
764   }
765   if (src1->f[2] != 0) {
766      dst->f[2] = src0->f[2] / src1->f[2];
767   }
768   if (src1->f[3] != 0) {
769      dst->f[3] = src0->f[3] / src1->f[3];
770   }
771}
772
773static void
774micro_float_clamp(union tgsi_exec_channel *dst,
775                  const union tgsi_exec_channel *src)
776{
777   uint i;
778
779   for (i = 0; i < 4; i++) {
780      if (src->f[i] > 0.0f) {
781         if (src->f[i] > 1.884467e+019f)
782            dst->f[i] = 1.884467e+019f;
783         else if (src->f[i] < 5.42101e-020f)
784            dst->f[i] = 5.42101e-020f;
785         else
786            dst->f[i] = src->f[i];
787      }
788      else {
789         if (src->f[i] < -1.884467e+019f)
790            dst->f[i] = -1.884467e+019f;
791         else if (src->f[i] > -5.42101e-020f)
792            dst->f[i] = -5.42101e-020f;
793         else
794            dst->f[i] = src->f[i];
795      }
796   }
797}
798
799static void
800micro_lt(
801   union tgsi_exec_channel *dst,
802   const union tgsi_exec_channel *src0,
803   const union tgsi_exec_channel *src1,
804   const union tgsi_exec_channel *src2,
805   const union tgsi_exec_channel *src3 )
806{
807   dst->f[0] = src0->f[0] < src1->f[0] ? src2->f[0] : src3->f[0];
808   dst->f[1] = src0->f[1] < src1->f[1] ? src2->f[1] : src3->f[1];
809   dst->f[2] = src0->f[2] < src1->f[2] ? src2->f[2] : src3->f[2];
810   dst->f[3] = src0->f[3] < src1->f[3] ? src2->f[3] : src3->f[3];
811}
812
813static void
814micro_max(
815   union tgsi_exec_channel *dst,
816   const union tgsi_exec_channel *src0,
817   const union tgsi_exec_channel *src1 )
818{
819   dst->f[0] = src0->f[0] > src1->f[0] ? src0->f[0] : src1->f[0];
820   dst->f[1] = src0->f[1] > src1->f[1] ? src0->f[1] : src1->f[1];
821   dst->f[2] = src0->f[2] > src1->f[2] ? src0->f[2] : src1->f[2];
822   dst->f[3] = src0->f[3] > src1->f[3] ? src0->f[3] : src1->f[3];
823}
824
825static void
826micro_min(
827   union tgsi_exec_channel *dst,
828   const union tgsi_exec_channel *src0,
829   const union tgsi_exec_channel *src1 )
830{
831   dst->f[0] = src0->f[0] < src1->f[0] ? src0->f[0] : src1->f[0];
832   dst->f[1] = src0->f[1] < src1->f[1] ? src0->f[1] : src1->f[1];
833   dst->f[2] = src0->f[2] < src1->f[2] ? src0->f[2] : src1->f[2];
834   dst->f[3] = src0->f[3] < src1->f[3] ? src0->f[3] : src1->f[3];
835}
836
837static void
838micro_mul(
839   union tgsi_exec_channel *dst,
840   const union tgsi_exec_channel *src0,
841   const union tgsi_exec_channel *src1 )
842{
843   dst->f[0] = src0->f[0] * src1->f[0];
844   dst->f[1] = src0->f[1] * src1->f[1];
845   dst->f[2] = src0->f[2] * src1->f[2];
846   dst->f[3] = src0->f[3] * src1->f[3];
847}
848
#if 0
/**
 * Disabled code.  Signed multiply with two destinations: dst1 receives
 * the per-lane product src0 * src1 computed in int precision and dst0 is
 * filled with zeros.
 */
static void
micro_imul64(
   union tgsi_exec_channel *dst0,
   union tgsi_exec_channel *dst1,
   const union tgsi_exec_channel *src0,
   const union tgsi_exec_channel *src1 )
{
   int lane;

   for (lane = 0; lane < 4; lane++) {
      dst1->i[lane] = src0->i[lane] * src1->i[lane];
   }
   for (lane = 0; lane < 4; lane++) {
      dst0->i[lane] = 0;
   }
}
#endif
867
#if 0
/**
 * Disabled code.  Unsigned multiply with two destinations: dst1 receives
 * the per-lane product src0 * src1 computed in unsigned precision and
 * dst0 is filled with zeros.
 */
static void
micro_umul64(
   union tgsi_exec_channel *dst0,
   union tgsi_exec_channel *dst1,
   const union tgsi_exec_channel *src0,
   const union tgsi_exec_channel *src1 )
{
   int lane;

   for (lane = 0; lane < 4; lane++) {
      dst1->u[lane] = src0->u[lane] * src1->u[lane];
   }
   for (lane = 0; lane < 4; lane++) {
      dst0->u[lane] = 0;
   }
}
#endif
886
887
#if 0
/**
 * Disabled code.  Per-lane conditional move on the unsigned view:
 * dst = src0 ? src1 : src2.
 */
static void
micro_movc(
   union tgsi_exec_channel *dst,
   const union tgsi_exec_channel *src0,
   const union tgsi_exec_channel *src1,
   const union tgsi_exec_channel *src2 )
{
   int lane;

   for (lane = 0; lane < 4; lane++) {
      dst->u[lane] = src0->u[lane] ? src1->u[lane] : src2->u[lane];
   }
}
#endif
902
903static void
904micro_neg(
905   union tgsi_exec_channel *dst,
906   const union tgsi_exec_channel *src )
907{
908   dst->f[0] = -src->f[0];
909   dst->f[1] = -src->f[1];
910   dst->f[2] = -src->f[2];
911   dst->f[3] = -src->f[3];
912}
913
914static void
915micro_pow(
916   union tgsi_exec_channel *dst,
917   const union tgsi_exec_channel *src0,
918   const union tgsi_exec_channel *src1 )
919{
920#if FAST_MATH
921   dst->f[0] = util_fast_pow( src0->f[0], src1->f[0] );
922   dst->f[1] = util_fast_pow( src0->f[1], src1->f[1] );
923   dst->f[2] = util_fast_pow( src0->f[2], src1->f[2] );
924   dst->f[3] = util_fast_pow( src0->f[3], src1->f[3] );
925#else
926   dst->f[0] = powf( src0->f[0], src1->f[0] );
927   dst->f[1] = powf( src0->f[1], src1->f[1] );
928   dst->f[2] = powf( src0->f[2], src1->f[2] );
929   dst->f[3] = powf( src0->f[3], src1->f[3] );
930#endif
931}
932
933static void
934micro_sqrt( union tgsi_exec_channel *dst,
935            const union tgsi_exec_channel *src )
936{
937   dst->f[0] = sqrtf( src->f[0] );
938   dst->f[1] = sqrtf( src->f[1] );
939   dst->f[2] = sqrtf( src->f[2] );
940   dst->f[3] = sqrtf( src->f[3] );
941}
942
943static void
944micro_sub(
945   union tgsi_exec_channel *dst,
946   const union tgsi_exec_channel *src0,
947   const union tgsi_exec_channel *src1 )
948{
949   dst->f[0] = src0->f[0] - src1->f[0];
950   dst->f[1] = src0->f[1] - src1->f[1];
951   dst->f[2] = src0->f[2] - src1->f[2];
952   dst->f[3] = src0->f[3] - src1->f[3];
953}
954
955static void
956fetch_src_file_channel(const struct tgsi_exec_machine *mach,
957                       const uint file,
958                       const uint swizzle,
959                       const union tgsi_exec_channel *index,
960                       const union tgsi_exec_channel *index2D,
961                       union tgsi_exec_channel *chan)
962{
963   uint i;
964
965   switch (file) {
966   case TGSI_FILE_CONSTANT:
967      for (i = 0; i < QUAD_SIZE; i++) {
968         assert(index2D->i[i] >= 0 && index2D->i[i] < PIPE_MAX_CONSTANT_BUFFERS);
969         assert(mach->Consts[index2D->i[i]]);
970
971         if (index->i[i] < 0) {
972            chan->u[i] = 0;
973         } else {
974            const uint *p = (const uint *)mach->Consts[index2D->i[i]];
975
976            chan->u[i] = p[index->i[i] * 4 + swizzle];
977         }
978      }
979      break;
980
981   case TGSI_FILE_INPUT:
982   case TGSI_FILE_SYSTEM_VALUE:
983      for (i = 0; i < QUAD_SIZE; i++) {
984         /* XXX: 2D indexing */
985         chan->u[i] = mach->Inputs[index2D->i[i] * TGSI_EXEC_MAX_INPUT_ATTRIBS + index->i[i]].xyzw[swizzle].u[i];
986      }
987      break;
988
989   case TGSI_FILE_TEMPORARY:
990      for (i = 0; i < QUAD_SIZE; i++) {
991         assert(index->i[i] < TGSI_EXEC_NUM_TEMPS);
992         assert(index2D->i[i] == 0);
993
994         chan->u[i] = mach->Temps[index->i[i]].xyzw[swizzle].u[i];
995      }
996      break;
997
998   case TGSI_FILE_IMMEDIATE:
999      for (i = 0; i < QUAD_SIZE; i++) {
1000         assert(index->i[i] >= 0 && index->i[i] < (int)mach->ImmLimit);
1001         assert(index2D->i[i] == 0);
1002
1003         chan->f[i] = mach->Imms[index->i[i]][swizzle];
1004      }
1005      break;
1006
1007   case TGSI_FILE_ADDRESS:
1008      for (i = 0; i < QUAD_SIZE; i++) {
1009         assert(index->i[i] >= 0);
1010         assert(index2D->i[i] == 0);
1011
1012         chan->u[i] = mach->Addrs[index->i[i]].xyzw[swizzle].u[i];
1013      }
1014      break;
1015
1016   case TGSI_FILE_PREDICATE:
1017      for (i = 0; i < QUAD_SIZE; i++) {
1018         assert(index->i[i] >= 0 && index->i[i] < TGSI_EXEC_NUM_PREDS);
1019         assert(index2D->i[i] == 0);
1020
1021         chan->u[i] = mach->Predicates[0].xyzw[swizzle].u[i];
1022      }
1023      break;
1024
1025   case TGSI_FILE_OUTPUT:
1026      /* vertex/fragment output vars can be read too */
1027      for (i = 0; i < QUAD_SIZE; i++) {
1028         assert(index->i[i] >= 0);
1029         assert(index2D->i[i] == 0);
1030
1031         chan->u[i] = mach->Outputs[index->i[i]].xyzw[swizzle].u[i];
1032      }
1033      break;
1034
1035   default:
1036      assert(0);
1037      for (i = 0; i < QUAD_SIZE; i++) {
1038         chan->u[i] = 0;
1039      }
1040   }
1041}
1042
1043static void
1044fetch_source(const struct tgsi_exec_machine *mach,
1045             union tgsi_exec_channel *chan,
1046             const struct tgsi_full_src_register *reg,
1047             const uint chan_index,
1048             enum tgsi_exec_datatype src_datatype)
1049{
1050   union tgsi_exec_channel index;
1051   union tgsi_exec_channel index2D;
1052   uint swizzle;
1053
1054   /* We start with a direct index into a register file.
1055    *
1056    *    file[1],
1057    *    where:
1058    *       file = Register.File
1059    *       [1] = Register.Index
1060    */
1061   index.i[0] =
1062   index.i[1] =
1063   index.i[2] =
1064   index.i[3] = reg->Register.Index;
1065
1066   /* There is an extra source register that indirectly subscripts
1067    * a register file. The direct index now becomes an offset
1068    * that is being added to the indirect register.
1069    *
1070    *    file[ind[2].x+1],
1071    *    where:
1072    *       ind = Indirect.File
1073    *       [2] = Indirect.Index
1074    *       .x = Indirect.SwizzleX
1075    */
1076   if (reg->Register.Indirect) {
1077      union tgsi_exec_channel index2;
1078      union tgsi_exec_channel indir_index;
1079      const uint execmask = mach->ExecMask;
1080      uint i;
1081
1082      /* which address register (always zero now) */
1083      index2.i[0] =
1084      index2.i[1] =
1085      index2.i[2] =
1086      index2.i[3] = reg->Indirect.Index;
1087
1088      /* get current value of address register[swizzle] */
1089      swizzle = tgsi_util_get_src_register_swizzle( &reg->Indirect, CHAN_X );
1090      fetch_src_file_channel(mach,
1091                             reg->Indirect.File,
1092                             swizzle,
1093                             &index2,
1094                             &ZeroVec,
1095                             &indir_index);
1096
1097      /* add value of address register to the offset */
1098      index.i[0] += indir_index.i[0];
1099      index.i[1] += indir_index.i[1];
1100      index.i[2] += indir_index.i[2];
1101      index.i[3] += indir_index.i[3];
1102
1103      /* for disabled execution channels, zero-out the index to
1104       * avoid using a potential garbage value.
1105       */
1106      for (i = 0; i < QUAD_SIZE; i++) {
1107         if ((execmask & (1 << i)) == 0)
1108            index.i[i] = 0;
1109      }
1110   }
1111
1112   /* There is an extra source register that is a second
1113    * subscript to a register file. Effectively it means that
1114    * the register file is actually a 2D array of registers.
1115    *
1116    *    file[3][1],
1117    *    where:
1118    *       [3] = Dimension.Index
1119    */
1120   if (reg->Register.Dimension) {
1121      index2D.i[0] =
1122      index2D.i[1] =
1123      index2D.i[2] =
1124      index2D.i[3] = reg->Dimension.Index;
1125
1126      /* Again, the second subscript index can be addressed indirectly
1127       * identically to the first one.
1128       * Nothing stops us from indirectly addressing the indirect register,
1129       * but there is no need for that, so we won't exercise it.
1130       *
1131       *    file[ind[4].y+3][1],
1132       *    where:
1133       *       ind = DimIndirect.File
1134       *       [4] = DimIndirect.Index
1135       *       .y = DimIndirect.SwizzleX
1136       */
1137      if (reg->Dimension.Indirect) {
1138         union tgsi_exec_channel index2;
1139         union tgsi_exec_channel indir_index;
1140         const uint execmask = mach->ExecMask;
1141         uint i;
1142
1143         index2.i[0] =
1144         index2.i[1] =
1145         index2.i[2] =
1146         index2.i[3] = reg->DimIndirect.Index;
1147
1148         swizzle = tgsi_util_get_src_register_swizzle( &reg->DimIndirect, CHAN_X );
1149         fetch_src_file_channel(mach,
1150                                reg->DimIndirect.File,
1151                                swizzle,
1152                                &index2,
1153                                &ZeroVec,
1154                                &indir_index);
1155
1156         index2D.i[0] += indir_index.i[0];
1157         index2D.i[1] += indir_index.i[1];
1158         index2D.i[2] += indir_index.i[2];
1159         index2D.i[3] += indir_index.i[3];
1160
1161         /* for disabled execution channels, zero-out the index to
1162          * avoid using a potential garbage value.
1163          */
1164         for (i = 0; i < QUAD_SIZE; i++) {
1165            if ((execmask & (1 << i)) == 0) {
1166               index2D.i[i] = 0;
1167            }
1168         }
1169      }
1170
1171      /* If by any chance there was a need for a 3D array of register
1172       * files, we would have to check whether Dimension is followed
1173       * by a dimension register and continue the saga.
1174       */
1175   } else {
1176      index2D.i[0] =
1177      index2D.i[1] =
1178      index2D.i[2] =
1179      index2D.i[3] = 0;
1180   }
1181
1182   swizzle = tgsi_util_get_full_src_register_swizzle( reg, chan_index );
1183   fetch_src_file_channel(mach,
1184                          reg->Register.File,
1185                          swizzle,
1186                          &index,
1187                          &index2D,
1188                          chan);
1189
1190   if (reg->Register.Absolute) {
1191      if (src_datatype == TGSI_EXEC_DATA_FLOAT) {
1192         micro_abs(chan, chan);
1193      } else {
1194         micro_iabs(chan, chan);
1195      }
1196   }
1197
1198   if (reg->Register.Negate) {
1199      if (src_datatype == TGSI_EXEC_DATA_FLOAT) {
1200         micro_neg(chan, chan);
1201      } else {
1202         micro_ineg(chan, chan);
1203      }
1204   }
1205}
1206
1207static void
1208store_dest(struct tgsi_exec_machine *mach,
1209           const union tgsi_exec_channel *chan,
1210           const struct tgsi_full_dst_register *reg,
1211           const struct tgsi_full_instruction *inst,
1212           uint chan_index,
1213           enum tgsi_exec_datatype dst_datatype)
1214{
1215   uint i;
1216   union tgsi_exec_channel null;
1217   union tgsi_exec_channel *dst;
1218   uint execmask = mach->ExecMask;
1219   int offset = 0;  /* indirection offset */
1220   int index;
1221
1222   if (dst_datatype == TGSI_EXEC_DATA_FLOAT) {
1223      CHECK_INF_OR_NAN(chan);
1224   }
1225
1226   /* There is an extra source register that indirectly subscripts
1227    * a register file. The direct index now becomes an offset
1228    * that is being added to the indirect register.
1229    *
1230    *    file[ind[2].x+1],
1231    *    where:
1232    *       ind = Indirect.File
1233    *       [2] = Indirect.Index
1234    *       .x = Indirect.SwizzleX
1235    */
1236   if (reg->Register.Indirect) {
1237      union tgsi_exec_channel index;
1238      union tgsi_exec_channel indir_index;
1239      uint swizzle;
1240
1241      /* which address register (always zero for now) */
1242      index.i[0] =
1243      index.i[1] =
1244      index.i[2] =
1245      index.i[3] = reg->Indirect.Index;
1246
1247      /* get current value of address register[swizzle] */
1248      swizzle = tgsi_util_get_src_register_swizzle( &reg->Indirect, CHAN_X );
1249
1250      /* fetch values from the address/indirection register */
1251      fetch_src_file_channel(mach,
1252                             reg->Indirect.File,
1253                             swizzle,
1254                             &index,
1255                             &ZeroVec,
1256                             &indir_index);
1257
1258      /* save indirection offset */
1259      offset = indir_index.i[0];
1260   }
1261
1262   switch (reg->Register.File) {
1263   case TGSI_FILE_NULL:
1264      dst = &null;
1265      break;
1266
1267   case TGSI_FILE_OUTPUT:
1268      index = mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0]
1269         + reg->Register.Index;
1270      dst = &mach->Outputs[offset + index].xyzw[chan_index];
1271#if 0
1272      if (TGSI_PROCESSOR_GEOMETRY == mach->Processor) {
1273         fprintf(stderr, "STORING OUT[%d] mask(%d), = (", offset + index, execmask);
1274         for (i = 0; i < QUAD_SIZE; i++)
1275            if (execmask & (1 << i))
1276               fprintf(stderr, "%f, ", chan->f[i]);
1277         fprintf(stderr, ")\n");
1278      }
1279#endif
1280      break;
1281
1282   case TGSI_FILE_TEMPORARY:
1283      index = reg->Register.Index;
1284      assert( index < TGSI_EXEC_NUM_TEMPS );
1285      dst = &mach->Temps[offset + index].xyzw[chan_index];
1286      break;
1287
1288   case TGSI_FILE_ADDRESS:
1289      index = reg->Register.Index;
1290      dst = &mach->Addrs[index].xyzw[chan_index];
1291      break;
1292
1293   case TGSI_FILE_LOOP:
1294      assert(reg->Register.Index == 0);
1295      assert(mach->LoopCounterStackTop > 0);
1296      assert(chan_index == CHAN_X);
1297      dst = &mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[chan_index];
1298      break;
1299
1300   case TGSI_FILE_PREDICATE:
1301      index = reg->Register.Index;
1302      assert(index < TGSI_EXEC_NUM_PREDS);
1303      dst = &mach->Predicates[index].xyzw[chan_index];
1304      break;
1305
1306   default:
1307      assert( 0 );
1308      return;
1309   }
1310
1311   if (inst->Instruction.Predicate) {
1312      uint swizzle;
1313      union tgsi_exec_channel *pred;
1314
1315      switch (chan_index) {
1316      case CHAN_X:
1317         swizzle = inst->Predicate.SwizzleX;
1318         break;
1319      case CHAN_Y:
1320         swizzle = inst->Predicate.SwizzleY;
1321         break;
1322      case CHAN_Z:
1323         swizzle = inst->Predicate.SwizzleZ;
1324         break;
1325      case CHAN_W:
1326         swizzle = inst->Predicate.SwizzleW;
1327         break;
1328      default:
1329         assert(0);
1330         return;
1331      }
1332
1333      assert(inst->Predicate.Index == 0);
1334
1335      pred = &mach->Predicates[inst->Predicate.Index].xyzw[swizzle];
1336
1337      if (inst->Predicate.Negate) {
1338         for (i = 0; i < QUAD_SIZE; i++) {
1339            if (pred->u[i]) {
1340               execmask &= ~(1 << i);
1341            }
1342         }
1343      } else {
1344         for (i = 0; i < QUAD_SIZE; i++) {
1345            if (!pred->u[i]) {
1346               execmask &= ~(1 << i);
1347            }
1348         }
1349      }
1350   }
1351
1352   switch (inst->Instruction.Saturate) {
1353   case TGSI_SAT_NONE:
1354      for (i = 0; i < QUAD_SIZE; i++)
1355         if (execmask & (1 << i))
1356            dst->i[i] = chan->i[i];
1357      break;
1358
1359   case TGSI_SAT_ZERO_ONE:
1360      for (i = 0; i < QUAD_SIZE; i++)
1361         if (execmask & (1 << i)) {
1362            if (chan->f[i] < 0.0f)
1363               dst->f[i] = 0.0f;
1364            else if (chan->f[i] > 1.0f)
1365               dst->f[i] = 1.0f;
1366            else
1367               dst->i[i] = chan->i[i];
1368         }
1369      break;
1370
1371   case TGSI_SAT_MINUS_PLUS_ONE:
1372      for (i = 0; i < QUAD_SIZE; i++)
1373         if (execmask & (1 << i)) {
1374            if (chan->f[i] < -1.0f)
1375               dst->f[i] = -1.0f;
1376            else if (chan->f[i] > 1.0f)
1377               dst->f[i] = 1.0f;
1378            else
1379               dst->i[i] = chan->i[i];
1380         }
1381      break;
1382
1383   default:
1384      assert( 0 );
1385   }
1386}
1387
/**
 * Fetch channel CHAN of source operand INDEX of the current instruction
 * into VAL, as float data.  Relies on `mach` and `inst` being in scope
 * where the macro is used.
 */
#define FETCH(VAL, INDEX, CHAN) \
   fetch_source(mach, (VAL), &inst->Src[INDEX], (CHAN), TGSI_EXEC_DATA_FLOAT)

/**
 * Store VAL into channel CHAN of destination operand INDEX of the current
 * instruction, as float data.  Relies on `mach` and `inst` being in scope
 * where the macro is used.
 */
#define STORE(VAL, INDEX, CHAN) \
   store_dest(mach, (VAL), &inst->Dst[INDEX], inst, (CHAN), TGSI_EXEC_DATA_FLOAT)
1393
1394
1395/**
1396 * Execute ARB-style KIL which is predicated by a src register.
1397 * Kill fragment if any of the four values is less than zero.
1398 */
1399static void
1400exec_kil(struct tgsi_exec_machine *mach,
1401         const struct tgsi_full_instruction *inst)
1402{
1403   uint uniquemask;
1404   uint chan_index;
1405   uint kilmask = 0; /* bit 0 = pixel 0, bit 1 = pixel 1, etc */
1406   union tgsi_exec_channel r[1];
1407
1408   /* This mask stores component bits that were already tested. */
1409   uniquemask = 0;
1410
1411   for (chan_index = 0; chan_index < 4; chan_index++)
1412   {
1413      uint swizzle;
1414      uint i;
1415
1416      /* unswizzle channel */
1417      swizzle = tgsi_util_get_full_src_register_swizzle (
1418                        &inst->Src[0],
1419                        chan_index);
1420
1421      /* check if the component has not been already tested */
1422      if (uniquemask & (1 << swizzle))
1423         continue;
1424      uniquemask |= 1 << swizzle;
1425
1426      FETCH(&r[0], 0, chan_index);
1427      for (i = 0; i < 4; i++)
1428         if (r[0].f[i] < 0.0f)
1429            kilmask |= 1 << i;
1430   }
1431
1432   mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] |= kilmask;
1433}
1434
1435/**
1436 * Execute NVIDIA-style KIL which is predicated by a condition code.
1437 * Kill fragment if the condition code is TRUE.
1438 */
1439static void
1440exec_kilp(struct tgsi_exec_machine *mach,
1441          const struct tgsi_full_instruction *inst)
1442{
1443   uint kilmask; /* bit 0 = pixel 0, bit 1 = pixel 1, etc */
1444
1445   /* "unconditional" kil */
1446   kilmask = mach->ExecMask;
1447   mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] |= kilmask;
1448}
1449
1450static void
1451emit_vertex(struct tgsi_exec_machine *mach)
1452{
1453   /* FIXME: check for exec mask correctly
1454   unsigned i;
1455   for (i = 0; i < QUAD_SIZE; ++i) {
1456         if ((mach->ExecMask & (1 << i)))
1457   */
1458   if (mach->ExecMask) {
1459      mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] += mach->NumOutputs;
1460      mach->Primitives[mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]]++;
1461   }
1462}
1463
1464static void
1465emit_primitive(struct tgsi_exec_machine *mach)
1466{
1467   unsigned *prim_count = &mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0];
1468   /* FIXME: check for exec mask correctly
1469   unsigned i;
1470   for (i = 0; i < QUAD_SIZE; ++i) {
1471         if ((mach->ExecMask & (1 << i)))
1472   */
1473   if (mach->ExecMask) {
1474      ++(*prim_count);
1475      debug_assert((*prim_count * mach->NumOutputs) < mach->MaxGeometryShaderOutputs);
1476      mach->Primitives[*prim_count] = 0;
1477   }
1478}
1479
1480/*
1481 * Fetch a four texture samples using STR texture coordinates.
1482 */
1483static void
1484fetch_texel( struct tgsi_sampler *sampler,
1485             const union tgsi_exec_channel *s,
1486             const union tgsi_exec_channel *t,
1487             const union tgsi_exec_channel *p,
1488             const union tgsi_exec_channel *c0,
1489             enum tgsi_sampler_control control,
1490             union tgsi_exec_channel *r,
1491             union tgsi_exec_channel *g,
1492             union tgsi_exec_channel *b,
1493             union tgsi_exec_channel *a )
1494{
1495   uint j;
1496   float rgba[NUM_CHANNELS][QUAD_SIZE];
1497
1498   sampler->get_samples(sampler, s->f, t->f, p->f, c0->f, control, rgba);
1499
1500   for (j = 0; j < 4; j++) {
1501      r->f[j] = rgba[0][j];
1502      g->f[j] = rgba[1][j];
1503      b->f[j] = rgba[2][j];
1504      a->f[j] = rgba[3][j];
1505   }
1506}
1507
1508
/* Texture instruction modifiers understood by exec_tex(). */
#define TEX_MODIFIER_NONE         0  /* plain lookup */
#define TEX_MODIFIER_PROJECTED    1  /* coordinates are divided by W */
#define TEX_MODIFIER_LOD_BIAS     2  /* W is passed to the sampler as LOD bias */
#define TEX_MODIFIER_EXPLICIT_LOD 3  /* W is passed to the sampler as explicit LOD */
1513
1514
1515static void
1516exec_tex(struct tgsi_exec_machine *mach,
1517         const struct tgsi_full_instruction *inst,
1518         uint modifier)
1519{
1520   const uint unit = inst->Src[1].Register.Index;
1521   union tgsi_exec_channel r[4];
1522   const union tgsi_exec_channel *lod = &ZeroVec;
1523   enum tgsi_sampler_control control;
1524   uint chan_index;
1525
1526   if (modifier != TEX_MODIFIER_NONE) {
1527      FETCH(&r[3], 0, CHAN_W);
1528      if (modifier != TEX_MODIFIER_PROJECTED) {
1529         lod = &r[3];
1530      }
1531   }
1532
1533   if (modifier == TEX_MODIFIER_EXPLICIT_LOD) {
1534      control = tgsi_sampler_lod_explicit;
1535   } else {
1536      control = tgsi_sampler_lod_bias;
1537   }
1538
1539   switch (inst->Texture.Texture) {
1540   case TGSI_TEXTURE_1D:
1541   case TGSI_TEXTURE_SHADOW1D:
1542      FETCH(&r[0], 0, CHAN_X);
1543
1544      if (modifier == TEX_MODIFIER_PROJECTED) {
1545         micro_div(&r[0], &r[0], &r[3]);
1546      }
1547
1548      fetch_texel(mach->Samplers[unit],
1549                  &r[0], &ZeroVec, &ZeroVec, lod,  /* S, T, P, LOD */
1550                  control,
1551                  &r[0], &r[1], &r[2], &r[3]);     /* R, G, B, A */
1552      break;
1553
1554   case TGSI_TEXTURE_2D:
1555   case TGSI_TEXTURE_RECT:
1556   case TGSI_TEXTURE_SHADOW2D:
1557   case TGSI_TEXTURE_SHADOWRECT:
1558      FETCH(&r[0], 0, CHAN_X);
1559      FETCH(&r[1], 0, CHAN_Y);
1560      FETCH(&r[2], 0, CHAN_Z);
1561
1562      if (modifier == TEX_MODIFIER_PROJECTED) {
1563         micro_div(&r[0], &r[0], &r[3]);
1564         micro_div(&r[1], &r[1], &r[3]);
1565         micro_div(&r[2], &r[2], &r[3]);
1566      }
1567
1568      fetch_texel(mach->Samplers[unit],
1569                  &r[0], &r[1], &r[2], lod,     /* S, T, P, LOD */
1570                  control,
1571                  &r[0], &r[1], &r[2], &r[3]);  /* outputs */
1572      break;
1573
1574   case TGSI_TEXTURE_3D:
1575   case TGSI_TEXTURE_CUBE:
1576      FETCH(&r[0], 0, CHAN_X);
1577      FETCH(&r[1], 0, CHAN_Y);
1578      FETCH(&r[2], 0, CHAN_Z);
1579
1580      if (modifier == TEX_MODIFIER_PROJECTED) {
1581         micro_div(&r[0], &r[0], &r[3]);
1582         micro_div(&r[1], &r[1], &r[3]);
1583         micro_div(&r[2], &r[2], &r[3]);
1584      }
1585
1586      fetch_texel(mach->Samplers[unit],
1587                  &r[0], &r[1], &r[2], lod,
1588                  control,
1589                  &r[0], &r[1], &r[2], &r[3]);
1590      break;
1591
1592   default:
1593      assert(0);
1594   }
1595
1596   FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
1597      STORE(&r[chan_index], 0, chan_index);
1598   }
1599}
1600
1601static void
1602exec_txd(struct tgsi_exec_machine *mach,
1603         const struct tgsi_full_instruction *inst)
1604{
1605   const uint unit = inst->Src[3].Register.Index;
1606   union tgsi_exec_channel r[4];
1607   uint chan_index;
1608
1609   /*
1610    * XXX: This is fake TXD -- the derivatives are not taken into account, yet.
1611    */
1612
1613   switch (inst->Texture.Texture) {
1614   case TGSI_TEXTURE_1D:
1615   case TGSI_TEXTURE_SHADOW1D:
1616
1617      FETCH(&r[0], 0, CHAN_X);
1618
1619      fetch_texel(mach->Samplers[unit],
1620                  &r[0], &ZeroVec, &ZeroVec, &ZeroVec,   /* S, T, P, BIAS */
1621                  tgsi_sampler_lod_bias,
1622                  &r[0], &r[1], &r[2], &r[3]);           /* R, G, B, A */
1623      break;
1624
1625   case TGSI_TEXTURE_2D:
1626   case TGSI_TEXTURE_RECT:
1627   case TGSI_TEXTURE_SHADOW2D:
1628   case TGSI_TEXTURE_SHADOWRECT:
1629
1630      FETCH(&r[0], 0, CHAN_X);
1631      FETCH(&r[1], 0, CHAN_Y);
1632      FETCH(&r[2], 0, CHAN_Z);
1633
1634      fetch_texel(mach->Samplers[unit],
1635                  &r[0], &r[1], &r[2], &ZeroVec,   /* inputs */
1636                  tgsi_sampler_lod_bias,
1637                  &r[0], &r[1], &r[2], &r[3]);     /* outputs */
1638      break;
1639
1640   case TGSI_TEXTURE_3D:
1641   case TGSI_TEXTURE_CUBE:
1642
1643      FETCH(&r[0], 0, CHAN_X);
1644      FETCH(&r[1], 0, CHAN_Y);
1645      FETCH(&r[2], 0, CHAN_Z);
1646
1647      fetch_texel(mach->Samplers[unit],
1648                  &r[0], &r[1], &r[2], &ZeroVec,
1649                  tgsi_sampler_lod_bias,
1650                  &r[0], &r[1], &r[2], &r[3]);
1651      break;
1652
1653   default:
1654      assert(0);
1655   }
1656
1657   FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
1658      STORE(&r[chan_index], 0, chan_index);
1659   }
1660}
1661
1662
1663/**
1664 * Evaluate a constant-valued coefficient at the position of the
1665 * current quad.
1666 */
1667static void
1668eval_constant_coef(
1669   struct tgsi_exec_machine *mach,
1670   unsigned attrib,
1671   unsigned chan )
1672{
1673   unsigned i;
1674
1675   for( i = 0; i < QUAD_SIZE; i++ ) {
1676      mach->Inputs[attrib].xyzw[chan].f[i] = mach->InterpCoefs[attrib].a0[chan];
1677   }
1678}
1679
1680/**
1681 * Evaluate a linear-valued coefficient at the position of the
1682 * current quad.
1683 */
1684static void
1685eval_linear_coef(
1686   struct tgsi_exec_machine *mach,
1687   unsigned attrib,
1688   unsigned chan )
1689{
1690   const float x = mach->QuadPos.xyzw[0].f[0];
1691   const float y = mach->QuadPos.xyzw[1].f[0];
1692   const float dadx = mach->InterpCoefs[attrib].dadx[chan];
1693   const float dady = mach->InterpCoefs[attrib].dady[chan];
1694   const float a0 = mach->InterpCoefs[attrib].a0[chan] + dadx * x + dady * y;
1695   mach->Inputs[attrib].xyzw[chan].f[0] = a0;
1696   mach->Inputs[attrib].xyzw[chan].f[1] = a0 + dadx;
1697   mach->Inputs[attrib].xyzw[chan].f[2] = a0 + dady;
1698   mach->Inputs[attrib].xyzw[chan].f[3] = a0 + dadx + dady;
1699}
1700
1701/**
1702 * Evaluate a perspective-valued coefficient at the position of the
1703 * current quad.
1704 */
1705static void
1706eval_perspective_coef(
1707   struct tgsi_exec_machine *mach,
1708   unsigned attrib,
1709   unsigned chan )
1710{
1711   const float x = mach->QuadPos.xyzw[0].f[0];
1712   const float y = mach->QuadPos.xyzw[1].f[0];
1713   const float dadx = mach->InterpCoefs[attrib].dadx[chan];
1714   const float dady = mach->InterpCoefs[attrib].dady[chan];
1715   const float a0 = mach->InterpCoefs[attrib].a0[chan] + dadx * x + dady * y;
1716   const float *w = mach->QuadPos.xyzw[3].f;
1717   /* divide by W here */
1718   mach->Inputs[attrib].xyzw[chan].f[0] = a0 / w[0];
1719   mach->Inputs[attrib].xyzw[chan].f[1] = (a0 + dadx) / w[1];
1720   mach->Inputs[attrib].xyzw[chan].f[2] = (a0 + dady) / w[2];
1721   mach->Inputs[attrib].xyzw[chan].f[3] = (a0 + dadx + dady) / w[3];
1722}
1723
1724
/**
 * Signature shared by the eval_*_coef() functions: evaluate channel
 * `chan` of input attribute `attrib` for the current quad.
 */
typedef void (*eval_coef_func)(struct tgsi_exec_machine *mach,
                               unsigned attrib,
                               unsigned chan);
1729
1730static void
1731exec_declaration(struct tgsi_exec_machine *mach,
1732                 const struct tgsi_full_declaration *decl)
1733{
1734   if (mach->Processor == TGSI_PROCESSOR_FRAGMENT) {
1735      if (decl->Declaration.File == TGSI_FILE_INPUT ||
1736          decl->Declaration.File == TGSI_FILE_SYSTEM_VALUE) {
1737         uint first, last, mask;
1738
1739         first = decl->Range.First;
1740         last = decl->Range.Last;
1741         mask = decl->Declaration.UsageMask;
1742
1743         if (decl->Semantic.Name == TGSI_SEMANTIC_POSITION) {
1744            assert(decl->Semantic.Index == 0);
1745            assert(first == last);
1746            assert(mask == TGSI_WRITEMASK_XYZW);
1747
1748            mach->Inputs[first] = mach->QuadPos;
1749         } else if (decl->Semantic.Name == TGSI_SEMANTIC_FACE) {
1750            uint i;
1751
1752            assert(decl->Semantic.Index == 0);
1753            assert(first == last);
1754
1755            for (i = 0; i < QUAD_SIZE; i++) {
1756               mach->Inputs[first].xyzw[0].f[i] = mach->Face;
1757            }
1758         } else {
1759            eval_coef_func eval;
1760            uint i, j;
1761
1762            switch (decl->Declaration.Interpolate) {
1763            case TGSI_INTERPOLATE_CONSTANT:
1764               eval = eval_constant_coef;
1765               break;
1766
1767            case TGSI_INTERPOLATE_LINEAR:
1768               eval = eval_linear_coef;
1769               break;
1770
1771            case TGSI_INTERPOLATE_PERSPECTIVE:
1772               eval = eval_perspective_coef;
1773               break;
1774
1775            default:
1776               assert(0);
1777               return;
1778            }
1779
1780            for (j = 0; j < NUM_CHANNELS; j++) {
1781               if (mask & (1 << j)) {
1782                  for (i = first; i <= last; i++) {
1783                     eval(mach, i, j);
1784                  }
1785               }
1786            }
1787         }
1788      }
1789   }
1790}
1791
/**
 * Per-channel operation used by the exec_scalar_ and exec_vector_
 * helpers.  The unary helpers pass a single source channel; the binary
 * and trinary helpers pass an array of two or three source channels
 * through the same `src` pointer.
 */
typedef void (*micro_op)(union tgsi_exec_channel *dst,
                         const union tgsi_exec_channel *src);
1794
1795static void
1796exec_scalar_unary(struct tgsi_exec_machine *mach,
1797                  const struct tgsi_full_instruction *inst,
1798                  micro_op op,
1799                  enum tgsi_exec_datatype dst_datatype,
1800                  enum tgsi_exec_datatype src_datatype)
1801{
1802   unsigned int chan;
1803   union tgsi_exec_channel src;
1804   union tgsi_exec_channel dst;
1805
1806   fetch_source(mach, &src, &inst->Src[0], CHAN_X, src_datatype);
1807   op(&dst, &src);
1808   for (chan = 0; chan < NUM_CHANNELS; chan++) {
1809      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
1810         store_dest(mach, &dst, &inst->Dst[0], inst, chan, dst_datatype);
1811      }
1812   }
1813}
1814
1815static void
1816exec_vector_unary(struct tgsi_exec_machine *mach,
1817                  const struct tgsi_full_instruction *inst,
1818                  micro_op op,
1819                  enum tgsi_exec_datatype dst_datatype,
1820                  enum tgsi_exec_datatype src_datatype)
1821{
1822   unsigned int chan;
1823   struct tgsi_exec_vector dst;
1824
1825   for (chan = 0; chan < NUM_CHANNELS; chan++) {
1826      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
1827         union tgsi_exec_channel src;
1828
1829         fetch_source(mach, &src, &inst->Src[0], chan, src_datatype);
1830         op(&dst.xyzw[chan], &src);
1831      }
1832   }
1833   for (chan = 0; chan < NUM_CHANNELS; chan++) {
1834      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
1835         store_dest(mach, &dst.xyzw[chan], &inst->Dst[0], inst, chan, dst_datatype);
1836      }
1837   }
1838}
1839
1840static void
1841exec_vector_binary(struct tgsi_exec_machine *mach,
1842                   const struct tgsi_full_instruction *inst,
1843                   micro_op op,
1844                   enum tgsi_exec_datatype dst_datatype,
1845                   enum tgsi_exec_datatype src_datatype)
1846{
1847   unsigned int chan;
1848   struct tgsi_exec_vector dst;
1849
1850   for (chan = 0; chan < NUM_CHANNELS; chan++) {
1851      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
1852         union tgsi_exec_channel src[2];
1853
1854         fetch_source(mach, &src[0], &inst->Src[0], chan, src_datatype);
1855         fetch_source(mach, &src[1], &inst->Src[1], chan, src_datatype);
1856         op(&dst.xyzw[chan], src);
1857      }
1858   }
1859   for (chan = 0; chan < NUM_CHANNELS; chan++) {
1860      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
1861         store_dest(mach, &dst.xyzw[chan], &inst->Dst[0], inst, chan, dst_datatype);
1862      }
1863   }
1864}
1865
1866static void
1867exec_vector_trinary(struct tgsi_exec_machine *mach,
1868                    const struct tgsi_full_instruction *inst,
1869                    micro_op op,
1870                    enum tgsi_exec_datatype dst_datatype,
1871                    enum tgsi_exec_datatype src_datatype)
1872{
1873   unsigned int chan;
1874   struct tgsi_exec_vector dst;
1875
1876   for (chan = 0; chan < NUM_CHANNELS; chan++) {
1877      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
1878         union tgsi_exec_channel src[3];
1879
1880         fetch_source(mach, &src[0], &inst->Src[0], chan, src_datatype);
1881         fetch_source(mach, &src[1], &inst->Src[1], chan, src_datatype);
1882         fetch_source(mach, &src[2], &inst->Src[2], chan, src_datatype);
1883         op(&dst.xyzw[chan], src);
1884      }
1885   }
1886   for (chan = 0; chan < NUM_CHANNELS; chan++) {
1887      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
1888         store_dest(mach, &dst.xyzw[chan], &inst->Dst[0], inst, chan, dst_datatype);
1889      }
1890   }
1891}
1892
1893static void
1894exec_dp3(struct tgsi_exec_machine *mach,
1895         const struct tgsi_full_instruction *inst)
1896{
1897   unsigned int chan;
1898   union tgsi_exec_channel arg[3];
1899
1900   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
1901   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_X, TGSI_EXEC_DATA_FLOAT);
1902   micro_mul(&arg[2], &arg[0], &arg[1]);
1903
1904   for (chan = CHAN_Y; chan <= CHAN_Z; chan++) {
1905      fetch_source(mach, &arg[0], &inst->Src[0], chan, TGSI_EXEC_DATA_FLOAT);
1906      fetch_source(mach, &arg[1], &inst->Src[1], chan, TGSI_EXEC_DATA_FLOAT);
1907      micro_mad(&arg[2], arg);
1908   }
1909
1910   for (chan = 0; chan < NUM_CHANNELS; chan++) {
1911      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
1912         store_dest(mach, &arg[2], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
1913      }
1914   }
1915}
1916
1917static void
1918exec_dp4(struct tgsi_exec_machine *mach,
1919         const struct tgsi_full_instruction *inst)
1920{
1921   unsigned int chan;
1922   union tgsi_exec_channel arg[3];
1923
1924   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
1925   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_X, TGSI_EXEC_DATA_FLOAT);
1926   micro_mul(&arg[2], &arg[0], &arg[1]);
1927
1928   for (chan = CHAN_Y; chan <= CHAN_W; chan++) {
1929      fetch_source(mach, &arg[0], &inst->Src[0], chan, TGSI_EXEC_DATA_FLOAT);
1930      fetch_source(mach, &arg[1], &inst->Src[1], chan, TGSI_EXEC_DATA_FLOAT);
1931      micro_mad(&arg[2], arg);
1932   }
1933
1934   for (chan = 0; chan < NUM_CHANNELS; chan++) {
1935      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
1936         store_dest(mach, &arg[2], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
1937      }
1938   }
1939}
1940
1941static void
1942exec_dp2a(struct tgsi_exec_machine *mach,
1943          const struct tgsi_full_instruction *inst)
1944{
1945   unsigned int chan;
1946   union tgsi_exec_channel arg[3];
1947
1948   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
1949   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_X, TGSI_EXEC_DATA_FLOAT);
1950   micro_mul(&arg[2], &arg[0], &arg[1]);
1951
1952   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
1953   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
1954   micro_mad(&arg[0], arg);
1955
1956   fetch_source(mach, &arg[1], &inst->Src[2], CHAN_X, TGSI_EXEC_DATA_FLOAT);
1957   micro_add(&arg[0], &arg[0], &arg[1]);
1958
1959   for (chan = 0; chan < NUM_CHANNELS; chan++) {
1960      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
1961         store_dest(mach, &arg[0], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
1962      }
1963   }
1964}
1965
1966static void
1967exec_dph(struct tgsi_exec_machine *mach,
1968         const struct tgsi_full_instruction *inst)
1969{
1970   unsigned int chan;
1971   union tgsi_exec_channel arg[3];
1972
1973   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
1974   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_X, TGSI_EXEC_DATA_FLOAT);
1975   micro_mul(&arg[2], &arg[0], &arg[1]);
1976
1977   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
1978   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
1979   micro_mad(&arg[2], arg);
1980
1981   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_Z, TGSI_EXEC_DATA_FLOAT);
1982   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_Z, TGSI_EXEC_DATA_FLOAT);
1983   micro_mad(&arg[0], arg);
1984
1985   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_W, TGSI_EXEC_DATA_FLOAT);
1986   micro_add(&arg[0], &arg[0], &arg[1]);
1987
1988   for (chan = 0; chan < NUM_CHANNELS; chan++) {
1989      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
1990         store_dest(mach, &arg[0], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
1991      }
1992   }
1993}
1994
1995static void
1996exec_dp2(struct tgsi_exec_machine *mach,
1997         const struct tgsi_full_instruction *inst)
1998{
1999   unsigned int chan;
2000   union tgsi_exec_channel arg[3];
2001
2002   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2003   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2004   micro_mul(&arg[2], &arg[0], &arg[1]);
2005
2006   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2007   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2008   micro_mad(&arg[2], arg);
2009
2010   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2011      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2012         store_dest(mach, &arg[2], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2013      }
2014   }
2015}
2016
2017static void
2018exec_break(struct tgsi_exec_machine *mach)
2019{
2020   if (mach->BreakType == TGSI_EXEC_BREAK_INSIDE_LOOP) {
2021      /* turn off loop channels for each enabled exec channel */
2022      mach->LoopMask &= ~mach->ExecMask;
2023      /* Todo: if mach->LoopMask == 0, jump to end of loop */
2024      UPDATE_EXEC_MASK(mach);
2025   } else {
2026      assert(mach->BreakType == TGSI_EXEC_BREAK_INSIDE_SWITCH);
2027
2028      mach->Switch.mask = 0x0;
2029
2030      UPDATE_EXEC_MASK(mach);
2031   }
2032}
2033
2034static void
2035exec_switch(struct tgsi_exec_machine *mach,
2036            const struct tgsi_full_instruction *inst)
2037{
2038   assert(mach->SwitchStackTop < TGSI_EXEC_MAX_SWITCH_NESTING);
2039   assert(mach->BreakStackTop < TGSI_EXEC_MAX_BREAK_STACK);
2040
2041   mach->SwitchStack[mach->SwitchStackTop++] = mach->Switch;
2042   fetch_source(mach, &mach->Switch.selector, &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_UINT);
2043   mach->Switch.mask = 0x0;
2044   mach->Switch.defaultMask = 0x0;
2045
2046   mach->BreakStack[mach->BreakStackTop++] = mach->BreakType;
2047   mach->BreakType = TGSI_EXEC_BREAK_INSIDE_SWITCH;
2048
2049   UPDATE_EXEC_MASK(mach);
2050}
2051
2052static void
2053exec_case(struct tgsi_exec_machine *mach,
2054          const struct tgsi_full_instruction *inst)
2055{
2056   uint prevMask = mach->SwitchStack[mach->SwitchStackTop - 1].mask;
2057   union tgsi_exec_channel src;
2058   uint mask = 0;
2059
2060   fetch_source(mach, &src, &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_UINT);
2061
2062   if (mach->Switch.selector.u[0] == src.u[0]) {
2063      mask |= 0x1;
2064   }
2065   if (mach->Switch.selector.u[1] == src.u[1]) {
2066      mask |= 0x2;
2067   }
2068   if (mach->Switch.selector.u[2] == src.u[2]) {
2069      mask |= 0x4;
2070   }
2071   if (mach->Switch.selector.u[3] == src.u[3]) {
2072      mask |= 0x8;
2073   }
2074
2075   mach->Switch.defaultMask |= mask;
2076
2077   mach->Switch.mask |= mask & prevMask;
2078
2079   UPDATE_EXEC_MASK(mach);
2080}
2081
2082static void
2083exec_default(struct tgsi_exec_machine *mach)
2084{
2085   uint prevMask = mach->SwitchStack[mach->SwitchStackTop - 1].mask;
2086
2087   mach->Switch.mask |= ~mach->Switch.defaultMask & prevMask;
2088
2089   UPDATE_EXEC_MASK(mach);
2090}
2091
2092static void
2093exec_endswitch(struct tgsi_exec_machine *mach)
2094{
2095   mach->Switch = mach->SwitchStack[--mach->SwitchStackTop];
2096   mach->BreakType = mach->BreakStack[--mach->BreakStackTop];
2097
2098   UPDATE_EXEC_MASK(mach);
2099}
2100
2101static void
2102micro_i2f(union tgsi_exec_channel *dst,
2103          const union tgsi_exec_channel *src)
2104{
2105   dst->f[0] = (float)src->i[0];
2106   dst->f[1] = (float)src->i[1];
2107   dst->f[2] = (float)src->i[2];
2108   dst->f[3] = (float)src->i[3];
2109}
2110
2111static void
2112micro_not(union tgsi_exec_channel *dst,
2113          const union tgsi_exec_channel *src)
2114{
2115   dst->u[0] = ~src->u[0];
2116   dst->u[1] = ~src->u[1];
2117   dst->u[2] = ~src->u[2];
2118   dst->u[3] = ~src->u[3];
2119}
2120
2121static void
2122micro_shl(union tgsi_exec_channel *dst,
2123          const union tgsi_exec_channel *src)
2124{
2125   dst->u[0] = src[0].u[0] << src[1].u[0];
2126   dst->u[1] = src[0].u[1] << src[1].u[1];
2127   dst->u[2] = src[0].u[2] << src[1].u[2];
2128   dst->u[3] = src[0].u[3] << src[1].u[3];
2129}
2130
2131static void
2132micro_and(union tgsi_exec_channel *dst,
2133          const union tgsi_exec_channel *src)
2134{
2135   dst->u[0] = src[0].u[0] & src[1].u[0];
2136   dst->u[1] = src[0].u[1] & src[1].u[1];
2137   dst->u[2] = src[0].u[2] & src[1].u[2];
2138   dst->u[3] = src[0].u[3] & src[1].u[3];
2139}
2140
2141static void
2142micro_or(union tgsi_exec_channel *dst,
2143         const union tgsi_exec_channel *src)
2144{
2145   dst->u[0] = src[0].u[0] | src[1].u[0];
2146   dst->u[1] = src[0].u[1] | src[1].u[1];
2147   dst->u[2] = src[0].u[2] | src[1].u[2];
2148   dst->u[3] = src[0].u[3] | src[1].u[3];
2149}
2150
2151static void
2152micro_xor(union tgsi_exec_channel *dst,
2153          const union tgsi_exec_channel *src)
2154{
2155   dst->u[0] = src[0].u[0] ^ src[1].u[0];
2156   dst->u[1] = src[0].u[1] ^ src[1].u[1];
2157   dst->u[2] = src[0].u[2] ^ src[1].u[2];
2158   dst->u[3] = src[0].u[3] ^ src[1].u[3];
2159}
2160
2161static void
2162micro_f2i(union tgsi_exec_channel *dst,
2163          const union tgsi_exec_channel *src)
2164{
2165   dst->i[0] = (int)src->f[0];
2166   dst->i[1] = (int)src->f[1];
2167   dst->i[2] = (int)src->f[2];
2168   dst->i[3] = (int)src->f[3];
2169}
2170
2171static void
2172micro_idiv(union tgsi_exec_channel *dst,
2173           const union tgsi_exec_channel *src)
2174{
2175   dst->i[0] = src[0].i[0] / src[1].i[0];
2176   dst->i[1] = src[0].i[1] / src[1].i[1];
2177   dst->i[2] = src[0].i[2] / src[1].i[2];
2178   dst->i[3] = src[0].i[3] / src[1].i[3];
2179}
2180
2181static void
2182micro_imax(union tgsi_exec_channel *dst,
2183           const union tgsi_exec_channel *src)
2184{
2185   dst->i[0] = src[0].i[0] > src[1].i[0] ? src[0].i[0] : src[1].i[0];
2186   dst->i[1] = src[0].i[1] > src[1].i[1] ? src[0].i[1] : src[1].i[1];
2187   dst->i[2] = src[0].i[2] > src[1].i[2] ? src[0].i[2] : src[1].i[2];
2188   dst->i[3] = src[0].i[3] > src[1].i[3] ? src[0].i[3] : src[1].i[3];
2189}
2190
2191static void
2192micro_imin(union tgsi_exec_channel *dst,
2193           const union tgsi_exec_channel *src)
2194{
2195   dst->i[0] = src[0].i[0] < src[1].i[0] ? src[0].i[0] : src[1].i[0];
2196   dst->i[1] = src[0].i[1] < src[1].i[1] ? src[0].i[1] : src[1].i[1];
2197   dst->i[2] = src[0].i[2] < src[1].i[2] ? src[0].i[2] : src[1].i[2];
2198   dst->i[3] = src[0].i[3] < src[1].i[3] ? src[0].i[3] : src[1].i[3];
2199}
2200
2201static void
2202micro_isge(union tgsi_exec_channel *dst,
2203           const union tgsi_exec_channel *src)
2204{
2205   dst->i[0] = src[0].i[0] >= src[1].i[0] ? -1 : 0;
2206   dst->i[1] = src[0].i[1] >= src[1].i[1] ? -1 : 0;
2207   dst->i[2] = src[0].i[2] >= src[1].i[2] ? -1 : 0;
2208   dst->i[3] = src[0].i[3] >= src[1].i[3] ? -1 : 0;
2209}
2210
2211static void
2212micro_ishr(union tgsi_exec_channel *dst,
2213           const union tgsi_exec_channel *src)
2214{
2215   dst->i[0] = src[0].i[0] >> src[1].i[0];
2216   dst->i[1] = src[0].i[1] >> src[1].i[1];
2217   dst->i[2] = src[0].i[2] >> src[1].i[2];
2218   dst->i[3] = src[0].i[3] >> src[1].i[3];
2219}
2220
2221static void
2222micro_islt(union tgsi_exec_channel *dst,
2223           const union tgsi_exec_channel *src)
2224{
2225   dst->i[0] = src[0].i[0] < src[1].i[0] ? -1 : 0;
2226   dst->i[1] = src[0].i[1] < src[1].i[1] ? -1 : 0;
2227   dst->i[2] = src[0].i[2] < src[1].i[2] ? -1 : 0;
2228   dst->i[3] = src[0].i[3] < src[1].i[3] ? -1 : 0;
2229}
2230
2231static void
2232micro_f2u(union tgsi_exec_channel *dst,
2233          const union tgsi_exec_channel *src)
2234{
2235   dst->u[0] = (uint)src->f[0];
2236   dst->u[1] = (uint)src->f[1];
2237   dst->u[2] = (uint)src->f[2];
2238   dst->u[3] = (uint)src->f[3];
2239}
2240
2241static void
2242micro_u2f(union tgsi_exec_channel *dst,
2243          const union tgsi_exec_channel *src)
2244{
2245   dst->f[0] = (float)src->u[0];
2246   dst->f[1] = (float)src->u[1];
2247   dst->f[2] = (float)src->u[2];
2248   dst->f[3] = (float)src->u[3];
2249}
2250
2251static void
2252micro_uadd(union tgsi_exec_channel *dst,
2253           const union tgsi_exec_channel *src)
2254{
2255   dst->u[0] = src[0].u[0] + src[1].u[0];
2256   dst->u[1] = src[0].u[1] + src[1].u[1];
2257   dst->u[2] = src[0].u[2] + src[1].u[2];
2258   dst->u[3] = src[0].u[3] + src[1].u[3];
2259}
2260
2261static void
2262micro_udiv(union tgsi_exec_channel *dst,
2263           const union tgsi_exec_channel *src)
2264{
2265   dst->u[0] = src[0].u[0] / src[1].u[0];
2266   dst->u[1] = src[0].u[1] / src[1].u[1];
2267   dst->u[2] = src[0].u[2] / src[1].u[2];
2268   dst->u[3] = src[0].u[3] / src[1].u[3];
2269}
2270
2271static void
2272micro_umad(union tgsi_exec_channel *dst,
2273           const union tgsi_exec_channel *src)
2274{
2275   dst->u[0] = src[0].u[0] * src[1].u[0] + src[2].u[0];
2276   dst->u[1] = src[0].u[1] * src[1].u[1] + src[2].u[1];
2277   dst->u[2] = src[0].u[2] * src[1].u[2] + src[2].u[2];
2278   dst->u[3] = src[0].u[3] * src[1].u[3] + src[2].u[3];
2279}
2280
2281static void
2282micro_umax(union tgsi_exec_channel *dst,
2283           const union tgsi_exec_channel *src)
2284{
2285   dst->u[0] = src[0].u[0] > src[1].u[0] ? src[0].u[0] : src[1].u[0];
2286   dst->u[1] = src[0].u[1] > src[1].u[1] ? src[0].u[1] : src[1].u[1];
2287   dst->u[2] = src[0].u[2] > src[1].u[2] ? src[0].u[2] : src[1].u[2];
2288   dst->u[3] = src[0].u[3] > src[1].u[3] ? src[0].u[3] : src[1].u[3];
2289}
2290
2291static void
2292micro_umin(union tgsi_exec_channel *dst,
2293           const union tgsi_exec_channel *src)
2294{
2295   dst->u[0] = src[0].u[0] < src[1].u[0] ? src[0].u[0] : src[1].u[0];
2296   dst->u[1] = src[0].u[1] < src[1].u[1] ? src[0].u[1] : src[1].u[1];
2297   dst->u[2] = src[0].u[2] < src[1].u[2] ? src[0].u[2] : src[1].u[2];
2298   dst->u[3] = src[0].u[3] < src[1].u[3] ? src[0].u[3] : src[1].u[3];
2299}
2300
2301static void
2302micro_umod(union tgsi_exec_channel *dst,
2303           const union tgsi_exec_channel *src)
2304{
2305   dst->u[0] = src[0].u[0] % src[1].u[0];
2306   dst->u[1] = src[0].u[1] % src[1].u[1];
2307   dst->u[2] = src[0].u[2] % src[1].u[2];
2308   dst->u[3] = src[0].u[3] % src[1].u[3];
2309}
2310
2311static void
2312micro_umul(union tgsi_exec_channel *dst,
2313           const union tgsi_exec_channel *src)
2314{
2315   dst->u[0] = src[0].u[0] * src[1].u[0];
2316   dst->u[1] = src[0].u[1] * src[1].u[1];
2317   dst->u[2] = src[0].u[2] * src[1].u[2];
2318   dst->u[3] = src[0].u[3] * src[1].u[3];
2319}
2320
2321static void
2322micro_useq(union tgsi_exec_channel *dst,
2323           const union tgsi_exec_channel *src)
2324{
2325   dst->u[0] = src[0].u[0] == src[1].u[0] ? ~0 : 0;
2326   dst->u[1] = src[0].u[1] == src[1].u[1] ? ~0 : 0;
2327   dst->u[2] = src[0].u[2] == src[1].u[2] ? ~0 : 0;
2328   dst->u[3] = src[0].u[3] == src[1].u[3] ? ~0 : 0;
2329}
2330
2331static void
2332micro_usge(union tgsi_exec_channel *dst,
2333           const union tgsi_exec_channel *src)
2334{
2335   dst->u[0] = src[0].u[0] >= src[1].u[0] ? ~0 : 0;
2336   dst->u[1] = src[0].u[1] >= src[1].u[1] ? ~0 : 0;
2337   dst->u[2] = src[0].u[2] >= src[1].u[2] ? ~0 : 0;
2338   dst->u[3] = src[0].u[3] >= src[1].u[3] ? ~0 : 0;
2339}
2340
2341static void
2342micro_ushr(union tgsi_exec_channel *dst,
2343           const union tgsi_exec_channel *src)
2344{
2345   dst->u[0] = src[0].u[0] >> src[1].u[0];
2346   dst->u[1] = src[0].u[1] >> src[1].u[1];
2347   dst->u[2] = src[0].u[2] >> src[1].u[2];
2348   dst->u[3] = src[0].u[3] >> src[1].u[3];
2349}
2350
2351static void
2352micro_uslt(union tgsi_exec_channel *dst,
2353           const union tgsi_exec_channel *src)
2354{
2355   dst->u[0] = src[0].u[0] < src[1].u[0] ? ~0 : 0;
2356   dst->u[1] = src[0].u[1] < src[1].u[1] ? ~0 : 0;
2357   dst->u[2] = src[0].u[2] < src[1].u[2] ? ~0 : 0;
2358   dst->u[3] = src[0].u[3] < src[1].u[3] ? ~0 : 0;
2359}
2360
2361static void
2362micro_usne(union tgsi_exec_channel *dst,
2363           const union tgsi_exec_channel *src)
2364{
2365   dst->u[0] = src[0].u[0] != src[1].u[0] ? ~0 : 0;
2366   dst->u[1] = src[0].u[1] != src[1].u[1] ? ~0 : 0;
2367   dst->u[2] = src[0].u[2] != src[1].u[2] ? ~0 : 0;
2368   dst->u[3] = src[0].u[3] != src[1].u[3] ? ~0 : 0;
2369}
2370
2371static void
2372exec_instruction(
2373   struct tgsi_exec_machine *mach,
2374   const struct tgsi_full_instruction *inst,
2375   int *pc )
2376{
2377   uint chan_index;
2378   union tgsi_exec_channel r[10];
2379   union tgsi_exec_channel d[8];
2380
2381   (*pc)++;
2382
2383   switch (inst->Instruction.Opcode) {
2384   case TGSI_OPCODE_ARL:
2385      exec_vector_unary(mach, inst, micro_arl, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_FLOAT);
2386      break;
2387
2388   case TGSI_OPCODE_MOV:
2389      exec_vector_unary(mach, inst, micro_mov, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_FLOAT);
2390      break;
2391
2392   case TGSI_OPCODE_LIT:
2393      if (IS_CHANNEL_ENABLED( *inst, CHAN_Y ) || IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
2394         FETCH( &r[0], 0, CHAN_X );
2395         if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
2396            micro_max(&d[CHAN_Y], &r[0], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C]);
2397         }
2398
2399         if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
2400            FETCH( &r[1], 0, CHAN_Y );
2401            micro_max( &r[1], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] );
2402
2403            FETCH( &r[2], 0, CHAN_W );
2404            micro_min( &r[2], &r[2], &mach->Temps[TEMP_128_I].xyzw[TEMP_128_C] );
2405            micro_max( &r[2], &r[2], &mach->Temps[TEMP_M128_I].xyzw[TEMP_M128_C] );
2406            micro_pow( &r[1], &r[1], &r[2] );
2407            micro_lt(&d[CHAN_Z], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &r[0], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C]);
2408         }
2409
2410         if (IS_CHANNEL_ENABLED(*inst, CHAN_Y)) {
2411            STORE(&d[CHAN_Y], 0, CHAN_Y);
2412         }
2413         if (IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2414            STORE(&d[CHAN_Z], 0, CHAN_Z);
2415         }
2416      }
2417      if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) {
2418         STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_X );
2419      }
2420      if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
2421         STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
2422      }
2423      break;
2424
2425   case TGSI_OPCODE_RCP:
2426      exec_scalar_unary(mach, inst, micro_rcp, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2427      break;
2428
2429   case TGSI_OPCODE_RSQ:
2430      exec_scalar_unary(mach, inst, micro_rsq, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2431      break;
2432
2433   case TGSI_OPCODE_EXP:
2434      FETCH( &r[0], 0, CHAN_X );
2435      micro_flr( &r[1], &r[0] );  /* r1 = floor(r0) */
2436      if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) {
2437         micro_exp2( &r[2], &r[1] );       /* r2 = 2 ^ r1 */
2438         STORE( &r[2], 0, CHAN_X );        /* store r2 */
2439      }
2440      if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
2441         micro_sub( &r[2], &r[0], &r[1] ); /* r2 = r0 - r1 */
2442         STORE( &r[2], 0, CHAN_Y );        /* store r2 */
2443      }
2444      if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
2445         micro_exp2( &r[2], &r[0] );       /* r2 = 2 ^ r0 */
2446         STORE( &r[2], 0, CHAN_Z );        /* store r2 */
2447      }
2448      if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
2449         STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
2450      }
2451      break;
2452
2453   case TGSI_OPCODE_LOG:
2454      FETCH( &r[0], 0, CHAN_X );
2455      micro_abs( &r[2], &r[0] );  /* r2 = abs(r0) */
2456      micro_lg2( &r[1], &r[2] );  /* r1 = lg2(r2) */
2457      micro_flr( &r[0], &r[1] );  /* r0 = floor(r1) */
2458      if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) {
2459         STORE( &r[0], 0, CHAN_X );
2460      }
2461      if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
2462         micro_exp2( &r[0], &r[0] );       /* r0 = 2 ^ r0 */
2463         micro_div( &r[0], &r[2], &r[0] ); /* r0 = r2 / r0 */
2464         STORE( &r[0], 0, CHAN_Y );
2465      }
2466      if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
2467         STORE( &r[1], 0, CHAN_Z );
2468      }
2469      if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
2470         STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
2471      }
2472      break;
2473
2474   case TGSI_OPCODE_MUL:
2475      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2476         FETCH(&r[0], 0, chan_index);
2477         FETCH(&r[1], 1, chan_index);
2478         micro_mul(&d[chan_index], &r[0], &r[1]);
2479      }
2480      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2481         STORE(&d[chan_index], 0, chan_index);
2482      }
2483      break;
2484
2485   case TGSI_OPCODE_ADD:
2486      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2487         FETCH( &r[0], 0, chan_index );
2488         FETCH( &r[1], 1, chan_index );
2489         micro_add(&d[chan_index], &r[0], &r[1]);
2490      }
2491      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2492         STORE(&d[chan_index], 0, chan_index);
2493      }
2494      break;
2495
2496   case TGSI_OPCODE_DP3:
2497      exec_dp3(mach, inst);
2498      break;
2499
2500   case TGSI_OPCODE_DP4:
2501      exec_dp4(mach, inst);
2502      break;
2503
2504   case TGSI_OPCODE_DST:
2505      if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
2506         FETCH( &r[0], 0, CHAN_Y );
2507         FETCH( &r[1], 1, CHAN_Y);
2508         micro_mul(&d[CHAN_Y], &r[0], &r[1]);
2509      }
2510      if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
2511         FETCH(&d[CHAN_Z], 0, CHAN_Z);
2512      }
2513      if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
2514         FETCH(&d[CHAN_W], 1, CHAN_W);
2515      }
2516
2517      if (IS_CHANNEL_ENABLED(*inst, CHAN_X)) {
2518         STORE(&mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_X);
2519      }
2520      if (IS_CHANNEL_ENABLED(*inst, CHAN_Y)) {
2521         STORE(&d[CHAN_Y], 0, CHAN_Y);
2522      }
2523      if (IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2524         STORE(&d[CHAN_Z], 0, CHAN_Z);
2525      }
2526      if (IS_CHANNEL_ENABLED(*inst, CHAN_W)) {
2527         STORE(&d[CHAN_W], 0, CHAN_W);
2528      }
2529      break;
2530
2531   case TGSI_OPCODE_MIN:
2532      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2533         FETCH(&r[0], 0, chan_index);
2534         FETCH(&r[1], 1, chan_index);
2535
2536         /* XXX use micro_min()?? */
2537         micro_lt(&d[chan_index], &r[0], &r[1], &r[0], &r[1]);
2538      }
2539      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2540         STORE(&d[chan_index], 0, chan_index);
2541      }
2542      break;
2543
2544   case TGSI_OPCODE_MAX:
2545      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2546         FETCH(&r[0], 0, chan_index);
2547         FETCH(&r[1], 1, chan_index);
2548
2549         /* XXX use micro_max()?? */
2550         micro_lt(&d[chan_index], &r[0], &r[1], &r[1], &r[0] );
2551      }
2552      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2553         STORE(&d[chan_index], 0, chan_index);
2554      }
2555      break;
2556
2557   case TGSI_OPCODE_SLT:
2558      exec_vector_binary(mach, inst, micro_slt, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2559      break;
2560
2561   case TGSI_OPCODE_SGE:
2562      exec_vector_binary(mach, inst, micro_sge, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2563      break;
2564
2565   case TGSI_OPCODE_MAD:
2566      exec_vector_trinary(mach, inst, micro_mad, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2567      break;
2568
2569   case TGSI_OPCODE_SUB:
2570      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2571         FETCH(&r[0], 0, chan_index);
2572         FETCH(&r[1], 1, chan_index);
2573         micro_sub(&d[chan_index], &r[0], &r[1]);
2574      }
2575      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2576         STORE(&d[chan_index], 0, chan_index);
2577      }
2578      break;
2579
2580   case TGSI_OPCODE_LRP:
2581      exec_vector_trinary(mach, inst, micro_lrp, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2582      break;
2583
2584   case TGSI_OPCODE_CND:
2585      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2586         FETCH(&r[0], 0, chan_index);
2587         FETCH(&r[1], 1, chan_index);
2588         FETCH(&r[2], 2, chan_index);
2589         micro_lt(&d[chan_index], &mach->Temps[TEMP_HALF_I].xyzw[TEMP_HALF_C], &r[2], &r[0], &r[1]);
2590      }
2591      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2592         STORE(&d[chan_index], 0, chan_index);
2593      }
2594      break;
2595
2596   case TGSI_OPCODE_DP2A:
2597      exec_dp2a(mach, inst);
2598      break;
2599
2600   case TGSI_OPCODE_FRC:
2601      exec_vector_unary(mach, inst, micro_frc, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2602      break;
2603
2604   case TGSI_OPCODE_CLAMP:
2605      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2606         FETCH(&r[0], 0, chan_index);
2607         FETCH(&r[1], 1, chan_index);
2608         micro_max(&r[0], &r[0], &r[1]);
2609         FETCH(&r[1], 2, chan_index);
2610         micro_min(&d[chan_index], &r[0], &r[1]);
2611      }
2612      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2613         STORE(&d[chan_index], 0, chan_index);
2614      }
2615      break;
2616
2617   case TGSI_OPCODE_FLR:
2618      exec_vector_unary(mach, inst, micro_flr, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2619      break;
2620
2621   case TGSI_OPCODE_ROUND:
2622      exec_vector_unary(mach, inst, micro_rnd, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2623      break;
2624
2625   case TGSI_OPCODE_EX2:
2626      exec_scalar_unary(mach, inst, micro_exp2, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2627      break;
2628
2629   case TGSI_OPCODE_LG2:
2630      exec_scalar_unary(mach, inst, micro_lg2, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2631      break;
2632
2633   case TGSI_OPCODE_POW:
2634      FETCH(&r[0], 0, CHAN_X);
2635      FETCH(&r[1], 1, CHAN_X);
2636
2637      micro_pow( &r[0], &r[0], &r[1] );
2638
2639      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2640         STORE( &r[0], 0, chan_index );
2641      }
2642      break;
2643
2644   case TGSI_OPCODE_XPD:
2645      FETCH(&r[0], 0, CHAN_Y);
2646      FETCH(&r[1], 1, CHAN_Z);
2647
2648      micro_mul( &r[2], &r[0], &r[1] );
2649
2650      FETCH(&r[3], 0, CHAN_Z);
2651      FETCH(&r[4], 1, CHAN_Y);
2652
2653      micro_mul( &r[5], &r[3], &r[4] );
2654      micro_sub(&d[CHAN_X], &r[2], &r[5]);
2655
2656      FETCH(&r[2], 1, CHAN_X);
2657
2658      micro_mul( &r[3], &r[3], &r[2] );
2659
2660      FETCH(&r[5], 0, CHAN_X);
2661
2662      micro_mul( &r[1], &r[1], &r[5] );
2663      micro_sub(&d[CHAN_Y], &r[3], &r[1]);
2664
2665      micro_mul( &r[5], &r[5], &r[4] );
2666      micro_mul( &r[0], &r[0], &r[2] );
2667      micro_sub(&d[CHAN_Z], &r[5], &r[0]);
2668
2669      if (IS_CHANNEL_ENABLED(*inst, CHAN_X)) {
2670         STORE(&d[CHAN_X], 0, CHAN_X);
2671      }
2672      if (IS_CHANNEL_ENABLED(*inst, CHAN_Y)) {
2673         STORE(&d[CHAN_Y], 0, CHAN_Y);
2674      }
2675      if (IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2676         STORE(&d[CHAN_Z], 0, CHAN_Z);
2677      }
2678      if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
2679         STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
2680      }
2681      break;
2682
2683   case TGSI_OPCODE_ABS:
2684      exec_vector_unary(mach, inst, micro_abs, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2685      break;
2686
2687   case TGSI_OPCODE_RCC:
2688      FETCH(&r[0], 0, CHAN_X);
2689      micro_div(&r[0], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &r[0]);
2690      micro_float_clamp(&r[0], &r[0]);
2691      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2692         STORE(&r[0], 0, chan_index);
2693      }
2694      break;
2695
2696   case TGSI_OPCODE_DPH:
2697      exec_dph(mach, inst);
2698      break;
2699
2700   case TGSI_OPCODE_COS:
2701      exec_scalar_unary(mach, inst, micro_cos, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2702      break;
2703
2704   case TGSI_OPCODE_DDX:
2705      exec_vector_unary(mach, inst, micro_ddx, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2706      break;
2707
2708   case TGSI_OPCODE_DDY:
2709      exec_vector_unary(mach, inst, micro_ddy, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2710      break;
2711
2712   case TGSI_OPCODE_KILP:
2713      exec_kilp (mach, inst);
2714      break;
2715
2716   case TGSI_OPCODE_KIL:
2717      exec_kil (mach, inst);
2718      break;
2719
2720   case TGSI_OPCODE_PK2H:
2721      assert (0);
2722      break;
2723
2724   case TGSI_OPCODE_PK2US:
2725      assert (0);
2726      break;
2727
2728   case TGSI_OPCODE_PK4B:
2729      assert (0);
2730      break;
2731
2732   case TGSI_OPCODE_PK4UB:
2733      assert (0);
2734      break;
2735
2736   case TGSI_OPCODE_RFL:
2737      if (IS_CHANNEL_ENABLED(*inst, CHAN_X) ||
2738          IS_CHANNEL_ENABLED(*inst, CHAN_Y) ||
2739          IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2740         /* r0 = dp3(src0, src0) */
2741         FETCH(&r[2], 0, CHAN_X);
2742         micro_mul(&r[0], &r[2], &r[2]);
2743         FETCH(&r[4], 0, CHAN_Y);
2744         micro_mul(&r[8], &r[4], &r[4]);
2745         micro_add(&r[0], &r[0], &r[8]);
2746         FETCH(&r[6], 0, CHAN_Z);
2747         micro_mul(&r[8], &r[6], &r[6]);
2748         micro_add(&r[0], &r[0], &r[8]);
2749
2750         /* r1 = dp3(src0, src1) */
2751         FETCH(&r[3], 1, CHAN_X);
2752         micro_mul(&r[1], &r[2], &r[3]);
2753         FETCH(&r[5], 1, CHAN_Y);
2754         micro_mul(&r[8], &r[4], &r[5]);
2755         micro_add(&r[1], &r[1], &r[8]);
2756         FETCH(&r[7], 1, CHAN_Z);
2757         micro_mul(&r[8], &r[6], &r[7]);
2758         micro_add(&r[1], &r[1], &r[8]);
2759
2760         /* r1 = 2 * r1 / r0 */
2761         micro_add(&r[1], &r[1], &r[1]);
2762         micro_div(&r[1], &r[1], &r[0]);
2763
2764         if (IS_CHANNEL_ENABLED(*inst, CHAN_X)) {
2765            micro_mul(&r[2], &r[2], &r[1]);
2766            micro_sub(&r[2], &r[2], &r[3]);
2767            STORE(&r[2], 0, CHAN_X);
2768         }
2769         if (IS_CHANNEL_ENABLED(*inst, CHAN_Y)) {
2770            micro_mul(&r[4], &r[4], &r[1]);
2771            micro_sub(&r[4], &r[4], &r[5]);
2772            STORE(&r[4], 0, CHAN_Y);
2773         }
2774         if (IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2775            micro_mul(&r[6], &r[6], &r[1]);
2776            micro_sub(&r[6], &r[6], &r[7]);
2777            STORE(&r[6], 0, CHAN_Z);
2778         }
2779      }
2780      if (IS_CHANNEL_ENABLED(*inst, CHAN_W)) {
2781         STORE(&mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W);
2782      }
2783      break;
2784
2785   case TGSI_OPCODE_SEQ:
2786      exec_vector_binary(mach, inst, micro_seq, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2787      break;
2788
2789   case TGSI_OPCODE_SFL:
2790      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2791         STORE(&mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], 0, chan_index);
2792      }
2793      break;
2794
2795   case TGSI_OPCODE_SGT:
2796      exec_vector_binary(mach, inst, micro_sgt, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2797      break;
2798
2799   case TGSI_OPCODE_SIN:
2800      exec_scalar_unary(mach, inst, micro_sin, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2801      break;
2802
2803   case TGSI_OPCODE_SLE:
2804      exec_vector_binary(mach, inst, micro_sle, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2805      break;
2806
2807   case TGSI_OPCODE_SNE:
2808      exec_vector_binary(mach, inst, micro_sne, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2809      break;
2810
2811   case TGSI_OPCODE_STR:
2812      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2813         STORE(&mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, chan_index);
2814      }
2815      break;
2816
2817   case TGSI_OPCODE_TEX:
2818      /* simple texture lookup */
2819      /* src[0] = texcoord */
2820      /* src[1] = sampler unit */
2821      exec_tex(mach, inst, TEX_MODIFIER_NONE);
2822      break;
2823
2824   case TGSI_OPCODE_TXB:
2825      /* Texture lookup with lod bias */
2826      /* src[0] = texcoord (src[0].w = LOD bias) */
2827      /* src[1] = sampler unit */
2828      exec_tex(mach, inst, TEX_MODIFIER_LOD_BIAS);
2829      break;
2830
2831   case TGSI_OPCODE_TXD:
      /* Texture lookup with explicit partial derivatives */
2833      /* src[0] = texcoord */
2834      /* src[1] = d[strq]/dx */
2835      /* src[2] = d[strq]/dy */
2836      /* src[3] = sampler unit */
2837      exec_txd(mach, inst);
2838      break;
2839
2840   case TGSI_OPCODE_TXL:
      /* Texture lookup with explicit LOD */
2842      /* src[0] = texcoord (src[0].w = LOD) */
2843      /* src[1] = sampler unit */
2844      exec_tex(mach, inst, TEX_MODIFIER_EXPLICIT_LOD);
2845      break;
2846
2847   case TGSI_OPCODE_TXP:
2848      /* Texture lookup with projection */
2849      /* src[0] = texcoord (src[0].w = projection) */
2850      /* src[1] = sampler unit */
2851      exec_tex(mach, inst, TEX_MODIFIER_PROJECTED);
2852      break;
2853
2854   case TGSI_OPCODE_UP2H:
2855      assert (0);
2856      break;
2857
2858   case TGSI_OPCODE_UP2US:
2859      assert (0);
2860      break;
2861
2862   case TGSI_OPCODE_UP4B:
2863      assert (0);
2864      break;
2865
2866   case TGSI_OPCODE_UP4UB:
2867      assert (0);
2868      break;
2869
2870   case TGSI_OPCODE_X2D:
2871      FETCH(&r[0], 1, CHAN_X);
2872      FETCH(&r[1], 1, CHAN_Y);
2873      if (IS_CHANNEL_ENABLED(*inst, CHAN_X) ||
2874          IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2875         FETCH(&r[2], 2, CHAN_X);
2876         micro_mul(&r[2], &r[2], &r[0]);
2877         FETCH(&r[3], 2, CHAN_Y);
2878         micro_mul(&r[3], &r[3], &r[1]);
2879         micro_add(&r[2], &r[2], &r[3]);
2880         FETCH(&r[3], 0, CHAN_X);
2881         micro_add(&d[CHAN_X], &r[2], &r[3]);
2882
2883      }
2884      if (IS_CHANNEL_ENABLED(*inst, CHAN_Y) ||
2885          IS_CHANNEL_ENABLED(*inst, CHAN_W)) {
2886         FETCH(&r[2], 2, CHAN_Z);
2887         micro_mul(&r[2], &r[2], &r[0]);
2888         FETCH(&r[3], 2, CHAN_W);
2889         micro_mul(&r[3], &r[3], &r[1]);
2890         micro_add(&r[2], &r[2], &r[3]);
2891         FETCH(&r[3], 0, CHAN_Y);
2892         micro_add(&d[CHAN_Y], &r[2], &r[3]);
2893
2894      }
2895      if (IS_CHANNEL_ENABLED(*inst, CHAN_X)) {
2896         STORE(&d[CHAN_X], 0, CHAN_X);
2897      }
2898      if (IS_CHANNEL_ENABLED(*inst, CHAN_Y)) {
2899         STORE(&d[CHAN_Y], 0, CHAN_Y);
2900      }
2901      if (IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2902         STORE(&d[CHAN_X], 0, CHAN_Z);
2903      }
2904      if (IS_CHANNEL_ENABLED(*inst, CHAN_W)) {
2905         STORE(&d[CHAN_Y], 0, CHAN_W);
2906      }
2907      break;
2908
2909   case TGSI_OPCODE_ARA:
2910      assert (0);
2911      break;
2912
2913   case TGSI_OPCODE_ARR:
2914      exec_vector_unary(mach, inst, micro_arr, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_FLOAT);
2915      break;
2916
2917   case TGSI_OPCODE_BRA:
2918      assert (0);
2919      break;
2920
2921   case TGSI_OPCODE_CAL:
2922      /* skip the call if no execution channels are enabled */
2923      if (mach->ExecMask) {
2924         /* do the call */
2925
2926         /* First, record the depths of the execution stacks.
2927          * This is important for deeply nested/looped return statements.
2928          * We have to unwind the stacks by the correct amount.  For a
2929          * real code generator, we could determine the number of entries
2930          * to pop off each stack with simple static analysis and avoid
2931          * implementing this data structure at run time.
2932          */
2933         mach->CallStack[mach->CallStackTop].CondStackTop = mach->CondStackTop;
2934         mach->CallStack[mach->CallStackTop].LoopStackTop = mach->LoopStackTop;
2935         mach->CallStack[mach->CallStackTop].ContStackTop = mach->ContStackTop;
2936         mach->CallStack[mach->CallStackTop].SwitchStackTop = mach->SwitchStackTop;
2937         mach->CallStack[mach->CallStackTop].BreakStackTop = mach->BreakStackTop;
2938         /* note that PC was already incremented above */
2939         mach->CallStack[mach->CallStackTop].ReturnAddr = *pc;
2940
2941         mach->CallStackTop++;
2942
2943         /* Second, push the Cond, Loop, Cont, Func stacks */
2944         assert(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING);
2945         assert(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
2946         assert(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
2947         assert(mach->SwitchStackTop < TGSI_EXEC_MAX_SWITCH_NESTING);
2948         assert(mach->BreakStackTop < TGSI_EXEC_MAX_BREAK_STACK);
2949         assert(mach->FuncStackTop < TGSI_EXEC_MAX_CALL_NESTING);
2950
2951         mach->CondStack[mach->CondStackTop++] = mach->CondMask;
2952         mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask;
2953         mach->ContStack[mach->ContStackTop++] = mach->ContMask;
2954         mach->SwitchStack[mach->SwitchStackTop++] = mach->Switch;
2955         mach->BreakStack[mach->BreakStackTop++] = mach->BreakType;
2956         mach->FuncStack[mach->FuncStackTop++] = mach->FuncMask;
2957
2958         /* Finally, jump to the subroutine */
2959         *pc = inst->Label.Label;
2960      }
2961      break;
2962
2963   case TGSI_OPCODE_RET:
2964      mach->FuncMask &= ~mach->ExecMask;
2965      UPDATE_EXEC_MASK(mach);
2966
2967      if (mach->FuncMask == 0x0) {
2968         /* really return now (otherwise, keep executing */
2969
2970         if (mach->CallStackTop == 0) {
2971            /* returning from main() */
2972            *pc = -1;
2973            return;
2974         }
2975
2976         assert(mach->CallStackTop > 0);
2977         mach->CallStackTop--;
2978
2979         mach->CondStackTop = mach->CallStack[mach->CallStackTop].CondStackTop;
2980         mach->CondMask = mach->CondStack[mach->CondStackTop];
2981
2982         mach->LoopStackTop = mach->CallStack[mach->CallStackTop].LoopStackTop;
2983         mach->LoopMask = mach->LoopStack[mach->LoopStackTop];
2984
2985         mach->ContStackTop = mach->CallStack[mach->CallStackTop].ContStackTop;
2986         mach->ContMask = mach->ContStack[mach->ContStackTop];
2987
2988         mach->SwitchStackTop = mach->CallStack[mach->CallStackTop].SwitchStackTop;
2989         mach->Switch = mach->SwitchStack[mach->SwitchStackTop];
2990
2991         mach->BreakStackTop = mach->CallStack[mach->CallStackTop].BreakStackTop;
2992         mach->BreakType = mach->BreakStack[mach->BreakStackTop];
2993
2994         assert(mach->FuncStackTop > 0);
2995         mach->FuncMask = mach->FuncStack[--mach->FuncStackTop];
2996
2997         *pc = mach->CallStack[mach->CallStackTop].ReturnAddr;
2998
2999         UPDATE_EXEC_MASK(mach);
3000      }
3001      break;
3002
3003   case TGSI_OPCODE_SSG:
3004      exec_vector_unary(mach, inst, micro_sgn, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3005      break;
3006
3007   case TGSI_OPCODE_CMP:
3008      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
3009         FETCH(&r[0], 0, chan_index);
3010         FETCH(&r[1], 1, chan_index);
3011         FETCH(&r[2], 2, chan_index);
3012         micro_lt(&d[chan_index], &r[0], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &r[1], &r[2]);
3013      }
3014      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
3015         STORE(&d[chan_index], 0, chan_index);
3016      }
3017      break;
3018
3019   case TGSI_OPCODE_SCS:
3020      if( IS_CHANNEL_ENABLED( *inst, CHAN_X ) || IS_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
3021         FETCH( &r[0], 0, CHAN_X );
3022         if (IS_CHANNEL_ENABLED(*inst, CHAN_X)) {
3023            micro_cos(&r[1], &r[0]);
3024            STORE(&r[1], 0, CHAN_X);
3025         }
3026         if (IS_CHANNEL_ENABLED(*inst, CHAN_Y)) {
3027            micro_sin(&r[1], &r[0]);
3028            STORE(&r[1], 0, CHAN_Y);
3029         }
3030      }
3031      if( IS_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
3032         STORE( &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], 0, CHAN_Z );
3033      }
3034      if( IS_CHANNEL_ENABLED( *inst, CHAN_W ) ) {
3035         STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
3036      }
3037      break;
3038
3039   case TGSI_OPCODE_NRM:
3040      /* 3-component vector normalize */
3041      if(IS_CHANNEL_ENABLED(*inst, CHAN_X) ||
3042         IS_CHANNEL_ENABLED(*inst, CHAN_Y) ||
3043         IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
3044         /* r3 = sqrt(dp3(src0, src0)) */
3045         FETCH(&r[0], 0, CHAN_X);
3046         micro_mul(&r[3], &r[0], &r[0]);
3047         FETCH(&r[1], 0, CHAN_Y);
3048         micro_mul(&r[4], &r[1], &r[1]);
3049         micro_add(&r[3], &r[3], &r[4]);
3050         FETCH(&r[2], 0, CHAN_Z);
3051         micro_mul(&r[4], &r[2], &r[2]);
3052         micro_add(&r[3], &r[3], &r[4]);
3053         micro_sqrt(&r[3], &r[3]);
3054
3055         if (IS_CHANNEL_ENABLED(*inst, CHAN_X)) {
3056            micro_div(&r[0], &r[0], &r[3]);
3057            STORE(&r[0], 0, CHAN_X);
3058         }
3059         if (IS_CHANNEL_ENABLED(*inst, CHAN_Y)) {
3060            micro_div(&r[1], &r[1], &r[3]);
3061            STORE(&r[1], 0, CHAN_Y);
3062         }
3063         if (IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
3064            micro_div(&r[2], &r[2], &r[3]);
3065            STORE(&r[2], 0, CHAN_Z);
3066         }
3067      }
3068      if (IS_CHANNEL_ENABLED(*inst, CHAN_W)) {
3069         STORE(&mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W);
3070      }
3071      break;
3072
3073   case TGSI_OPCODE_NRM4:
3074      /* 4-component vector normalize */
3075      {
3076         union tgsi_exec_channel tmp, dot;
3077
3078         /* tmp = dp4(src0, src0): */
3079         FETCH( &r[0], 0, CHAN_X );
3080         micro_mul( &tmp, &r[0], &r[0] );
3081
3082         FETCH( &r[1], 0, CHAN_Y );
3083         micro_mul( &dot, &r[1], &r[1] );
3084         micro_add( &tmp, &tmp, &dot );
3085
3086         FETCH( &r[2], 0, CHAN_Z );
3087         micro_mul( &dot, &r[2], &r[2] );
3088         micro_add( &tmp, &tmp, &dot );
3089
3090         FETCH( &r[3], 0, CHAN_W );
3091         micro_mul( &dot, &r[3], &r[3] );
3092         micro_add( &tmp, &tmp, &dot );
3093
3094         /* tmp = 1 / sqrt(tmp) */
3095         micro_sqrt( &tmp, &tmp );
3096         micro_div( &tmp, &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &tmp );
3097
3098         FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
3099            /* chan = chan * tmp */
3100            micro_mul( &r[chan_index], &tmp, &r[chan_index] );
3101            STORE( &r[chan_index], 0, chan_index );
3102         }
3103      }
3104      break;
3105
3106   case TGSI_OPCODE_DIV:
3107      assert( 0 );
3108      break;
3109
3110   case TGSI_OPCODE_DP2:
3111      exec_dp2(mach, inst);
3112      break;
3113
3114   case TGSI_OPCODE_IF:
3115      /* push CondMask */
3116      assert(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING);
3117      mach->CondStack[mach->CondStackTop++] = mach->CondMask;
3118      FETCH( &r[0], 0, CHAN_X );
3119      /* update CondMask */
3120      if( ! r[0].u[0] ) {
3121         mach->CondMask &= ~0x1;
3122      }
3123      if( ! r[0].u[1] ) {
3124         mach->CondMask &= ~0x2;
3125      }
3126      if( ! r[0].u[2] ) {
3127         mach->CondMask &= ~0x4;
3128      }
3129      if( ! r[0].u[3] ) {
3130         mach->CondMask &= ~0x8;
3131      }
3132      UPDATE_EXEC_MASK(mach);
3133      /* Todo: If CondMask==0, jump to ELSE */
3134      break;
3135
3136   case TGSI_OPCODE_ELSE:
3137      /* invert CondMask wrt previous mask */
3138      {
3139         uint prevMask;
3140         assert(mach->CondStackTop > 0);
3141         prevMask = mach->CondStack[mach->CondStackTop - 1];
3142         mach->CondMask = ~mach->CondMask & prevMask;
3143         UPDATE_EXEC_MASK(mach);
3144         /* Todo: If CondMask==0, jump to ENDIF */
3145      }
3146      break;
3147
3148   case TGSI_OPCODE_ENDIF:
3149      /* pop CondMask */
3150      assert(mach->CondStackTop > 0);
3151      mach->CondMask = mach->CondStack[--mach->CondStackTop];
3152      UPDATE_EXEC_MASK(mach);
3153      break;
3154
3155   case TGSI_OPCODE_END:
3156      /* halt execution */
3157      *pc = -1;
3158      break;
3159
3160   case TGSI_OPCODE_REP:
3161      assert (0);
3162      break;
3163
3164   case TGSI_OPCODE_ENDREP:
3165       assert (0);
3166       break;
3167
3168   case TGSI_OPCODE_PUSHA:
3169      assert (0);
3170      break;
3171
3172   case TGSI_OPCODE_POPA:
3173      assert (0);
3174      break;
3175
3176   case TGSI_OPCODE_CEIL:
3177      exec_vector_unary(mach, inst, micro_ceil, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3178      break;
3179
3180   case TGSI_OPCODE_I2F:
3181      exec_vector_unary(mach, inst, micro_i2f, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_INT);
3182      break;
3183
3184   case TGSI_OPCODE_NOT:
3185      exec_vector_unary(mach, inst, micro_not, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3186      break;
3187
3188   case TGSI_OPCODE_TRUNC:
3189      exec_vector_unary(mach, inst, micro_trunc, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3190      break;
3191
3192   case TGSI_OPCODE_SHL:
3193      exec_vector_binary(mach, inst, micro_shl, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3194      break;
3195
3196   case TGSI_OPCODE_AND:
3197      exec_vector_binary(mach, inst, micro_and, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3198      break;
3199
3200   case TGSI_OPCODE_OR:
3201      exec_vector_binary(mach, inst, micro_or, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3202      break;
3203
3204   case TGSI_OPCODE_MOD:
3205      assert (0);
3206      break;
3207
3208   case TGSI_OPCODE_XOR:
3209      exec_vector_binary(mach, inst, micro_xor, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3210      break;
3211
3212   case TGSI_OPCODE_SAD:
3213      assert (0);
3214      break;
3215
3216   case TGSI_OPCODE_TXF:
3217      assert (0);
3218      break;
3219
3220   case TGSI_OPCODE_TXQ:
3221      assert (0);
3222      break;
3223
3224   case TGSI_OPCODE_EMIT:
3225      emit_vertex(mach);
3226      break;
3227
3228   case TGSI_OPCODE_ENDPRIM:
3229      emit_primitive(mach);
3230      break;
3231
3232   case TGSI_OPCODE_BGNFOR:
3233      assert(mach->LoopCounterStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
3234      for (chan_index = 0; chan_index < 3; chan_index++) {
3235         FETCH( &mach->LoopCounterStack[mach->LoopCounterStackTop].xyzw[chan_index], 0, chan_index );
3236      }
3237      ++mach->LoopCounterStackTop;
3238      STORE(&mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_X], 0, CHAN_X);
3239      /* update LoopMask */
3240      if (mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y].f[0] <= 0.0f) {
3241         mach->LoopMask &= ~0x1;
3242      }
3243      if (mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y].f[1] <= 0.0f) {
3244         mach->LoopMask &= ~0x2;
3245      }
3246      if (mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y].f[2] <= 0.0f) {
3247         mach->LoopMask &= ~0x4;
3248      }
3249      if (mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y].f[3] <= 0.0f) {
3250         mach->LoopMask &= ~0x8;
3251      }
3252      /* TODO: if mach->LoopMask == 0, jump to end of loop */
3253      UPDATE_EXEC_MASK(mach);
3254      /* fall-through (for now) */
3255   case TGSI_OPCODE_BGNLOOP:
3256      /* push LoopMask and ContMasks */
3257      assert(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
3258      assert(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
3259      assert(mach->LoopLabelStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
3260      assert(mach->BreakStackTop < TGSI_EXEC_MAX_BREAK_STACK);
3261
3262      mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask;
3263      mach->ContStack[mach->ContStackTop++] = mach->ContMask;
3264      mach->LoopLabelStack[mach->LoopLabelStackTop++] = *pc - 1;
3265      mach->BreakStack[mach->BreakStackTop++] = mach->BreakType;
3266      mach->BreakType = TGSI_EXEC_BREAK_INSIDE_LOOP;
3267      break;
3268
3269   case TGSI_OPCODE_ENDFOR:
3270      assert(mach->LoopCounterStackTop > 0);
3271      micro_sub(&mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y],
3272                &mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y],
3273                &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C]);
3274      /* update LoopMask */
3275      if (mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y].f[0] <= 0.0f) {
3276         mach->LoopMask &= ~0x1;
3277      }
3278      if (mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y].f[1] <= 0.0f) {
3279         mach->LoopMask &= ~0x2;
3280      }
3281      if (mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y].f[2] <= 0.0f) {
3282         mach->LoopMask &= ~0x4;
3283      }
3284      if (mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y].f[3] <= 0.0f) {
3285         mach->LoopMask &= ~0x8;
3286      }
3287      micro_add(&mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_X],
3288                &mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_X],
3289                &mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Z]);
3290      assert(mach->LoopLabelStackTop > 0);
3291      inst = mach->Instructions + mach->LoopLabelStack[mach->LoopLabelStackTop - 1];
3292      STORE(&mach->LoopCounterStack[mach->LoopCounterStackTop].xyzw[CHAN_X], 0, CHAN_X);
3293      /* Restore ContMask, but don't pop */
3294      assert(mach->ContStackTop > 0);
3295      mach->ContMask = mach->ContStack[mach->ContStackTop - 1];
3296      UPDATE_EXEC_MASK(mach);
3297      if (mach->ExecMask) {
3298         /* repeat loop: jump to instruction just past BGNLOOP */
3299         assert(mach->LoopLabelStackTop > 0);
3300         *pc = mach->LoopLabelStack[mach->LoopLabelStackTop - 1] + 1;
3301      }
3302      else {
3303         /* exit loop: pop LoopMask */
3304         assert(mach->LoopStackTop > 0);
3305         mach->LoopMask = mach->LoopStack[--mach->LoopStackTop];
3306         /* pop ContMask */
3307         assert(mach->ContStackTop > 0);
3308         mach->ContMask = mach->ContStack[--mach->ContStackTop];
3309         assert(mach->LoopLabelStackTop > 0);
3310         --mach->LoopLabelStackTop;
3311         assert(mach->LoopCounterStackTop > 0);
3312         --mach->LoopCounterStackTop;
3313
3314         mach->BreakType = mach->BreakStack[--mach->BreakStackTop];
3315      }
3316      UPDATE_EXEC_MASK(mach);
3317      break;
3318
3319   case TGSI_OPCODE_ENDLOOP:
3320      /* Restore ContMask, but don't pop */
3321      assert(mach->ContStackTop > 0);
3322      mach->ContMask = mach->ContStack[mach->ContStackTop - 1];
3323      UPDATE_EXEC_MASK(mach);
3324      if (mach->ExecMask) {
3325         /* repeat loop: jump to instruction just past BGNLOOP */
3326         assert(mach->LoopLabelStackTop > 0);
3327         *pc = mach->LoopLabelStack[mach->LoopLabelStackTop - 1] + 1;
3328      }
3329      else {
3330         /* exit loop: pop LoopMask */
3331         assert(mach->LoopStackTop > 0);
3332         mach->LoopMask = mach->LoopStack[--mach->LoopStackTop];
3333         /* pop ContMask */
3334         assert(mach->ContStackTop > 0);
3335         mach->ContMask = mach->ContStack[--mach->ContStackTop];
3336         assert(mach->LoopLabelStackTop > 0);
3337         --mach->LoopLabelStackTop;
3338
3339         mach->BreakType = mach->BreakStack[--mach->BreakStackTop];
3340      }
3341      UPDATE_EXEC_MASK(mach);
3342      break;
3343
3344   case TGSI_OPCODE_BRK:
3345      exec_break(mach);
3346      break;
3347
3348   case TGSI_OPCODE_CONT:
3349      /* turn off cont channels for each enabled exec channel */
3350      mach->ContMask &= ~mach->ExecMask;
3351      /* Todo: if mach->LoopMask == 0, jump to end of loop */
3352      UPDATE_EXEC_MASK(mach);
3353      break;
3354
3355   case TGSI_OPCODE_BGNSUB:
3356      /* no-op */
3357      break;
3358
3359   case TGSI_OPCODE_ENDSUB:
3360      /*
3361       * XXX: This really should be a no-op. We should never reach this opcode.
3362       */
3363
3364      assert(mach->CallStackTop > 0);
3365      mach->CallStackTop--;
3366
3367      mach->CondStackTop = mach->CallStack[mach->CallStackTop].CondStackTop;
3368      mach->CondMask = mach->CondStack[mach->CondStackTop];
3369
3370      mach->LoopStackTop = mach->CallStack[mach->CallStackTop].LoopStackTop;
3371      mach->LoopMask = mach->LoopStack[mach->LoopStackTop];
3372
3373      mach->ContStackTop = mach->CallStack[mach->CallStackTop].ContStackTop;
3374      mach->ContMask = mach->ContStack[mach->ContStackTop];
3375
3376      mach->SwitchStackTop = mach->CallStack[mach->CallStackTop].SwitchStackTop;
3377      mach->Switch = mach->SwitchStack[mach->SwitchStackTop];
3378
3379      mach->BreakStackTop = mach->CallStack[mach->CallStackTop].BreakStackTop;
3380      mach->BreakType = mach->BreakStack[mach->BreakStackTop];
3381
3382      assert(mach->FuncStackTop > 0);
3383      mach->FuncMask = mach->FuncStack[--mach->FuncStackTop];
3384
3385      *pc = mach->CallStack[mach->CallStackTop].ReturnAddr;
3386
3387      UPDATE_EXEC_MASK(mach);
3388      break;
3389
3390   case TGSI_OPCODE_NOP:
3391      break;
3392
3393   case TGSI_OPCODE_BREAKC:
3394      FETCH(&r[0], 0, CHAN_X);
3395      /* update CondMask */
3396      if (r[0].u[0] && (mach->ExecMask & 0x1)) {
3397         mach->LoopMask &= ~0x1;
3398      }
3399      if (r[0].u[1] && (mach->ExecMask & 0x2)) {
3400         mach->LoopMask &= ~0x2;
3401      }
3402      if (r[0].u[2] && (mach->ExecMask & 0x4)) {
3403         mach->LoopMask &= ~0x4;
3404      }
3405      if (r[0].u[3] && (mach->ExecMask & 0x8)) {
3406         mach->LoopMask &= ~0x8;
3407      }
3408      /* Todo: if mach->LoopMask == 0, jump to end of loop */
3409      UPDATE_EXEC_MASK(mach);
3410      break;
3411
3412   case TGSI_OPCODE_F2I:
3413      exec_vector_unary(mach, inst, micro_f2i, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_FLOAT);
3414      break;
3415
3416   case TGSI_OPCODE_IDIV:
3417      exec_vector_binary(mach, inst, micro_idiv, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
3418      break;
3419
3420   case TGSI_OPCODE_IMAX:
3421      exec_vector_binary(mach, inst, micro_imax, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
3422      break;
3423
3424   case TGSI_OPCODE_IMIN:
3425      exec_vector_binary(mach, inst, micro_imin, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
3426      break;
3427
3428   case TGSI_OPCODE_INEG:
3429      exec_vector_unary(mach, inst, micro_ineg, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
3430      break;
3431
3432   case TGSI_OPCODE_ISGE:
3433      exec_vector_binary(mach, inst, micro_isge, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
3434      break;
3435
3436   case TGSI_OPCODE_ISHR:
3437      exec_vector_binary(mach, inst, micro_ishr, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
3438      break;
3439
3440   case TGSI_OPCODE_ISLT:
3441      exec_vector_binary(mach, inst, micro_islt, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
3442      break;
3443
3444   case TGSI_OPCODE_F2U:
3445      exec_vector_unary(mach, inst, micro_f2u, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_FLOAT);
3446      break;
3447
3448   case TGSI_OPCODE_U2F:
3449      exec_vector_unary(mach, inst, micro_u2f, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_UINT);
3450      break;
3451
3452   case TGSI_OPCODE_UADD:
3453      exec_vector_binary(mach, inst, micro_uadd, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3454      break;
3455
3456   case TGSI_OPCODE_UDIV:
3457      exec_vector_binary(mach, inst, micro_udiv, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3458      break;
3459
3460   case TGSI_OPCODE_UMAD:
3461      exec_vector_trinary(mach, inst, micro_umad, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3462      break;
3463
3464   case TGSI_OPCODE_UMAX:
3465      exec_vector_binary(mach, inst, micro_umax, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3466      break;
3467
3468   case TGSI_OPCODE_UMIN:
3469      exec_vector_binary(mach, inst, micro_umin, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3470      break;
3471
3472   case TGSI_OPCODE_UMOD:
3473      exec_vector_binary(mach, inst, micro_umod, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3474      break;
3475
3476   case TGSI_OPCODE_UMUL:
3477      exec_vector_binary(mach, inst, micro_umul, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3478      break;
3479
3480   case TGSI_OPCODE_USEQ:
3481      exec_vector_binary(mach, inst, micro_useq, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3482      break;
3483
3484   case TGSI_OPCODE_USGE:
3485      exec_vector_binary(mach, inst, micro_usge, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3486      break;
3487
3488   case TGSI_OPCODE_USHR:
3489      exec_vector_binary(mach, inst, micro_ushr, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3490      break;
3491
3492   case TGSI_OPCODE_USLT:
3493      exec_vector_binary(mach, inst, micro_uslt, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3494      break;
3495
3496   case TGSI_OPCODE_USNE:
3497      exec_vector_binary(mach, inst, micro_usne, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3498      break;
3499
3500   case TGSI_OPCODE_SWITCH:
3501      exec_switch(mach, inst);
3502      break;
3503
3504   case TGSI_OPCODE_CASE:
3505      exec_case(mach, inst);
3506      break;
3507
3508   case TGSI_OPCODE_DEFAULT:
3509      exec_default(mach);
3510      break;
3511
3512   case TGSI_OPCODE_ENDSWITCH:
3513      exec_endswitch(mach);
3514      break;
3515
3516   default:
3517      assert( 0 );
3518   }
3519}
3520
3521
3522#define DEBUG_EXECUTION 0
3523
3524
3525/**
3526 * Run TGSI interpreter.
3527 * \return bitmask of "alive" quad components
3528 */
3529uint
3530tgsi_exec_machine_run( struct tgsi_exec_machine *mach )
3531{
3532   uint i;
3533   int pc = 0;
3534
3535   mach->CondMask = 0xf;
3536   mach->LoopMask = 0xf;
3537   mach->ContMask = 0xf;
3538   mach->FuncMask = 0xf;
3539   mach->ExecMask = 0xf;
3540
3541   mach->Switch.mask = 0xf;
3542
3543   assert(mach->CondStackTop == 0);
3544   assert(mach->LoopStackTop == 0);
3545   assert(mach->ContStackTop == 0);
3546   assert(mach->SwitchStackTop == 0);
3547   assert(mach->BreakStackTop == 0);
3548   assert(mach->CallStackTop == 0);
3549
3550   mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] = 0;
3551   mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] = 0;
3552
3553   if( mach->Processor == TGSI_PROCESSOR_GEOMETRY ) {
3554      mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0] = 0;
3555      mach->Primitives[0] = 0;
3556   }
3557
3558   for (i = 0; i < QUAD_SIZE; i++) {
3559      mach->Temps[TEMP_CC_I].xyzw[TEMP_CC_C].u[i] =
3560         (TGSI_EXEC_CC_EQ << TGSI_EXEC_CC_X_SHIFT) |
3561         (TGSI_EXEC_CC_EQ << TGSI_EXEC_CC_Y_SHIFT) |
3562         (TGSI_EXEC_CC_EQ << TGSI_EXEC_CC_Z_SHIFT) |
3563         (TGSI_EXEC_CC_EQ << TGSI_EXEC_CC_W_SHIFT);
3564   }
3565
3566   /* execute declarations (interpolants) */
3567   for (i = 0; i < mach->NumDeclarations; i++) {
3568      exec_declaration( mach, mach->Declarations+i );
3569   }
3570
3571   {
3572#if DEBUG_EXECUTION
3573      struct tgsi_exec_vector temps[TGSI_EXEC_NUM_TEMPS + TGSI_EXEC_NUM_TEMP_EXTRAS];
3574      struct tgsi_exec_vector outputs[PIPE_MAX_ATTRIBS];
3575      uint inst = 1;
3576
3577      memcpy(temps, mach->Temps, sizeof(temps));
3578      memcpy(outputs, mach->Outputs, sizeof(outputs));
3579#endif
3580
3581      /* execute instructions, until pc is set to -1 */
3582      while (pc != -1) {
3583
3584#if DEBUG_EXECUTION
3585         uint i;
3586
3587         tgsi_dump_instruction(&mach->Instructions[pc], inst++);
3588#endif
3589
3590         assert(pc < (int) mach->NumInstructions);
3591         exec_instruction(mach, mach->Instructions + pc, &pc);
3592
3593#if DEBUG_EXECUTION
3594         for (i = 0; i < TGSI_EXEC_NUM_TEMPS + TGSI_EXEC_NUM_TEMP_EXTRAS; i++) {
3595            if (memcmp(&temps[i], &mach->Temps[i], sizeof(temps[i]))) {
3596               uint j;
3597
3598               memcpy(&temps[i], &mach->Temps[i], sizeof(temps[i]));
3599               debug_printf("TEMP[%2u] = ", i);
3600               for (j = 0; j < 4; j++) {
3601                  if (j > 0) {
3602                     debug_printf("           ");
3603                  }
3604                  debug_printf("(%6f %u, %6f %u, %6f %u, %6f %u)\n",
3605                               temps[i].xyzw[0].f[j], temps[i].xyzw[0].u[j],
3606                               temps[i].xyzw[1].f[j], temps[i].xyzw[1].u[j],
3607                               temps[i].xyzw[2].f[j], temps[i].xyzw[2].u[j],
3608                               temps[i].xyzw[3].f[j], temps[i].xyzw[3].u[j]);
3609               }
3610            }
3611         }
3612         for (i = 0; i < PIPE_MAX_ATTRIBS; i++) {
3613            if (memcmp(&outputs[i], &mach->Outputs[i], sizeof(outputs[i]))) {
3614               uint j;
3615
3616               memcpy(&outputs[i], &mach->Outputs[i], sizeof(outputs[i]));
3617               debug_printf("OUT[%2u] =  ", i);
3618               for (j = 0; j < 4; j++) {
3619                  if (j > 0) {
3620                     debug_printf("           ");
3621                  }
3622                  debug_printf("(%6f %u, %6f %u, %6f %u, %6f %u)\n",
3623                               outputs[i].xyzw[0].f[j], outputs[i].xyzw[0].u[j],
3624                               outputs[i].xyzw[1].f[j], outputs[i].xyzw[1].u[j],
3625                               outputs[i].xyzw[2].f[j], outputs[i].xyzw[2].u[j],
3626                               outputs[i].xyzw[3].f[j], outputs[i].xyzw[3].u[j]);
3627               }
3628            }
3629         }
3630#endif
3631      }
3632   }
3633
3634#if 0
3635   /* we scale from floats in [0,1] to Zbuffer ints in sp_quad_depth_test.c */
3636   if (mach->Processor == TGSI_PROCESSOR_FRAGMENT) {
3637      /*
3638       * Scale back depth component.
3639       */
3640      for (i = 0; i < 4; i++)
3641         mach->Outputs[0].xyzw[2].f[i] *= ctx->DrawBuffer->_DepthMaxF;
3642   }
3643#endif
3644
3645   assert(mach->CondStackTop == 0);
3646   assert(mach->LoopStackTop == 0);
3647   assert(mach->ContStackTop == 0);
3648   assert(mach->SwitchStackTop == 0);
3649   assert(mach->BreakStackTop == 0);
3650   assert(mach->CallStackTop == 0);
3651
3652   return ~mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
3653}
3654