tgsi_exec.c revision f1f7006d1f547571ec300277d7d5eef2007e9de1
1/**************************************************************************
2 *
3 * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas.
4 * All Rights Reserved.
5 * Copyright 2009-2010 VMware, Inc.  All rights Reserved.
6 *
7 * Permission is hereby granted, free of charge, to any person obtaining a
8 * copy of this software and associated documentation files (the
9 * "Software"), to deal in the Software without restriction, including
10 * without limitation the rights to use, copy, modify, merge, publish,
11 * distribute, sub license, and/or sell copies of the Software, and to
12 * permit persons to whom the Software is furnished to do so, subject to
13 * the following conditions:
14 *
15 * The above copyright notice and this permission notice (including the
16 * next paragraph) shall be included in all copies or substantial portions
17 * of the Software.
18 *
19 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
20 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
21 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
22 * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
23 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
24 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
25 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 *
27 **************************************************************************/
28
29/**
30 * TGSI interpreter/executor.
31 *
32 * Flow control information:
33 *
34 * Since we operate on 'quads' (4 pixels or 4 vertices in parallel)
35 * flow control statements (IF/ELSE/ENDIF, LOOP/ENDLOOP) require special
36 * care since a condition may be true for some quad components but false
37 * for other components.
38 *
39 * We basically execute all statements (even if they're in the part of
40 * an IF/ELSE clause that's "not taken") and use a special mask to
41 * control writing to destination registers.  This is the ExecMask.
42 * See store_dest().
43 *
 * The ExecMask is computed from several other masks (including CondMask,
 * LoopMask and ContMask; see UPDATE_EXEC_MASK), which are controlled by the
 * flow control instructions (namely IF/ELSE/ENDIF, LOOP/ENDLOOP and CONT).
47 *
48 *
49 * Authors:
50 *   Michal Krol
51 *   Brian Paul
52 */
53
54#include "pipe/p_compiler.h"
55#include "pipe/p_state.h"
56#include "pipe/p_shader_tokens.h"
57#include "tgsi/tgsi_dump.h"
58#include "tgsi/tgsi_parse.h"
59#include "tgsi/tgsi_util.h"
60#include "tgsi_exec.h"
61#include "util/u_memory.h"
62#include "util/u_math.h"
63
64
/* Non-zero selects the util_fast_* approximations in micro_exp2(),
 * micro_lg2() and micro_pow(); zero selects the powf()/logf() paths. */
#define FAST_MATH 1

/* Indices of the four entries of a channel, used by micro_ddx() and
 * micro_ddy() to pick which entries are differenced. */
#define TILE_TOP_LEFT     0
#define TILE_TOP_RIGHT    1
#define TILE_BOTTOM_LEFT  2
#define TILE_BOTTOM_RIGHT 3
71
72static void
73micro_abs(union tgsi_exec_channel *dst,
74          const union tgsi_exec_channel *src)
75{
76   dst->f[0] = fabsf(src->f[0]);
77   dst->f[1] = fabsf(src->f[1]);
78   dst->f[2] = fabsf(src->f[2]);
79   dst->f[3] = fabsf(src->f[3]);
80}
81
82static void
83micro_arl(union tgsi_exec_channel *dst,
84          const union tgsi_exec_channel *src)
85{
86   dst->i[0] = (int)floorf(src->f[0]);
87   dst->i[1] = (int)floorf(src->f[1]);
88   dst->i[2] = (int)floorf(src->f[2]);
89   dst->i[3] = (int)floorf(src->f[3]);
90}
91
92static void
93micro_arr(union tgsi_exec_channel *dst,
94          const union tgsi_exec_channel *src)
95{
96   dst->i[0] = (int)floorf(src->f[0] + 0.5f);
97   dst->i[1] = (int)floorf(src->f[1] + 0.5f);
98   dst->i[2] = (int)floorf(src->f[2] + 0.5f);
99   dst->i[3] = (int)floorf(src->f[3] + 0.5f);
100}
101
102static void
103micro_ceil(union tgsi_exec_channel *dst,
104           const union tgsi_exec_channel *src)
105{
106   dst->f[0] = ceilf(src->f[0]);
107   dst->f[1] = ceilf(src->f[1]);
108   dst->f[2] = ceilf(src->f[2]);
109   dst->f[3] = ceilf(src->f[3]);
110}
111
112static void
113micro_cos(union tgsi_exec_channel *dst,
114          const union tgsi_exec_channel *src)
115{
116   dst->f[0] = cosf(src->f[0]);
117   dst->f[1] = cosf(src->f[1]);
118   dst->f[2] = cosf(src->f[2]);
119   dst->f[3] = cosf(src->f[3]);
120}
121
122static void
123micro_ddx(union tgsi_exec_channel *dst,
124          const union tgsi_exec_channel *src)
125{
126   dst->f[0] =
127   dst->f[1] =
128   dst->f[2] =
129   dst->f[3] = src->f[TILE_BOTTOM_RIGHT] - src->f[TILE_BOTTOM_LEFT];
130}
131
132static void
133micro_ddy(union tgsi_exec_channel *dst,
134          const union tgsi_exec_channel *src)
135{
136   dst->f[0] =
137   dst->f[1] =
138   dst->f[2] =
139   dst->f[3] = src->f[TILE_BOTTOM_LEFT] - src->f[TILE_TOP_LEFT];
140}
141
142static void
143micro_exp2(union tgsi_exec_channel *dst,
144           const union tgsi_exec_channel *src)
145{
146#if FAST_MATH
147   dst->f[0] = util_fast_exp2(src->f[0]);
148   dst->f[1] = util_fast_exp2(src->f[1]);
149   dst->f[2] = util_fast_exp2(src->f[2]);
150   dst->f[3] = util_fast_exp2(src->f[3]);
151#else
152#if DEBUG
153   /* Inf is okay for this instruction, so clamp it to silence assertions. */
154   uint i;
155   union tgsi_exec_channel clamped;
156
157   for (i = 0; i < 4; i++) {
158      if (src->f[i] > 127.99999f) {
159         clamped.f[i] = 127.99999f;
160      } else if (src->f[i] < -126.99999f) {
161         clamped.f[i] = -126.99999f;
162      } else {
163         clamped.f[i] = src->f[i];
164      }
165   }
166   src = &clamped;
167#endif /* DEBUG */
168
169   dst->f[0] = powf(2.0f, src->f[0]);
170   dst->f[1] = powf(2.0f, src->f[1]);
171   dst->f[2] = powf(2.0f, src->f[2]);
172   dst->f[3] = powf(2.0f, src->f[3]);
173#endif /* FAST_MATH */
174}
175
176static void
177micro_flr(union tgsi_exec_channel *dst,
178          const union tgsi_exec_channel *src)
179{
180   dst->f[0] = floorf(src->f[0]);
181   dst->f[1] = floorf(src->f[1]);
182   dst->f[2] = floorf(src->f[2]);
183   dst->f[3] = floorf(src->f[3]);
184}
185
186static void
187micro_frc(union tgsi_exec_channel *dst,
188          const union tgsi_exec_channel *src)
189{
190   dst->f[0] = src->f[0] - floorf(src->f[0]);
191   dst->f[1] = src->f[1] - floorf(src->f[1]);
192   dst->f[2] = src->f[2] - floorf(src->f[2]);
193   dst->f[3] = src->f[3] - floorf(src->f[3]);
194}
195
196static void
197micro_iabs(union tgsi_exec_channel *dst,
198           const union tgsi_exec_channel *src)
199{
200   dst->i[0] = src->i[0] >= 0 ? src->i[0] : -src->i[0];
201   dst->i[1] = src->i[1] >= 0 ? src->i[1] : -src->i[1];
202   dst->i[2] = src->i[2] >= 0 ? src->i[2] : -src->i[2];
203   dst->i[3] = src->i[3] >= 0 ? src->i[3] : -src->i[3];
204}
205
206static void
207micro_ineg(union tgsi_exec_channel *dst,
208           const union tgsi_exec_channel *src)
209{
210   dst->i[0] = -src->i[0];
211   dst->i[1] = -src->i[1];
212   dst->i[2] = -src->i[2];
213   dst->i[3] = -src->i[3];
214}
215
216static void
217micro_lg2(union tgsi_exec_channel *dst,
218          const union tgsi_exec_channel *src)
219{
220#if FAST_MATH
221   dst->f[0] = util_fast_log2(src->f[0]);
222   dst->f[1] = util_fast_log2(src->f[1]);
223   dst->f[2] = util_fast_log2(src->f[2]);
224   dst->f[3] = util_fast_log2(src->f[3]);
225#else
226   dst->f[0] = logf(src->f[0]) * 1.442695f;
227   dst->f[1] = logf(src->f[1]) * 1.442695f;
228   dst->f[2] = logf(src->f[2]) * 1.442695f;
229   dst->f[3] = logf(src->f[3]) * 1.442695f;
230#endif
231}
232
233static void
234micro_lrp(union tgsi_exec_channel *dst,
235          const union tgsi_exec_channel *src0,
236          const union tgsi_exec_channel *src1,
237          const union tgsi_exec_channel *src2)
238{
239   dst->f[0] = src0->f[0] * (src1->f[0] - src2->f[0]) + src2->f[0];
240   dst->f[1] = src0->f[1] * (src1->f[1] - src2->f[1]) + src2->f[1];
241   dst->f[2] = src0->f[2] * (src1->f[2] - src2->f[2]) + src2->f[2];
242   dst->f[3] = src0->f[3] * (src1->f[3] - src2->f[3]) + src2->f[3];
243}
244
245static void
246micro_mad(union tgsi_exec_channel *dst,
247          const union tgsi_exec_channel *src0,
248          const union tgsi_exec_channel *src1,
249          const union tgsi_exec_channel *src2)
250{
251   dst->f[0] = src0->f[0] * src1->f[0] + src2->f[0];
252   dst->f[1] = src0->f[1] * src1->f[1] + src2->f[1];
253   dst->f[2] = src0->f[2] * src1->f[2] + src2->f[2];
254   dst->f[3] = src0->f[3] * src1->f[3] + src2->f[3];
255}
256
257static void
258micro_mov(union tgsi_exec_channel *dst,
259          const union tgsi_exec_channel *src)
260{
261   dst->u[0] = src->u[0];
262   dst->u[1] = src->u[1];
263   dst->u[2] = src->u[2];
264   dst->u[3] = src->u[3];
265}
266
267static void
268micro_rcp(union tgsi_exec_channel *dst,
269          const union tgsi_exec_channel *src)
270{
271#if 0 /* for debugging */
272   assert(src->f[0] != 0.0f);
273   assert(src->f[1] != 0.0f);
274   assert(src->f[2] != 0.0f);
275   assert(src->f[3] != 0.0f);
276#endif
277   dst->f[0] = 1.0f / src->f[0];
278   dst->f[1] = 1.0f / src->f[1];
279   dst->f[2] = 1.0f / src->f[2];
280   dst->f[3] = 1.0f / src->f[3];
281}
282
283static void
284micro_rnd(union tgsi_exec_channel *dst,
285          const union tgsi_exec_channel *src)
286{
287   dst->f[0] = floorf(src->f[0] + 0.5f);
288   dst->f[1] = floorf(src->f[1] + 0.5f);
289   dst->f[2] = floorf(src->f[2] + 0.5f);
290   dst->f[3] = floorf(src->f[3] + 0.5f);
291}
292
293static void
294micro_rsq(union tgsi_exec_channel *dst,
295          const union tgsi_exec_channel *src)
296{
297#if 0 /* for debugging */
298   assert(src->f[0] != 0.0f);
299   assert(src->f[1] != 0.0f);
300   assert(src->f[2] != 0.0f);
301   assert(src->f[3] != 0.0f);
302#endif
303   dst->f[0] = 1.0f / sqrtf(fabsf(src->f[0]));
304   dst->f[1] = 1.0f / sqrtf(fabsf(src->f[1]));
305   dst->f[2] = 1.0f / sqrtf(fabsf(src->f[2]));
306   dst->f[3] = 1.0f / sqrtf(fabsf(src->f[3]));
307}
308
309static void
310micro_seq(union tgsi_exec_channel *dst,
311          const union tgsi_exec_channel *src0,
312          const union tgsi_exec_channel *src1)
313{
314   dst->f[0] = src0->f[0] == src1->f[0] ? 1.0f : 0.0f;
315   dst->f[1] = src0->f[1] == src1->f[1] ? 1.0f : 0.0f;
316   dst->f[2] = src0->f[2] == src1->f[2] ? 1.0f : 0.0f;
317   dst->f[3] = src0->f[3] == src1->f[3] ? 1.0f : 0.0f;
318}
319
320static void
321micro_sge(union tgsi_exec_channel *dst,
322          const union tgsi_exec_channel *src0,
323          const union tgsi_exec_channel *src1)
324{
325   dst->f[0] = src0->f[0] >= src1->f[0] ? 1.0f : 0.0f;
326   dst->f[1] = src0->f[1] >= src1->f[1] ? 1.0f : 0.0f;
327   dst->f[2] = src0->f[2] >= src1->f[2] ? 1.0f : 0.0f;
328   dst->f[3] = src0->f[3] >= src1->f[3] ? 1.0f : 0.0f;
329}
330
331static void
332micro_sgn(union tgsi_exec_channel *dst,
333          const union tgsi_exec_channel *src)
334{
335   dst->f[0] = src->f[0] < 0.0f ? -1.0f : src->f[0] > 0.0f ? 1.0f : 0.0f;
336   dst->f[1] = src->f[1] < 0.0f ? -1.0f : src->f[1] > 0.0f ? 1.0f : 0.0f;
337   dst->f[2] = src->f[2] < 0.0f ? -1.0f : src->f[2] > 0.0f ? 1.0f : 0.0f;
338   dst->f[3] = src->f[3] < 0.0f ? -1.0f : src->f[3] > 0.0f ? 1.0f : 0.0f;
339}
340
341static void
342micro_sgt(union tgsi_exec_channel *dst,
343          const union tgsi_exec_channel *src0,
344          const union tgsi_exec_channel *src1)
345{
346   dst->f[0] = src0->f[0] > src1->f[0] ? 1.0f : 0.0f;
347   dst->f[1] = src0->f[1] > src1->f[1] ? 1.0f : 0.0f;
348   dst->f[2] = src0->f[2] > src1->f[2] ? 1.0f : 0.0f;
349   dst->f[3] = src0->f[3] > src1->f[3] ? 1.0f : 0.0f;
350}
351
352static void
353micro_sin(union tgsi_exec_channel *dst,
354          const union tgsi_exec_channel *src)
355{
356   dst->f[0] = sinf(src->f[0]);
357   dst->f[1] = sinf(src->f[1]);
358   dst->f[2] = sinf(src->f[2]);
359   dst->f[3] = sinf(src->f[3]);
360}
361
362static void
363micro_sle(union tgsi_exec_channel *dst,
364          const union tgsi_exec_channel *src0,
365          const union tgsi_exec_channel *src1)
366{
367   dst->f[0] = src0->f[0] <= src1->f[0] ? 1.0f : 0.0f;
368   dst->f[1] = src0->f[1] <= src1->f[1] ? 1.0f : 0.0f;
369   dst->f[2] = src0->f[2] <= src1->f[2] ? 1.0f : 0.0f;
370   dst->f[3] = src0->f[3] <= src1->f[3] ? 1.0f : 0.0f;
371}
372
373static void
374micro_slt(union tgsi_exec_channel *dst,
375          const union tgsi_exec_channel *src0,
376          const union tgsi_exec_channel *src1)
377{
378   dst->f[0] = src0->f[0] < src1->f[0] ? 1.0f : 0.0f;
379   dst->f[1] = src0->f[1] < src1->f[1] ? 1.0f : 0.0f;
380   dst->f[2] = src0->f[2] < src1->f[2] ? 1.0f : 0.0f;
381   dst->f[3] = src0->f[3] < src1->f[3] ? 1.0f : 0.0f;
382}
383
384static void
385micro_sne(union tgsi_exec_channel *dst,
386          const union tgsi_exec_channel *src0,
387          const union tgsi_exec_channel *src1)
388{
389   dst->f[0] = src0->f[0] != src1->f[0] ? 1.0f : 0.0f;
390   dst->f[1] = src0->f[1] != src1->f[1] ? 1.0f : 0.0f;
391   dst->f[2] = src0->f[2] != src1->f[2] ? 1.0f : 0.0f;
392   dst->f[3] = src0->f[3] != src1->f[3] ? 1.0f : 0.0f;
393}
394
395static void
396micro_trunc(union tgsi_exec_channel *dst,
397            const union tgsi_exec_channel *src)
398{
399   dst->f[0] = (float)(int)src->f[0];
400   dst->f[1] = (float)(int)src->f[1];
401   dst->f[2] = (float)(int)src->f[2];
402   dst->f[3] = (float)(int)src->f[3];
403}
404
405
/* Channel indices in x, y, z, w order (presumably used to index the
 * xyzw[] array of a register vector -- confirm against the users). */
#define CHAN_X  0
#define CHAN_Y  1
#define CHAN_Z  2
#define CHAN_W  3
410
/**
 * Tags naming a data type: float, signed int or unsigned int.
 * NOTE(review): presumably selects which view (f/i/u) of a channel an
 * operation works on; the users are not visible in this part of the file.
 */
enum tgsi_exec_datatype {
   TGSI_EXEC_DATA_FLOAT = 0,
   TGSI_EXEC_DATA_INT   = 1,
   TGSI_EXEC_DATA_UINT  = 2
};
416
/*
 * Shorthand locations of various utility registers (_I = Index, _C = Channel)
 *
 * An (_I, _C) pair addresses mach->Temps[_I].xyzw[_C].
 * tgsi_exec_machine_create() fills the 0, 7F, 80, FF, 1, 2, 128, M128, 3
 * and HALF slots with the constant their name indicates.
 */
#define TEMP_0_I           TGSI_EXEC_TEMP_00000000_I
#define TEMP_0_C           TGSI_EXEC_TEMP_00000000_C
#define TEMP_7F_I          TGSI_EXEC_TEMP_7FFFFFFF_I
#define TEMP_7F_C          TGSI_EXEC_TEMP_7FFFFFFF_C
#define TEMP_80_I          TGSI_EXEC_TEMP_80000000_I
#define TEMP_80_C          TGSI_EXEC_TEMP_80000000_C
#define TEMP_FF_I          TGSI_EXEC_TEMP_FFFFFFFF_I
#define TEMP_FF_C          TGSI_EXEC_TEMP_FFFFFFFF_C
#define TEMP_1_I           TGSI_EXEC_TEMP_ONE_I
#define TEMP_1_C           TGSI_EXEC_TEMP_ONE_C
#define TEMP_2_I           TGSI_EXEC_TEMP_TWO_I
#define TEMP_2_C           TGSI_EXEC_TEMP_TWO_C
#define TEMP_128_I         TGSI_EXEC_TEMP_128_I
#define TEMP_128_C         TGSI_EXEC_TEMP_128_C
#define TEMP_M128_I        TGSI_EXEC_TEMP_MINUS_128_I
#define TEMP_M128_C        TGSI_EXEC_TEMP_MINUS_128_C
#define TEMP_KILMASK_I     TGSI_EXEC_TEMP_KILMASK_I
#define TEMP_KILMASK_C     TGSI_EXEC_TEMP_KILMASK_C
#define TEMP_OUTPUT_I      TGSI_EXEC_TEMP_OUTPUT_I
#define TEMP_OUTPUT_C      TGSI_EXEC_TEMP_OUTPUT_C
#define TEMP_PRIMITIVE_I   TGSI_EXEC_TEMP_PRIMITIVE_I
#define TEMP_PRIMITIVE_C   TGSI_EXEC_TEMP_PRIMITIVE_C
#define TEMP_CC_I          TGSI_EXEC_TEMP_CC_I
#define TEMP_CC_C          TGSI_EXEC_TEMP_CC_C
#define TEMP_3_I           TGSI_EXEC_TEMP_THREE_I
#define TEMP_3_C           TGSI_EXEC_TEMP_THREE_C
#define TEMP_HALF_I        TGSI_EXEC_TEMP_HALF_I
#define TEMP_HALF_C        TGSI_EXEC_TEMP_HALF_C
#define TEMP_R0            TGSI_EXEC_TEMP_R0
#define TEMP_P0            TGSI_EXEC_TEMP_P0
449#define TEMP_P0            TGSI_EXEC_TEMP_P0
450
/** Non-zero if CHAN is set in the write mask of INST's first destination. */
#define IS_CHANNEL_ENABLED(INST, CHAN)\
   ((INST).Dst[0].Register.WriteMask & (1 << (CHAN)))

/** Non-zero if CHAN is set in the write mask of INST's second destination. */
#define IS_CHANNEL_ENABLED2(INST, CHAN)\
   ((INST).Dst[1].Register.WriteMask & (1 << (CHAN)))

/**
 * Loop header: run the following statement once for every channel enabled
 * in Dst[0]'s write mask.  CHAN must be a modifiable integer lvalue; it is
 * parenthesized so that expressions such as *p work as the loop variable.
 * The expansion ends in an unbraced 'if', so do not attach an 'else' to
 * the loop body.
 */
#define FOR_EACH_ENABLED_CHANNEL(INST, CHAN)\
   for ((CHAN) = 0; (CHAN) < NUM_CHANNELS; (CHAN)++)\
      if (IS_CHANNEL_ENABLED( INST, CHAN ))

/** Same as FOR_EACH_ENABLED_CHANNEL, but for Dst[1]'s write mask. */
#define FOR_EACH_ENABLED_CHANNEL2(INST, CHAN)\
   for ((CHAN) = 0; (CHAN) < NUM_CHANNELS; (CHAN)++)\
      if (IS_CHANNEL_ENABLED2( INST, CHAN ))
464
465
/**
 * Recompute the execution mask as the bitwise AND of CondMask, LoopMask,
 * ContMask, Switch.mask and FuncMask.
 * MACH is a pointer to the machine; it is evaluated more than once, so it
 * must not have side effects.
 */
#define UPDATE_EXEC_MASK(MACH) \
      ((MACH)->ExecMask = (MACH)->CondMask & (MACH)->LoopMask & \
                          (MACH)->ContMask & (MACH)->Switch.mask & \
                          (MACH)->FuncMask)
469
470
471static const union tgsi_exec_channel ZeroVec =
472   { { 0.0, 0.0, 0.0, 0.0 } };
473
474static const union tgsi_exec_channel OneVec = {
475   {1.0f, 1.0f, 1.0f, 1.0f}
476};
477
478
479/**
480 * Assert that none of the float values in 'chan' are infinite or NaN.
481 * NaN and Inf may occur normally during program execution and should
482 * not lead to crashes, etc.  But when debugging, it's helpful to catch
483 * them.
484 */
485static INLINE void
486check_inf_or_nan(const union tgsi_exec_channel *chan)
487{
488   assert(!util_is_inf_or_nan((chan)->f[0]));
489   assert(!util_is_inf_or_nan((chan)->f[1]));
490   assert(!util_is_inf_or_nan((chan)->f[2]));
491   assert(!util_is_inf_or_nan((chan)->f[3]));
492}
493
494
#ifdef DEBUG
/** Debug helper: print 'msg' followed by the four float values of 'chan'. */
static void
print_chan(const char *msg, const union tgsi_exec_channel *chan)
{
   const float *v = chan->f;

   debug_printf("%s = {%f, %f, %f, %f}\n", msg, v[0], v[1], v[2], v[3]);
}
#endif
503
504
#ifdef DEBUG
/**
 * Debug helper: print temporary register 'index' of 'mach', one line per
 * X/Y/Z/W channel with its four float values.
 */
static void
print_temp(const struct tgsi_exec_machine *mach, uint index)
{
   static const char names[] = "XYZW";
   const struct tgsi_exec_vector *vec = &mach->Temps[index];
   int c = 0;

   debug_printf("Temp[%u] =\n", index);
   while (c < 4) {
      const float *v = vec->xyzw[c].f;

      debug_printf("  %c: { %f, %f, %f, %f }\n",
                   names[c], v[0], v[1], v[2], v[3]);
      c++;
   }
}
#endif
522
523
524/**
525 * Check if there's a potential src/dst register data dependency when
526 * using SOA execution.
527 * Example:
528 *   MOV T, T.yxwz;
529 * This would expand into:
530 *   MOV t0, t1;
531 *   MOV t1, t0;
532 *   MOV t2, t3;
533 *   MOV t3, t2;
534 * The second instruction will have the wrong value for t0 if executed as-is.
535 */
536boolean
537tgsi_check_soa_dependencies(const struct tgsi_full_instruction *inst)
538{
539   uint i, chan;
540
541   uint writemask = inst->Dst[0].Register.WriteMask;
542   if (writemask == TGSI_WRITEMASK_X ||
543       writemask == TGSI_WRITEMASK_Y ||
544       writemask == TGSI_WRITEMASK_Z ||
545       writemask == TGSI_WRITEMASK_W ||
546       writemask == TGSI_WRITEMASK_NONE) {
547      /* no chance of data dependency */
548      return FALSE;
549   }
550
551   /* loop over src regs */
552   for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
553      if ((inst->Src[i].Register.File ==
554           inst->Dst[0].Register.File) &&
555          (inst->Src[i].Register.Index ==
556           inst->Dst[0].Register.Index)) {
557         /* loop over dest channels */
558         uint channelsWritten = 0x0;
559         FOR_EACH_ENABLED_CHANNEL(*inst, chan) {
560            /* check if we're reading a channel that's been written */
561            uint swizzle = tgsi_util_get_full_src_register_swizzle(&inst->Src[i], chan);
562            if (channelsWritten & (1 << swizzle)) {
563               return TRUE;
564            }
565
566            channelsWritten |= (1 << chan);
567         }
568      }
569   }
570   return FALSE;
571}
572
573
574/**
575 * Initialize machine state by expanding tokens to full instructions,
576 * allocating temporary storage, setting up constants, etc.
577 * After this, we can call tgsi_exec_machine_run() many times.
578 */
579void
580tgsi_exec_machine_bind_shader(
581   struct tgsi_exec_machine *mach,
582   const struct tgsi_token *tokens,
583   uint numSamplers,
584   struct tgsi_sampler **samplers)
585{
586   uint k;
587   struct tgsi_parse_context parse;
588   struct tgsi_exec_labels *labels = &mach->Labels;
589   struct tgsi_full_instruction *instructions;
590   struct tgsi_full_declaration *declarations;
591   uint maxInstructions = 10, numInstructions = 0;
592   uint maxDeclarations = 10, numDeclarations = 0;
593   uint instno = 0;
594
595#if 0
596   tgsi_dump(tokens, 0);
597#endif
598
599   util_init_math();
600
601   mach->Tokens = tokens;
602   mach->Samplers = samplers;
603
604   k = tgsi_parse_init (&parse, mach->Tokens);
605   if (k != TGSI_PARSE_OK) {
606      debug_printf( "Problem parsing!\n" );
607      return;
608   }
609
610   mach->Processor = parse.FullHeader.Processor.Processor;
611   mach->ImmLimit = 0;
612   labels->count = 0;
613
614   declarations = (struct tgsi_full_declaration *)
615      MALLOC( maxDeclarations * sizeof(struct tgsi_full_declaration) );
616
617   if (!declarations) {
618      return;
619   }
620
621   instructions = (struct tgsi_full_instruction *)
622      MALLOC( maxInstructions * sizeof(struct tgsi_full_instruction) );
623
624   if (!instructions) {
625      FREE( declarations );
626      return;
627   }
628
629   while( !tgsi_parse_end_of_tokens( &parse ) ) {
630      uint pointer = parse.Position;
631      uint i;
632
633      tgsi_parse_token( &parse );
634      switch( parse.FullToken.Token.Type ) {
635      case TGSI_TOKEN_TYPE_DECLARATION:
636         /* save expanded declaration */
637         if (numDeclarations == maxDeclarations) {
638            declarations = REALLOC(declarations,
639                                   maxDeclarations
640                                   * sizeof(struct tgsi_full_declaration),
641                                   (maxDeclarations + 10)
642                                   * sizeof(struct tgsi_full_declaration));
643            maxDeclarations += 10;
644         }
645         if (parse.FullToken.FullDeclaration.Declaration.File == TGSI_FILE_OUTPUT) {
646            unsigned reg;
647            for (reg = parse.FullToken.FullDeclaration.Range.First;
648                 reg <= parse.FullToken.FullDeclaration.Range.Last;
649                 ++reg) {
650               ++mach->NumOutputs;
651            }
652         }
653         memcpy(declarations + numDeclarations,
654                &parse.FullToken.FullDeclaration,
655                sizeof(declarations[0]));
656         numDeclarations++;
657         break;
658
659      case TGSI_TOKEN_TYPE_IMMEDIATE:
660         {
661            uint size = parse.FullToken.FullImmediate.Immediate.NrTokens - 1;
662            assert( size <= 4 );
663            assert( mach->ImmLimit + 1 <= TGSI_EXEC_NUM_IMMEDIATES );
664
665            for( i = 0; i < size; i++ ) {
666               mach->Imms[mach->ImmLimit][i] =
667		  parse.FullToken.FullImmediate.u[i].Float;
668            }
669            mach->ImmLimit += 1;
670         }
671         break;
672
673      case TGSI_TOKEN_TYPE_INSTRUCTION:
674         assert( labels->count < MAX_LABELS );
675
676         labels->labels[labels->count][0] = instno;
677         labels->labels[labels->count][1] = pointer;
678         labels->count++;
679
680         /* save expanded instruction */
681         if (numInstructions == maxInstructions) {
682            instructions = REALLOC(instructions,
683                                   maxInstructions
684                                   * sizeof(struct tgsi_full_instruction),
685                                   (maxInstructions + 10)
686                                   * sizeof(struct tgsi_full_instruction));
687            maxInstructions += 10;
688         }
689
690         memcpy(instructions + numInstructions,
691                &parse.FullToken.FullInstruction,
692                sizeof(instructions[0]));
693
694         numInstructions++;
695         break;
696
697      case TGSI_TOKEN_TYPE_PROPERTY:
698         break;
699
700      default:
701         assert( 0 );
702      }
703   }
704   tgsi_parse_free (&parse);
705
706   if (mach->Declarations) {
707      FREE( mach->Declarations );
708   }
709   mach->Declarations = declarations;
710   mach->NumDeclarations = numDeclarations;
711
712   if (mach->Instructions) {
713      FREE( mach->Instructions );
714   }
715   mach->Instructions = instructions;
716   mach->NumInstructions = numInstructions;
717}
718
719
720struct tgsi_exec_machine *
721tgsi_exec_machine_create( void )
722{
723   struct tgsi_exec_machine *mach;
724   uint i;
725
726   mach = align_malloc( sizeof *mach, 16 );
727   if (!mach)
728      goto fail;
729
730   memset(mach, 0, sizeof(*mach));
731
732   mach->Addrs = &mach->Temps[TGSI_EXEC_TEMP_ADDR];
733   mach->MaxGeometryShaderOutputs = TGSI_MAX_TOTAL_VERTICES;
734   mach->Predicates = &mach->Temps[TGSI_EXEC_TEMP_P0];
735
736   /* Setup constants. */
737   for( i = 0; i < 4; i++ ) {
738      mach->Temps[TEMP_0_I].xyzw[TEMP_0_C].u[i] = 0x00000000;
739      mach->Temps[TEMP_7F_I].xyzw[TEMP_7F_C].u[i] = 0x7FFFFFFF;
740      mach->Temps[TEMP_80_I].xyzw[TEMP_80_C].u[i] = 0x80000000;
741      mach->Temps[TEMP_FF_I].xyzw[TEMP_FF_C].u[i] = 0xFFFFFFFF;
742      mach->Temps[TEMP_1_I].xyzw[TEMP_1_C].f[i] = 1.0f;
743      mach->Temps[TEMP_2_I].xyzw[TEMP_2_C].f[i] = 2.0f;
744      mach->Temps[TEMP_128_I].xyzw[TEMP_128_C].f[i] = 128.0f;
745      mach->Temps[TEMP_M128_I].xyzw[TEMP_M128_C].f[i] = -128.0f;
746      mach->Temps[TEMP_3_I].xyzw[TEMP_3_C].f[i] = 3.0f;
747      mach->Temps[TEMP_HALF_I].xyzw[TEMP_HALF_C].f[i] = 0.5f;
748   }
749
750#ifdef DEBUG
751   /* silence warnings */
752   (void) print_chan;
753   (void) print_temp;
754#endif
755
756   return mach;
757
758fail:
759   align_free(mach);
760   return NULL;
761}
762
763
764void
765tgsi_exec_machine_destroy(struct tgsi_exec_machine *mach)
766{
767   if (mach) {
768      FREE(mach->Instructions);
769      FREE(mach->Declarations);
770   }
771
772   align_free(mach);
773}
774
775static void
776micro_add(
777   union tgsi_exec_channel *dst,
778   const union tgsi_exec_channel *src0,
779   const union tgsi_exec_channel *src1 )
780{
781   dst->f[0] = src0->f[0] + src1->f[0];
782   dst->f[1] = src0->f[1] + src1->f[1];
783   dst->f[2] = src0->f[2] + src1->f[2];
784   dst->f[3] = src0->f[3] + src1->f[3];
785}
786
787static void
788micro_div(
789   union tgsi_exec_channel *dst,
790   const union tgsi_exec_channel *src0,
791   const union tgsi_exec_channel *src1 )
792{
793   if (src1->f[0] != 0) {
794      dst->f[0] = src0->f[0] / src1->f[0];
795   }
796   if (src1->f[1] != 0) {
797      dst->f[1] = src0->f[1] / src1->f[1];
798   }
799   if (src1->f[2] != 0) {
800      dst->f[2] = src0->f[2] / src1->f[2];
801   }
802   if (src1->f[3] != 0) {
803      dst->f[3] = src0->f[3] / src1->f[3];
804   }
805}
806
807static void
808micro_float_clamp(union tgsi_exec_channel *dst,
809                  const union tgsi_exec_channel *src)
810{
811   uint i;
812
813   for (i = 0; i < 4; i++) {
814      if (src->f[i] > 0.0f) {
815         if (src->f[i] > 1.884467e+019f)
816            dst->f[i] = 1.884467e+019f;
817         else if (src->f[i] < 5.42101e-020f)
818            dst->f[i] = 5.42101e-020f;
819         else
820            dst->f[i] = src->f[i];
821      }
822      else {
823         if (src->f[i] < -1.884467e+019f)
824            dst->f[i] = -1.884467e+019f;
825         else if (src->f[i] > -5.42101e-020f)
826            dst->f[i] = -5.42101e-020f;
827         else
828            dst->f[i] = src->f[i];
829      }
830   }
831}
832
833static void
834micro_lt(
835   union tgsi_exec_channel *dst,
836   const union tgsi_exec_channel *src0,
837   const union tgsi_exec_channel *src1,
838   const union tgsi_exec_channel *src2,
839   const union tgsi_exec_channel *src3 )
840{
841   dst->f[0] = src0->f[0] < src1->f[0] ? src2->f[0] : src3->f[0];
842   dst->f[1] = src0->f[1] < src1->f[1] ? src2->f[1] : src3->f[1];
843   dst->f[2] = src0->f[2] < src1->f[2] ? src2->f[2] : src3->f[2];
844   dst->f[3] = src0->f[3] < src1->f[3] ? src2->f[3] : src3->f[3];
845}
846
847static void
848micro_max(
849   union tgsi_exec_channel *dst,
850   const union tgsi_exec_channel *src0,
851   const union tgsi_exec_channel *src1 )
852{
853   dst->f[0] = src0->f[0] > src1->f[0] ? src0->f[0] : src1->f[0];
854   dst->f[1] = src0->f[1] > src1->f[1] ? src0->f[1] : src1->f[1];
855   dst->f[2] = src0->f[2] > src1->f[2] ? src0->f[2] : src1->f[2];
856   dst->f[3] = src0->f[3] > src1->f[3] ? src0->f[3] : src1->f[3];
857}
858
859static void
860micro_min(
861   union tgsi_exec_channel *dst,
862   const union tgsi_exec_channel *src0,
863   const union tgsi_exec_channel *src1 )
864{
865   dst->f[0] = src0->f[0] < src1->f[0] ? src0->f[0] : src1->f[0];
866   dst->f[1] = src0->f[1] < src1->f[1] ? src0->f[1] : src1->f[1];
867   dst->f[2] = src0->f[2] < src1->f[2] ? src0->f[2] : src1->f[2];
868   dst->f[3] = src0->f[3] < src1->f[3] ? src0->f[3] : src1->f[3];
869}
870
871static void
872micro_mul(
873   union tgsi_exec_channel *dst,
874   const union tgsi_exec_channel *src0,
875   const union tgsi_exec_channel *src1 )
876{
877   dst->f[0] = src0->f[0] * src1->f[0];
878   dst->f[1] = src0->f[1] * src1->f[1];
879   dst->f[2] = src0->f[2] * src1->f[2];
880   dst->f[3] = src0->f[3] * src1->f[3];
881}
882
#if 0
/**
 * Disabled.  Stores the int-width product src0 * src1 in dst1 and zero in
 * dst0 for each quad component; no upper product bits are computed.
 */
static void
micro_imul64(
   union tgsi_exec_channel *dst0,
   union tgsi_exec_channel *dst1,
   const union tgsi_exec_channel *src0,
   const union tgsi_exec_channel *src1 )
{
   int lane;

   for (lane = 0; lane < 4; lane++) {
      dst1->i[lane] = src0->i[lane] * src1->i[lane];
   }
   for (lane = 0; lane < 4; lane++) {
      dst0->i[lane] = 0;
   }
}
#endif
901
#if 0
/**
 * Disabled.  Stores the unsigned product src0 * src1 in dst1 and zero in
 * dst0 for each quad component; no upper product bits are computed.
 */
static void
micro_umul64(
   union tgsi_exec_channel *dst0,
   union tgsi_exec_channel *dst1,
   const union tgsi_exec_channel *src0,
   const union tgsi_exec_channel *src1 )
{
   int lane;

   for (lane = 0; lane < 4; lane++) {
      dst1->u[lane] = src0->u[lane] * src1->u[lane];
   }
   for (lane = 0; lane < 4; lane++) {
      dst0->u[lane] = 0;
   }
}
#endif
920
921
#if 0
/**
 * Disabled.  dst = src0 ? src1 : src2 on the unsigned view, for each quad
 * component.
 */
static void
micro_movc(
   union tgsi_exec_channel *dst,
   const union tgsi_exec_channel *src0,
   const union tgsi_exec_channel *src1,
   const union tgsi_exec_channel *src2 )
{
   int lane;

   for (lane = 0; lane < 4; lane++) {
      dst->u[lane] = src0->u[lane] ? src1->u[lane] : src2->u[lane];
   }
}
#endif
936
937static void
938micro_neg(
939   union tgsi_exec_channel *dst,
940   const union tgsi_exec_channel *src )
941{
942   dst->f[0] = -src->f[0];
943   dst->f[1] = -src->f[1];
944   dst->f[2] = -src->f[2];
945   dst->f[3] = -src->f[3];
946}
947
948static void
949micro_pow(
950   union tgsi_exec_channel *dst,
951   const union tgsi_exec_channel *src0,
952   const union tgsi_exec_channel *src1 )
953{
954#if FAST_MATH
955   dst->f[0] = util_fast_pow( src0->f[0], src1->f[0] );
956   dst->f[1] = util_fast_pow( src0->f[1], src1->f[1] );
957   dst->f[2] = util_fast_pow( src0->f[2], src1->f[2] );
958   dst->f[3] = util_fast_pow( src0->f[3], src1->f[3] );
959#else
960   dst->f[0] = powf( src0->f[0], src1->f[0] );
961   dst->f[1] = powf( src0->f[1], src1->f[1] );
962   dst->f[2] = powf( src0->f[2], src1->f[2] );
963   dst->f[3] = powf( src0->f[3], src1->f[3] );
964#endif
965}
966
967static void
968micro_sqrt( union tgsi_exec_channel *dst,
969            const union tgsi_exec_channel *src )
970{
971   dst->f[0] = sqrtf( src->f[0] );
972   dst->f[1] = sqrtf( src->f[1] );
973   dst->f[2] = sqrtf( src->f[2] );
974   dst->f[3] = sqrtf( src->f[3] );
975}
976
977static void
978micro_sub(
979   union tgsi_exec_channel *dst,
980   const union tgsi_exec_channel *src0,
981   const union tgsi_exec_channel *src1 )
982{
983   dst->f[0] = src0->f[0] - src1->f[0];
984   dst->f[1] = src0->f[1] - src1->f[1];
985   dst->f[2] = src0->f[2] - src1->f[2];
986   dst->f[3] = src0->f[3] - src1->f[3];
987}
988
989static void
990fetch_src_file_channel(const struct tgsi_exec_machine *mach,
991                       const uint file,
992                       const uint swizzle,
993                       const union tgsi_exec_channel *index,
994                       const union tgsi_exec_channel *index2D,
995                       union tgsi_exec_channel *chan)
996{
997   uint i;
998
999   switch (file) {
1000   case TGSI_FILE_CONSTANT:
1001      for (i = 0; i < QUAD_SIZE; i++) {
1002         assert(index2D->i[i] >= 0 && index2D->i[i] < PIPE_MAX_CONSTANT_BUFFERS);
1003         assert(mach->Consts[index2D->i[i]]);
1004
1005         if (index->i[i] < 0) {
1006            chan->u[i] = 0;
1007         } else {
1008            const uint *p = (const uint *)mach->Consts[index2D->i[i]];
1009
1010            chan->u[i] = p[index->i[i] * 4 + swizzle];
1011         }
1012      }
1013      break;
1014
1015   case TGSI_FILE_INPUT:
1016   case TGSI_FILE_SYSTEM_VALUE:
1017      for (i = 0; i < QUAD_SIZE; i++) {
1018         /* XXX: 2D indexing */
1019         chan->u[i] = mach->Inputs[index2D->i[i] * TGSI_EXEC_MAX_INPUT_ATTRIBS + index->i[i]].xyzw[swizzle].u[i];
1020      }
1021      break;
1022
1023   case TGSI_FILE_TEMPORARY:
1024      for (i = 0; i < QUAD_SIZE; i++) {
1025         assert(index->i[i] < TGSI_EXEC_NUM_TEMPS);
1026         assert(index2D->i[i] == 0);
1027
1028         chan->u[i] = mach->Temps[index->i[i]].xyzw[swizzle].u[i];
1029      }
1030      break;
1031
1032   case TGSI_FILE_IMMEDIATE:
1033      for (i = 0; i < QUAD_SIZE; i++) {
1034         assert(index->i[i] >= 0 && index->i[i] < (int)mach->ImmLimit);
1035         assert(index2D->i[i] == 0);
1036
1037         chan->f[i] = mach->Imms[index->i[i]][swizzle];
1038      }
1039      break;
1040
1041   case TGSI_FILE_ADDRESS:
1042      for (i = 0; i < QUAD_SIZE; i++) {
1043         assert(index->i[i] >= 0);
1044         assert(index2D->i[i] == 0);
1045
1046         chan->u[i] = mach->Addrs[index->i[i]].xyzw[swizzle].u[i];
1047      }
1048      break;
1049
1050   case TGSI_FILE_PREDICATE:
1051      for (i = 0; i < QUAD_SIZE; i++) {
1052         assert(index->i[i] >= 0 && index->i[i] < TGSI_EXEC_NUM_PREDS);
1053         assert(index2D->i[i] == 0);
1054
1055         chan->u[i] = mach->Predicates[0].xyzw[swizzle].u[i];
1056      }
1057      break;
1058
1059   case TGSI_FILE_OUTPUT:
1060      /* vertex/fragment output vars can be read too */
1061      for (i = 0; i < QUAD_SIZE; i++) {
1062         assert(index->i[i] >= 0);
1063         assert(index2D->i[i] == 0);
1064
1065         chan->u[i] = mach->Outputs[index->i[i]].xyzw[swizzle].u[i];
1066      }
1067      break;
1068
1069   default:
1070      assert(0);
1071      for (i = 0; i < QUAD_SIZE; i++) {
1072         chan->u[i] = 0;
1073      }
1074   }
1075}
1076
1077static void
1078fetch_source(const struct tgsi_exec_machine *mach,
1079             union tgsi_exec_channel *chan,
1080             const struct tgsi_full_src_register *reg,
1081             const uint chan_index,
1082             enum tgsi_exec_datatype src_datatype)
1083{
1084   union tgsi_exec_channel index;
1085   union tgsi_exec_channel index2D;
1086   uint swizzle;
1087
1088   /* We start with a direct index into a register file.
1089    *
1090    *    file[1],
1091    *    where:
1092    *       file = Register.File
1093    *       [1] = Register.Index
1094    */
1095   index.i[0] =
1096   index.i[1] =
1097   index.i[2] =
1098   index.i[3] = reg->Register.Index;
1099
1100   /* There is an extra source register that indirectly subscripts
1101    * a register file. The direct index now becomes an offset
1102    * that is being added to the indirect register.
1103    *
1104    *    file[ind[2].x+1],
1105    *    where:
1106    *       ind = Indirect.File
1107    *       [2] = Indirect.Index
1108    *       .x = Indirect.SwizzleX
1109    */
1110   if (reg->Register.Indirect) {
1111      union tgsi_exec_channel index2;
1112      union tgsi_exec_channel indir_index;
1113      const uint execmask = mach->ExecMask;
1114      uint i;
1115
1116      /* which address register (always zero now) */
1117      index2.i[0] =
1118      index2.i[1] =
1119      index2.i[2] =
1120      index2.i[3] = reg->Indirect.Index;
1121
1122      /* get current value of address register[swizzle] */
1123      swizzle = tgsi_util_get_src_register_swizzle( &reg->Indirect, CHAN_X );
1124      fetch_src_file_channel(mach,
1125                             reg->Indirect.File,
1126                             swizzle,
1127                             &index2,
1128                             &ZeroVec,
1129                             &indir_index);
1130
1131      /* add value of address register to the offset */
1132      index.i[0] += indir_index.i[0];
1133      index.i[1] += indir_index.i[1];
1134      index.i[2] += indir_index.i[2];
1135      index.i[3] += indir_index.i[3];
1136
1137      /* for disabled execution channels, zero-out the index to
1138       * avoid using a potential garbage value.
1139       */
1140      for (i = 0; i < QUAD_SIZE; i++) {
1141         if ((execmask & (1 << i)) == 0)
1142            index.i[i] = 0;
1143      }
1144   }
1145
1146   /* There is an extra source register that is a second
1147    * subscript to a register file. Effectively it means that
1148    * the register file is actually a 2D array of registers.
1149    *
1150    *    file[3][1],
1151    *    where:
1152    *       [3] = Dimension.Index
1153    */
1154   if (reg->Register.Dimension) {
1155      index2D.i[0] =
1156      index2D.i[1] =
1157      index2D.i[2] =
1158      index2D.i[3] = reg->Dimension.Index;
1159
1160      /* Again, the second subscript index can be addressed indirectly
1161       * identically to the first one.
1162       * Nothing stops us from indirectly addressing the indirect register,
1163       * but there is no need for that, so we won't exercise it.
1164       *
1165       *    file[ind[4].y+3][1],
1166       *    where:
1167       *       ind = DimIndirect.File
1168       *       [4] = DimIndirect.Index
1169       *       .y = DimIndirect.SwizzleX
1170       */
1171      if (reg->Dimension.Indirect) {
1172         union tgsi_exec_channel index2;
1173         union tgsi_exec_channel indir_index;
1174         const uint execmask = mach->ExecMask;
1175         uint i;
1176
1177         index2.i[0] =
1178         index2.i[1] =
1179         index2.i[2] =
1180         index2.i[3] = reg->DimIndirect.Index;
1181
1182         swizzle = tgsi_util_get_src_register_swizzle( &reg->DimIndirect, CHAN_X );
1183         fetch_src_file_channel(mach,
1184                                reg->DimIndirect.File,
1185                                swizzle,
1186                                &index2,
1187                                &ZeroVec,
1188                                &indir_index);
1189
1190         index2D.i[0] += indir_index.i[0];
1191         index2D.i[1] += indir_index.i[1];
1192         index2D.i[2] += indir_index.i[2];
1193         index2D.i[3] += indir_index.i[3];
1194
1195         /* for disabled execution channels, zero-out the index to
1196          * avoid using a potential garbage value.
1197          */
1198         for (i = 0; i < QUAD_SIZE; i++) {
1199            if ((execmask & (1 << i)) == 0) {
1200               index2D.i[i] = 0;
1201            }
1202         }
1203      }
1204
1205      /* If by any chance there was a need for a 3D array of register
1206       * files, we would have to check whether Dimension is followed
1207       * by a dimension register and continue the saga.
1208       */
1209   } else {
1210      index2D.i[0] =
1211      index2D.i[1] =
1212      index2D.i[2] =
1213      index2D.i[3] = 0;
1214   }
1215
1216   swizzle = tgsi_util_get_full_src_register_swizzle( reg, chan_index );
1217   fetch_src_file_channel(mach,
1218                          reg->Register.File,
1219                          swizzle,
1220                          &index,
1221                          &index2D,
1222                          chan);
1223
1224   if (reg->Register.Absolute) {
1225      if (src_datatype == TGSI_EXEC_DATA_FLOAT) {
1226         micro_abs(chan, chan);
1227      } else {
1228         micro_iabs(chan, chan);
1229      }
1230   }
1231
1232   if (reg->Register.Negate) {
1233      if (src_datatype == TGSI_EXEC_DATA_FLOAT) {
1234         micro_neg(chan, chan);
1235      } else {
1236         micro_ineg(chan, chan);
1237      }
1238   }
1239}
1240
1241static void
1242store_dest(struct tgsi_exec_machine *mach,
1243           const union tgsi_exec_channel *chan,
1244           const struct tgsi_full_dst_register *reg,
1245           const struct tgsi_full_instruction *inst,
1246           uint chan_index,
1247           enum tgsi_exec_datatype dst_datatype)
1248{
1249   uint i;
1250   union tgsi_exec_channel null;
1251   union tgsi_exec_channel *dst;
1252   uint execmask = mach->ExecMask;
1253   int offset = 0;  /* indirection offset */
1254   int index;
1255
1256   /* for debugging */
1257   if (0 && dst_datatype == TGSI_EXEC_DATA_FLOAT) {
1258      check_inf_or_nan(chan);
1259   }
1260
1261   /* There is an extra source register that indirectly subscripts
1262    * a register file. The direct index now becomes an offset
1263    * that is being added to the indirect register.
1264    *
1265    *    file[ind[2].x+1],
1266    *    where:
1267    *       ind = Indirect.File
1268    *       [2] = Indirect.Index
1269    *       .x = Indirect.SwizzleX
1270    */
1271   if (reg->Register.Indirect) {
1272      union tgsi_exec_channel index;
1273      union tgsi_exec_channel indir_index;
1274      uint swizzle;
1275
1276      /* which address register (always zero for now) */
1277      index.i[0] =
1278      index.i[1] =
1279      index.i[2] =
1280      index.i[3] = reg->Indirect.Index;
1281
1282      /* get current value of address register[swizzle] */
1283      swizzle = tgsi_util_get_src_register_swizzle( &reg->Indirect, CHAN_X );
1284
1285      /* fetch values from the address/indirection register */
1286      fetch_src_file_channel(mach,
1287                             reg->Indirect.File,
1288                             swizzle,
1289                             &index,
1290                             &ZeroVec,
1291                             &indir_index);
1292
1293      /* save indirection offset */
1294      offset = indir_index.i[0];
1295   }
1296
1297   switch (reg->Register.File) {
1298   case TGSI_FILE_NULL:
1299      dst = &null;
1300      break;
1301
1302   case TGSI_FILE_OUTPUT:
1303      index = mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0]
1304         + reg->Register.Index;
1305      dst = &mach->Outputs[offset + index].xyzw[chan_index];
1306#if 0
1307      if (TGSI_PROCESSOR_GEOMETRY == mach->Processor) {
1308         fprintf(stderr, "STORING OUT[%d] mask(%d), = (", offset + index, execmask);
1309         for (i = 0; i < QUAD_SIZE; i++)
1310            if (execmask & (1 << i))
1311               fprintf(stderr, "%f, ", chan->f[i]);
1312         fprintf(stderr, ")\n");
1313      }
1314#endif
1315      break;
1316
1317   case TGSI_FILE_TEMPORARY:
1318      index = reg->Register.Index;
1319      assert( index < TGSI_EXEC_NUM_TEMPS );
1320      dst = &mach->Temps[offset + index].xyzw[chan_index];
1321      break;
1322
1323   case TGSI_FILE_ADDRESS:
1324      index = reg->Register.Index;
1325      dst = &mach->Addrs[index].xyzw[chan_index];
1326      break;
1327
1328   case TGSI_FILE_LOOP:
1329      assert(reg->Register.Index == 0);
1330      assert(mach->LoopCounterStackTop > 0);
1331      assert(chan_index == CHAN_X);
1332      dst = &mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[chan_index];
1333      break;
1334
1335   case TGSI_FILE_PREDICATE:
1336      index = reg->Register.Index;
1337      assert(index < TGSI_EXEC_NUM_PREDS);
1338      dst = &mach->Predicates[index].xyzw[chan_index];
1339      break;
1340
1341   default:
1342      assert( 0 );
1343      return;
1344   }
1345
1346   if (inst->Instruction.Predicate) {
1347      uint swizzle;
1348      union tgsi_exec_channel *pred;
1349
1350      switch (chan_index) {
1351      case CHAN_X:
1352         swizzle = inst->Predicate.SwizzleX;
1353         break;
1354      case CHAN_Y:
1355         swizzle = inst->Predicate.SwizzleY;
1356         break;
1357      case CHAN_Z:
1358         swizzle = inst->Predicate.SwizzleZ;
1359         break;
1360      case CHAN_W:
1361         swizzle = inst->Predicate.SwizzleW;
1362         break;
1363      default:
1364         assert(0);
1365         return;
1366      }
1367
1368      assert(inst->Predicate.Index == 0);
1369
1370      pred = &mach->Predicates[inst->Predicate.Index].xyzw[swizzle];
1371
1372      if (inst->Predicate.Negate) {
1373         for (i = 0; i < QUAD_SIZE; i++) {
1374            if (pred->u[i]) {
1375               execmask &= ~(1 << i);
1376            }
1377         }
1378      } else {
1379         for (i = 0; i < QUAD_SIZE; i++) {
1380            if (!pred->u[i]) {
1381               execmask &= ~(1 << i);
1382            }
1383         }
1384      }
1385   }
1386
1387   switch (inst->Instruction.Saturate) {
1388   case TGSI_SAT_NONE:
1389      for (i = 0; i < QUAD_SIZE; i++)
1390         if (execmask & (1 << i))
1391            dst->i[i] = chan->i[i];
1392      break;
1393
1394   case TGSI_SAT_ZERO_ONE:
1395      for (i = 0; i < QUAD_SIZE; i++)
1396         if (execmask & (1 << i)) {
1397            if (chan->f[i] < 0.0f)
1398               dst->f[i] = 0.0f;
1399            else if (chan->f[i] > 1.0f)
1400               dst->f[i] = 1.0f;
1401            else
1402               dst->i[i] = chan->i[i];
1403         }
1404      break;
1405
1406   case TGSI_SAT_MINUS_PLUS_ONE:
1407      for (i = 0; i < QUAD_SIZE; i++)
1408         if (execmask & (1 << i)) {
1409            if (chan->f[i] < -1.0f)
1410               dst->f[i] = -1.0f;
1411            else if (chan->f[i] > 1.0f)
1412               dst->f[i] = 1.0f;
1413            else
1414               dst->i[i] = chan->i[i];
1415         }
1416      break;
1417
1418   default:
1419      assert( 0 );
1420   }
1421}
1422
/* Shorthands for float-typed operand access.  Both expand to code that
 * refers to `mach` and `inst`, which must be in scope at the point of use.
 *
 * FETCH reads channel CHAN of source operand INDEX into VAL;
 * STORE writes VAL to channel CHAN of destination operand INDEX.
 */
#define FETCH(VAL, INDEX, CHAN) \
   fetch_source(mach, VAL, &inst->Src[INDEX], CHAN, TGSI_EXEC_DATA_FLOAT)

#define STORE(VAL, INDEX, CHAN) \
   store_dest(mach, VAL, &inst->Dst[INDEX], inst, CHAN, TGSI_EXEC_DATA_FLOAT)
1428
1429
1430/**
1431 * Execute ARB-style KIL which is predicated by a src register.
1432 * Kill fragment if any of the four values is less than zero.
1433 */
1434static void
1435exec_kil(struct tgsi_exec_machine *mach,
1436         const struct tgsi_full_instruction *inst)
1437{
1438   uint uniquemask;
1439   uint chan_index;
1440   uint kilmask = 0; /* bit 0 = pixel 0, bit 1 = pixel 1, etc */
1441   union tgsi_exec_channel r[1];
1442
1443   /* This mask stores component bits that were already tested. */
1444   uniquemask = 0;
1445
1446   for (chan_index = 0; chan_index < 4; chan_index++)
1447   {
1448      uint swizzle;
1449      uint i;
1450
1451      /* unswizzle channel */
1452      swizzle = tgsi_util_get_full_src_register_swizzle (
1453                        &inst->Src[0],
1454                        chan_index);
1455
1456      /* check if the component has not been already tested */
1457      if (uniquemask & (1 << swizzle))
1458         continue;
1459      uniquemask |= 1 << swizzle;
1460
1461      FETCH(&r[0], 0, chan_index);
1462      for (i = 0; i < 4; i++)
1463         if (r[0].f[i] < 0.0f)
1464            kilmask |= 1 << i;
1465   }
1466
1467   mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] |= kilmask;
1468}
1469
1470/**
1471 * Execute NVIDIA-style KIL which is predicated by a condition code.
1472 * Kill fragment if the condition code is TRUE.
1473 */
1474static void
1475exec_kilp(struct tgsi_exec_machine *mach,
1476          const struct tgsi_full_instruction *inst)
1477{
1478   uint kilmask; /* bit 0 = pixel 0, bit 1 = pixel 1, etc */
1479
1480   /* "unconditional" kil */
1481   kilmask = mach->ExecMask;
1482   mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] |= kilmask;
1483}
1484
1485static void
1486emit_vertex(struct tgsi_exec_machine *mach)
1487{
1488   /* FIXME: check for exec mask correctly
1489   unsigned i;
1490   for (i = 0; i < QUAD_SIZE; ++i) {
1491         if ((mach->ExecMask & (1 << i)))
1492   */
1493   if (mach->ExecMask) {
1494      mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] += mach->NumOutputs;
1495      mach->Primitives[mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]]++;
1496   }
1497}
1498
1499static void
1500emit_primitive(struct tgsi_exec_machine *mach)
1501{
1502   unsigned *prim_count = &mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0];
1503   /* FIXME: check for exec mask correctly
1504   unsigned i;
1505   for (i = 0; i < QUAD_SIZE; ++i) {
1506         if ((mach->ExecMask & (1 << i)))
1507   */
1508   if (mach->ExecMask) {
1509      ++(*prim_count);
1510      debug_assert((*prim_count * mach->NumOutputs) < mach->MaxGeometryShaderOutputs);
1511      mach->Primitives[*prim_count] = 0;
1512   }
1513}
1514
1515/*
1516 * Fetch four texture samples using STR texture coordinates.
1517 */
1518static void
1519fetch_texel( struct tgsi_sampler *sampler,
1520             const union tgsi_exec_channel *s,
1521             const union tgsi_exec_channel *t,
1522             const union tgsi_exec_channel *p,
1523             const union tgsi_exec_channel *c0,
1524             enum tgsi_sampler_control control,
1525             union tgsi_exec_channel *r,
1526             union tgsi_exec_channel *g,
1527             union tgsi_exec_channel *b,
1528             union tgsi_exec_channel *a )
1529{
1530   uint j;
1531   float rgba[NUM_CHANNELS][QUAD_SIZE];
1532
1533   sampler->get_samples(sampler, s->f, t->f, p->f, c0->f, control, rgba);
1534
1535   for (j = 0; j < 4; j++) {
1536      r->f[j] = rgba[0][j];
1537      g->f[j] = rgba[1][j];
1538      b->f[j] = rgba[2][j];
1539      a->f[j] = rgba[3][j];
1540   }
1541}
1542
1543
1544#define TEX_MODIFIER_NONE           0
1545#define TEX_MODIFIER_PROJECTED      1
1546#define TEX_MODIFIER_LOD_BIAS       2
1547#define TEX_MODIFIER_EXPLICIT_LOD   3
1548
1549
1550static void
1551exec_tex(struct tgsi_exec_machine *mach,
1552         const struct tgsi_full_instruction *inst,
1553         uint modifier)
1554{
1555   const uint unit = inst->Src[1].Register.Index;
1556   union tgsi_exec_channel r[4];
1557   const union tgsi_exec_channel *lod = &ZeroVec;
1558   enum tgsi_sampler_control control;
1559   uint chan_index;
1560
1561   if (modifier != TEX_MODIFIER_NONE) {
1562      FETCH(&r[3], 0, CHAN_W);
1563      if (modifier != TEX_MODIFIER_PROJECTED) {
1564         lod = &r[3];
1565      }
1566   }
1567
1568   if (modifier == TEX_MODIFIER_EXPLICIT_LOD) {
1569      control = tgsi_sampler_lod_explicit;
1570   } else {
1571      control = tgsi_sampler_lod_bias;
1572   }
1573
1574   switch (inst->Texture.Texture) {
1575   case TGSI_TEXTURE_1D:
1576   case TGSI_TEXTURE_SHADOW1D:
1577      FETCH(&r[0], 0, CHAN_X);
1578
1579      if (modifier == TEX_MODIFIER_PROJECTED) {
1580         micro_div(&r[0], &r[0], &r[3]);
1581      }
1582
1583      fetch_texel(mach->Samplers[unit],
1584                  &r[0], &ZeroVec, &ZeroVec, lod,  /* S, T, P, LOD */
1585                  control,
1586                  &r[0], &r[1], &r[2], &r[3]);     /* R, G, B, A */
1587      break;
1588
1589   case TGSI_TEXTURE_2D:
1590   case TGSI_TEXTURE_RECT:
1591   case TGSI_TEXTURE_SHADOW2D:
1592   case TGSI_TEXTURE_SHADOWRECT:
1593      FETCH(&r[0], 0, CHAN_X);
1594      FETCH(&r[1], 0, CHAN_Y);
1595      FETCH(&r[2], 0, CHAN_Z);
1596
1597      if (modifier == TEX_MODIFIER_PROJECTED) {
1598         micro_div(&r[0], &r[0], &r[3]);
1599         micro_div(&r[1], &r[1], &r[3]);
1600         micro_div(&r[2], &r[2], &r[3]);
1601      }
1602
1603      fetch_texel(mach->Samplers[unit],
1604                  &r[0], &r[1], &r[2], lod,     /* S, T, P, LOD */
1605                  control,
1606                  &r[0], &r[1], &r[2], &r[3]);  /* outputs */
1607      break;
1608
1609   case TGSI_TEXTURE_3D:
1610   case TGSI_TEXTURE_CUBE:
1611      FETCH(&r[0], 0, CHAN_X);
1612      FETCH(&r[1], 0, CHAN_Y);
1613      FETCH(&r[2], 0, CHAN_Z);
1614
1615      if (modifier == TEX_MODIFIER_PROJECTED) {
1616         micro_div(&r[0], &r[0], &r[3]);
1617         micro_div(&r[1], &r[1], &r[3]);
1618         micro_div(&r[2], &r[2], &r[3]);
1619      }
1620
1621      fetch_texel(mach->Samplers[unit],
1622                  &r[0], &r[1], &r[2], lod,
1623                  control,
1624                  &r[0], &r[1], &r[2], &r[3]);
1625      break;
1626
1627   default:
1628      assert(0);
1629   }
1630
1631   FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
1632      STORE(&r[chan_index], 0, chan_index);
1633   }
1634}
1635
1636static void
1637exec_txd(struct tgsi_exec_machine *mach,
1638         const struct tgsi_full_instruction *inst)
1639{
1640   const uint unit = inst->Src[3].Register.Index;
1641   union tgsi_exec_channel r[4];
1642   uint chan_index;
1643
1644   /*
1645    * XXX: This is fake TXD -- the derivatives are not taken into account, yet.
1646    */
1647
1648   switch (inst->Texture.Texture) {
1649   case TGSI_TEXTURE_1D:
1650   case TGSI_TEXTURE_SHADOW1D:
1651
1652      FETCH(&r[0], 0, CHAN_X);
1653
1654      fetch_texel(mach->Samplers[unit],
1655                  &r[0], &ZeroVec, &ZeroVec, &ZeroVec,   /* S, T, P, BIAS */
1656                  tgsi_sampler_lod_bias,
1657                  &r[0], &r[1], &r[2], &r[3]);           /* R, G, B, A */
1658      break;
1659
1660   case TGSI_TEXTURE_2D:
1661   case TGSI_TEXTURE_RECT:
1662   case TGSI_TEXTURE_SHADOW2D:
1663   case TGSI_TEXTURE_SHADOWRECT:
1664
1665      FETCH(&r[0], 0, CHAN_X);
1666      FETCH(&r[1], 0, CHAN_Y);
1667      FETCH(&r[2], 0, CHAN_Z);
1668
1669      fetch_texel(mach->Samplers[unit],
1670                  &r[0], &r[1], &r[2], &ZeroVec,   /* inputs */
1671                  tgsi_sampler_lod_bias,
1672                  &r[0], &r[1], &r[2], &r[3]);     /* outputs */
1673      break;
1674
1675   case TGSI_TEXTURE_3D:
1676   case TGSI_TEXTURE_CUBE:
1677
1678      FETCH(&r[0], 0, CHAN_X);
1679      FETCH(&r[1], 0, CHAN_Y);
1680      FETCH(&r[2], 0, CHAN_Z);
1681
1682      fetch_texel(mach->Samplers[unit],
1683                  &r[0], &r[1], &r[2], &ZeroVec,
1684                  tgsi_sampler_lod_bias,
1685                  &r[0], &r[1], &r[2], &r[3]);
1686      break;
1687
1688   default:
1689      assert(0);
1690   }
1691
1692   FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
1693      STORE(&r[chan_index], 0, chan_index);
1694   }
1695}
1696
1697
1698/**
1699 * Evaluate a constant-valued coefficient at the position of the
1700 * current quad.
1701 */
1702static void
1703eval_constant_coef(
1704   struct tgsi_exec_machine *mach,
1705   unsigned attrib,
1706   unsigned chan )
1707{
1708   unsigned i;
1709
1710   for( i = 0; i < QUAD_SIZE; i++ ) {
1711      mach->Inputs[attrib].xyzw[chan].f[i] = mach->InterpCoefs[attrib].a0[chan];
1712   }
1713}
1714
1715/**
1716 * Evaluate a linear-valued coefficient at the position of the
1717 * current quad.
1718 */
1719static void
1720eval_linear_coef(
1721   struct tgsi_exec_machine *mach,
1722   unsigned attrib,
1723   unsigned chan )
1724{
1725   const float x = mach->QuadPos.xyzw[0].f[0];
1726   const float y = mach->QuadPos.xyzw[1].f[0];
1727   const float dadx = mach->InterpCoefs[attrib].dadx[chan];
1728   const float dady = mach->InterpCoefs[attrib].dady[chan];
1729   const float a0 = mach->InterpCoefs[attrib].a0[chan] + dadx * x + dady * y;
1730   mach->Inputs[attrib].xyzw[chan].f[0] = a0;
1731   mach->Inputs[attrib].xyzw[chan].f[1] = a0 + dadx;
1732   mach->Inputs[attrib].xyzw[chan].f[2] = a0 + dady;
1733   mach->Inputs[attrib].xyzw[chan].f[3] = a0 + dadx + dady;
1734}
1735
1736/**
1737 * Evaluate a perspective-valued coefficient at the position of the
1738 * current quad.
1739 */
1740static void
1741eval_perspective_coef(
1742   struct tgsi_exec_machine *mach,
1743   unsigned attrib,
1744   unsigned chan )
1745{
1746   const float x = mach->QuadPos.xyzw[0].f[0];
1747   const float y = mach->QuadPos.xyzw[1].f[0];
1748   const float dadx = mach->InterpCoefs[attrib].dadx[chan];
1749   const float dady = mach->InterpCoefs[attrib].dady[chan];
1750   const float a0 = mach->InterpCoefs[attrib].a0[chan] + dadx * x + dady * y;
1751   const float *w = mach->QuadPos.xyzw[3].f;
1752   /* divide by W here */
1753   mach->Inputs[attrib].xyzw[chan].f[0] = a0 / w[0];
1754   mach->Inputs[attrib].xyzw[chan].f[1] = (a0 + dadx) / w[1];
1755   mach->Inputs[attrib].xyzw[chan].f[2] = (a0 + dady) / w[2];
1756   mach->Inputs[attrib].xyzw[chan].f[3] = (a0 + dadx + dady) / w[3];
1757}
1758
1759
1760typedef void (* eval_coef_func)(
1761   struct tgsi_exec_machine *mach,
1762   unsigned attrib,
1763   unsigned chan );
1764
1765static void
1766exec_declaration(struct tgsi_exec_machine *mach,
1767                 const struct tgsi_full_declaration *decl)
1768{
1769   if (mach->Processor == TGSI_PROCESSOR_FRAGMENT) {
1770      if (decl->Declaration.File == TGSI_FILE_INPUT ||
1771          decl->Declaration.File == TGSI_FILE_SYSTEM_VALUE) {
1772         uint first, last, mask;
1773
1774         first = decl->Range.First;
1775         last = decl->Range.Last;
1776         mask = decl->Declaration.UsageMask;
1777
1778         if (decl->Semantic.Name == TGSI_SEMANTIC_FACE) {
1779            uint i;
1780
1781            assert(decl->Semantic.Index == 0);
1782            assert(first == last);
1783
1784            for (i = 0; i < QUAD_SIZE; i++) {
1785               mach->Inputs[first].xyzw[0].f[i] = mach->Face;
1786            }
1787         } else {
1788            eval_coef_func eval;
1789            uint i, j;
1790
1791            switch (decl->Declaration.Interpolate) {
1792            case TGSI_INTERPOLATE_CONSTANT:
1793               eval = eval_constant_coef;
1794               break;
1795
1796            case TGSI_INTERPOLATE_LINEAR:
1797               eval = eval_linear_coef;
1798               break;
1799
1800            case TGSI_INTERPOLATE_PERSPECTIVE:
1801               eval = eval_perspective_coef;
1802               break;
1803
1804            default:
1805               assert(0);
1806               return;
1807            }
1808
1809            for (j = 0; j < NUM_CHANNELS; j++) {
1810               if (mask & (1 << j)) {
1811                  for (i = first; i <= last; i++) {
1812                     eval(mach, i, j);
1813                  }
1814               }
1815            }
1816         }
1817      }
1818   }
1819}
1820
1821typedef void (* micro_unary_op)(union tgsi_exec_channel *dst,
1822                                const union tgsi_exec_channel *src);
1823
1824static void
1825exec_scalar_unary(struct tgsi_exec_machine *mach,
1826                  const struct tgsi_full_instruction *inst,
1827                  micro_unary_op op,
1828                  enum tgsi_exec_datatype dst_datatype,
1829                  enum tgsi_exec_datatype src_datatype)
1830{
1831   unsigned int chan;
1832   union tgsi_exec_channel src;
1833   union tgsi_exec_channel dst;
1834
1835   fetch_source(mach, &src, &inst->Src[0], CHAN_X, src_datatype);
1836   op(&dst, &src);
1837   for (chan = 0; chan < NUM_CHANNELS; chan++) {
1838      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
1839         store_dest(mach, &dst, &inst->Dst[0], inst, chan, dst_datatype);
1840      }
1841   }
1842}
1843
1844static void
1845exec_vector_unary(struct tgsi_exec_machine *mach,
1846                  const struct tgsi_full_instruction *inst,
1847                  micro_unary_op op,
1848                  enum tgsi_exec_datatype dst_datatype,
1849                  enum tgsi_exec_datatype src_datatype)
1850{
1851   unsigned int chan;
1852   struct tgsi_exec_vector dst;
1853
1854   for (chan = 0; chan < NUM_CHANNELS; chan++) {
1855      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
1856         union tgsi_exec_channel src;
1857
1858         fetch_source(mach, &src, &inst->Src[0], chan, src_datatype);
1859         op(&dst.xyzw[chan], &src);
1860      }
1861   }
1862   for (chan = 0; chan < NUM_CHANNELS; chan++) {
1863      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
1864         store_dest(mach, &dst.xyzw[chan], &inst->Dst[0], inst, chan, dst_datatype);
1865      }
1866   }
1867}
1868
1869typedef void (* micro_binary_op)(union tgsi_exec_channel *dst,
1870                                 const union tgsi_exec_channel *src0,
1871                                 const union tgsi_exec_channel *src1);
1872
1873static void
1874exec_vector_binary(struct tgsi_exec_machine *mach,
1875                   const struct tgsi_full_instruction *inst,
1876                   micro_binary_op op,
1877                   enum tgsi_exec_datatype dst_datatype,
1878                   enum tgsi_exec_datatype src_datatype)
1879{
1880   unsigned int chan;
1881   struct tgsi_exec_vector dst;
1882
1883   for (chan = 0; chan < NUM_CHANNELS; chan++) {
1884      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
1885         union tgsi_exec_channel src[2];
1886
1887         fetch_source(mach, &src[0], &inst->Src[0], chan, src_datatype);
1888         fetch_source(mach, &src[1], &inst->Src[1], chan, src_datatype);
1889         op(&dst.xyzw[chan], &src[0], &src[1]);
1890      }
1891   }
1892   for (chan = 0; chan < NUM_CHANNELS; chan++) {
1893      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
1894         store_dest(mach, &dst.xyzw[chan], &inst->Dst[0], inst, chan, dst_datatype);
1895      }
1896   }
1897}
1898
1899typedef void (* micro_trinary_op)(union tgsi_exec_channel *dst,
1900                                  const union tgsi_exec_channel *src0,
1901                                  const union tgsi_exec_channel *src1,
1902                                  const union tgsi_exec_channel *src2);
1903
1904static void
1905exec_vector_trinary(struct tgsi_exec_machine *mach,
1906                    const struct tgsi_full_instruction *inst,
1907                    micro_trinary_op op,
1908                    enum tgsi_exec_datatype dst_datatype,
1909                    enum tgsi_exec_datatype src_datatype)
1910{
1911   unsigned int chan;
1912   struct tgsi_exec_vector dst;
1913
1914   for (chan = 0; chan < NUM_CHANNELS; chan++) {
1915      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
1916         union tgsi_exec_channel src[3];
1917
1918         fetch_source(mach, &src[0], &inst->Src[0], chan, src_datatype);
1919         fetch_source(mach, &src[1], &inst->Src[1], chan, src_datatype);
1920         fetch_source(mach, &src[2], &inst->Src[2], chan, src_datatype);
1921         op(&dst.xyzw[chan], &src[0], &src[1], &src[2]);
1922      }
1923   }
1924   for (chan = 0; chan < NUM_CHANNELS; chan++) {
1925      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
1926         store_dest(mach, &dst.xyzw[chan], &inst->Dst[0], inst, chan, dst_datatype);
1927      }
1928   }
1929}
1930
1931static void
1932exec_dp3(struct tgsi_exec_machine *mach,
1933         const struct tgsi_full_instruction *inst)
1934{
1935   unsigned int chan;
1936   union tgsi_exec_channel arg[3];
1937
1938   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
1939   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_X, TGSI_EXEC_DATA_FLOAT);
1940   micro_mul(&arg[2], &arg[0], &arg[1]);
1941
1942   for (chan = CHAN_Y; chan <= CHAN_Z; chan++) {
1943      fetch_source(mach, &arg[0], &inst->Src[0], chan, TGSI_EXEC_DATA_FLOAT);
1944      fetch_source(mach, &arg[1], &inst->Src[1], chan, TGSI_EXEC_DATA_FLOAT);
1945      micro_mad(&arg[2], &arg[0], &arg[1], &arg[2]);
1946   }
1947
1948   for (chan = 0; chan < NUM_CHANNELS; chan++) {
1949      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
1950         store_dest(mach, &arg[2], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
1951      }
1952   }
1953}
1954
1955static void
1956exec_dp4(struct tgsi_exec_machine *mach,
1957         const struct tgsi_full_instruction *inst)
1958{
1959   unsigned int chan;
1960   union tgsi_exec_channel arg[3];
1961
1962   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
1963   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_X, TGSI_EXEC_DATA_FLOAT);
1964   micro_mul(&arg[2], &arg[0], &arg[1]);
1965
1966   for (chan = CHAN_Y; chan <= CHAN_W; chan++) {
1967      fetch_source(mach, &arg[0], &inst->Src[0], chan, TGSI_EXEC_DATA_FLOAT);
1968      fetch_source(mach, &arg[1], &inst->Src[1], chan, TGSI_EXEC_DATA_FLOAT);
1969      micro_mad(&arg[2], &arg[0], &arg[1], &arg[2]);
1970   }
1971
1972   for (chan = 0; chan < NUM_CHANNELS; chan++) {
1973      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
1974         store_dest(mach, &arg[2], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
1975      }
1976   }
1977}
1978
1979static void
1980exec_dp2a(struct tgsi_exec_machine *mach,
1981          const struct tgsi_full_instruction *inst)
1982{
1983   unsigned int chan;
1984   union tgsi_exec_channel arg[3];
1985
1986   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
1987   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_X, TGSI_EXEC_DATA_FLOAT);
1988   micro_mul(&arg[2], &arg[0], &arg[1]);
1989
1990   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
1991   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
1992   micro_mad(&arg[0], &arg[0], &arg[1], &arg[2]);
1993
1994   fetch_source(mach, &arg[1], &inst->Src[2], CHAN_X, TGSI_EXEC_DATA_FLOAT);
1995   micro_add(&arg[0], &arg[0], &arg[1]);
1996
1997   for (chan = 0; chan < NUM_CHANNELS; chan++) {
1998      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
1999         store_dest(mach, &arg[0], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2000      }
2001   }
2002}
2003
2004static void
2005exec_dph(struct tgsi_exec_machine *mach,
2006         const struct tgsi_full_instruction *inst)
2007{
2008   unsigned int chan;
2009   union tgsi_exec_channel arg[3];
2010
2011   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2012   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2013   micro_mul(&arg[2], &arg[0], &arg[1]);
2014
2015   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2016   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2017   micro_mad(&arg[2], &arg[0], &arg[1], &arg[2]);
2018
2019   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2020   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2021   micro_mad(&arg[0], &arg[0], &arg[1], &arg[2]);
2022
2023   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_W, TGSI_EXEC_DATA_FLOAT);
2024   micro_add(&arg[0], &arg[0], &arg[1]);
2025
2026   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2027      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2028         store_dest(mach, &arg[0], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2029      }
2030   }
2031}
2032
2033static void
2034exec_dp2(struct tgsi_exec_machine *mach,
2035         const struct tgsi_full_instruction *inst)
2036{
2037   unsigned int chan;
2038   union tgsi_exec_channel arg[3];
2039
2040   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2041   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2042   micro_mul(&arg[2], &arg[0], &arg[1]);
2043
2044   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2045   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2046   micro_mad(&arg[2], &arg[0], &arg[1], &arg[2]);
2047
2048   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2049      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2050         store_dest(mach, &arg[2], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2051      }
2052   }
2053}
2054
2055static void
2056exec_nrm4(struct tgsi_exec_machine *mach,
2057          const struct tgsi_full_instruction *inst)
2058{
2059   unsigned int chan;
2060   union tgsi_exec_channel arg[4];
2061   union tgsi_exec_channel scale;
2062
2063   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2064   micro_mul(&scale, &arg[0], &arg[0]);
2065
2066   for (chan = CHAN_Y; chan <= CHAN_W; chan++) {
2067      union tgsi_exec_channel product;
2068
2069      fetch_source(mach, &arg[chan], &inst->Src[0], chan, TGSI_EXEC_DATA_FLOAT);
2070      micro_mul(&product, &arg[chan], &arg[chan]);
2071      micro_add(&scale, &scale, &product);
2072   }
2073
2074   micro_rsq(&scale, &scale);
2075
2076   for (chan = CHAN_X; chan <= CHAN_W; chan++) {
2077      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2078         micro_mul(&arg[chan], &arg[chan], &scale);
2079         store_dest(mach, &arg[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2080      }
2081   }
2082}
2083
2084static void
2085exec_nrm3(struct tgsi_exec_machine *mach,
2086          const struct tgsi_full_instruction *inst)
2087{
2088   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_XYZ) {
2089      unsigned int chan;
2090      union tgsi_exec_channel arg[3];
2091      union tgsi_exec_channel scale;
2092
2093      fetch_source(mach, &arg[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2094      micro_mul(&scale, &arg[0], &arg[0]);
2095
2096      for (chan = CHAN_Y; chan <= CHAN_Z; chan++) {
2097         union tgsi_exec_channel product;
2098
2099         fetch_source(mach, &arg[chan], &inst->Src[0], chan, TGSI_EXEC_DATA_FLOAT);
2100         micro_mul(&product, &arg[chan], &arg[chan]);
2101         micro_add(&scale, &scale, &product);
2102      }
2103
2104      micro_rsq(&scale, &scale);
2105
2106      for (chan = CHAN_X; chan <= CHAN_Z; chan++) {
2107         if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2108            micro_mul(&arg[chan], &arg[chan], &scale);
2109            store_dest(mach, &arg[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2110         }
2111      }
2112   }
2113
2114   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
2115      store_dest(mach, &OneVec, &inst->Dst[0], inst, CHAN_W, TGSI_EXEC_DATA_FLOAT);
2116   }
2117}
2118
2119static void
2120exec_break(struct tgsi_exec_machine *mach)
2121{
2122   if (mach->BreakType == TGSI_EXEC_BREAK_INSIDE_LOOP) {
2123      /* turn off loop channels for each enabled exec channel */
2124      mach->LoopMask &= ~mach->ExecMask;
2125      /* Todo: if mach->LoopMask == 0, jump to end of loop */
2126      UPDATE_EXEC_MASK(mach);
2127   } else {
2128      assert(mach->BreakType == TGSI_EXEC_BREAK_INSIDE_SWITCH);
2129
2130      mach->Switch.mask = 0x0;
2131
2132      UPDATE_EXEC_MASK(mach);
2133   }
2134}
2135
2136static void
2137exec_switch(struct tgsi_exec_machine *mach,
2138            const struct tgsi_full_instruction *inst)
2139{
2140   assert(mach->SwitchStackTop < TGSI_EXEC_MAX_SWITCH_NESTING);
2141   assert(mach->BreakStackTop < TGSI_EXEC_MAX_BREAK_STACK);
2142
2143   mach->SwitchStack[mach->SwitchStackTop++] = mach->Switch;
2144   fetch_source(mach, &mach->Switch.selector, &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_UINT);
2145   mach->Switch.mask = 0x0;
2146   mach->Switch.defaultMask = 0x0;
2147
2148   mach->BreakStack[mach->BreakStackTop++] = mach->BreakType;
2149   mach->BreakType = TGSI_EXEC_BREAK_INSIDE_SWITCH;
2150
2151   UPDATE_EXEC_MASK(mach);
2152}
2153
2154static void
2155exec_case(struct tgsi_exec_machine *mach,
2156          const struct tgsi_full_instruction *inst)
2157{
2158   uint prevMask = mach->SwitchStack[mach->SwitchStackTop - 1].mask;
2159   union tgsi_exec_channel src;
2160   uint mask = 0;
2161
2162   fetch_source(mach, &src, &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_UINT);
2163
2164   if (mach->Switch.selector.u[0] == src.u[0]) {
2165      mask |= 0x1;
2166   }
2167   if (mach->Switch.selector.u[1] == src.u[1]) {
2168      mask |= 0x2;
2169   }
2170   if (mach->Switch.selector.u[2] == src.u[2]) {
2171      mask |= 0x4;
2172   }
2173   if (mach->Switch.selector.u[3] == src.u[3]) {
2174      mask |= 0x8;
2175   }
2176
2177   mach->Switch.defaultMask |= mask;
2178
2179   mach->Switch.mask |= mask & prevMask;
2180
2181   UPDATE_EXEC_MASK(mach);
2182}
2183
2184static void
2185exec_default(struct tgsi_exec_machine *mach)
2186{
2187   uint prevMask = mach->SwitchStack[mach->SwitchStackTop - 1].mask;
2188
2189   mach->Switch.mask |= ~mach->Switch.defaultMask & prevMask;
2190
2191   UPDATE_EXEC_MASK(mach);
2192}
2193
2194static void
2195exec_endswitch(struct tgsi_exec_machine *mach)
2196{
2197   mach->Switch = mach->SwitchStack[--mach->SwitchStackTop];
2198   mach->BreakType = mach->BreakStack[--mach->BreakStackTop];
2199
2200   UPDATE_EXEC_MASK(mach);
2201}
2202
2203static void
2204micro_i2f(union tgsi_exec_channel *dst,
2205          const union tgsi_exec_channel *src)
2206{
2207   dst->f[0] = (float)src->i[0];
2208   dst->f[1] = (float)src->i[1];
2209   dst->f[2] = (float)src->i[2];
2210   dst->f[3] = (float)src->i[3];
2211}
2212
2213static void
2214micro_not(union tgsi_exec_channel *dst,
2215          const union tgsi_exec_channel *src)
2216{
2217   dst->u[0] = ~src->u[0];
2218   dst->u[1] = ~src->u[1];
2219   dst->u[2] = ~src->u[2];
2220   dst->u[3] = ~src->u[3];
2221}
2222
2223static void
2224micro_shl(union tgsi_exec_channel *dst,
2225          const union tgsi_exec_channel *src0,
2226          const union tgsi_exec_channel *src1)
2227{
2228   dst->u[0] = src0->u[0] << src1->u[0];
2229   dst->u[1] = src0->u[1] << src1->u[1];
2230   dst->u[2] = src0->u[2] << src1->u[2];
2231   dst->u[3] = src0->u[3] << src1->u[3];
2232}
2233
2234static void
2235micro_and(union tgsi_exec_channel *dst,
2236          const union tgsi_exec_channel *src0,
2237          const union tgsi_exec_channel *src1)
2238{
2239   dst->u[0] = src0->u[0] & src1->u[0];
2240   dst->u[1] = src0->u[1] & src1->u[1];
2241   dst->u[2] = src0->u[2] & src1->u[2];
2242   dst->u[3] = src0->u[3] & src1->u[3];
2243}
2244
2245static void
2246micro_or(union tgsi_exec_channel *dst,
2247         const union tgsi_exec_channel *src0,
2248         const union tgsi_exec_channel *src1)
2249{
2250   dst->u[0] = src0->u[0] | src1->u[0];
2251   dst->u[1] = src0->u[1] | src1->u[1];
2252   dst->u[2] = src0->u[2] | src1->u[2];
2253   dst->u[3] = src0->u[3] | src1->u[3];
2254}
2255
2256static void
2257micro_xor(union tgsi_exec_channel *dst,
2258          const union tgsi_exec_channel *src0,
2259          const union tgsi_exec_channel *src1)
2260{
2261   dst->u[0] = src0->u[0] ^ src1->u[0];
2262   dst->u[1] = src0->u[1] ^ src1->u[1];
2263   dst->u[2] = src0->u[2] ^ src1->u[2];
2264   dst->u[3] = src0->u[3] ^ src1->u[3];
2265}
2266
2267static void
2268micro_f2i(union tgsi_exec_channel *dst,
2269          const union tgsi_exec_channel *src)
2270{
2271   dst->i[0] = (int)src->f[0];
2272   dst->i[1] = (int)src->f[1];
2273   dst->i[2] = (int)src->f[2];
2274   dst->i[3] = (int)src->f[3];
2275}
2276
2277static void
2278micro_idiv(union tgsi_exec_channel *dst,
2279           const union tgsi_exec_channel *src0,
2280           const union tgsi_exec_channel *src1)
2281{
2282   dst->i[0] = src0->i[0] / src1->i[0];
2283   dst->i[1] = src0->i[1] / src1->i[1];
2284   dst->i[2] = src0->i[2] / src1->i[2];
2285   dst->i[3] = src0->i[3] / src1->i[3];
2286}
2287
2288static void
2289micro_imax(union tgsi_exec_channel *dst,
2290           const union tgsi_exec_channel *src0,
2291           const union tgsi_exec_channel *src1)
2292{
2293   dst->i[0] = src0->i[0] > src1->i[0] ? src0->i[0] : src1->i[0];
2294   dst->i[1] = src0->i[1] > src1->i[1] ? src0->i[1] : src1->i[1];
2295   dst->i[2] = src0->i[2] > src1->i[2] ? src0->i[2] : src1->i[2];
2296   dst->i[3] = src0->i[3] > src1->i[3] ? src0->i[3] : src1->i[3];
2297}
2298
2299static void
2300micro_imin(union tgsi_exec_channel *dst,
2301           const union tgsi_exec_channel *src0,
2302           const union tgsi_exec_channel *src1)
2303{
2304   dst->i[0] = src0->i[0] < src1->i[0] ? src0->i[0] : src1->i[0];
2305   dst->i[1] = src0->i[1] < src1->i[1] ? src0->i[1] : src1->i[1];
2306   dst->i[2] = src0->i[2] < src1->i[2] ? src0->i[2] : src1->i[2];
2307   dst->i[3] = src0->i[3] < src1->i[3] ? src0->i[3] : src1->i[3];
2308}
2309
2310static void
2311micro_isge(union tgsi_exec_channel *dst,
2312           const union tgsi_exec_channel *src0,
2313           const union tgsi_exec_channel *src1)
2314{
2315   dst->i[0] = src0->i[0] >= src1->i[0] ? -1 : 0;
2316   dst->i[1] = src0->i[1] >= src1->i[1] ? -1 : 0;
2317   dst->i[2] = src0->i[2] >= src1->i[2] ? -1 : 0;
2318   dst->i[3] = src0->i[3] >= src1->i[3] ? -1 : 0;
2319}
2320
2321static void
2322micro_ishr(union tgsi_exec_channel *dst,
2323           const union tgsi_exec_channel *src0,
2324           const union tgsi_exec_channel *src1)
2325{
2326   dst->i[0] = src0->i[0] >> src1->i[0];
2327   dst->i[1] = src0->i[1] >> src1->i[1];
2328   dst->i[2] = src0->i[2] >> src1->i[2];
2329   dst->i[3] = src0->i[3] >> src1->i[3];
2330}
2331
2332static void
2333micro_islt(union tgsi_exec_channel *dst,
2334           const union tgsi_exec_channel *src0,
2335           const union tgsi_exec_channel *src1)
2336{
2337   dst->i[0] = src0->i[0] < src1->i[0] ? -1 : 0;
2338   dst->i[1] = src0->i[1] < src1->i[1] ? -1 : 0;
2339   dst->i[2] = src0->i[2] < src1->i[2] ? -1 : 0;
2340   dst->i[3] = src0->i[3] < src1->i[3] ? -1 : 0;
2341}
2342
2343static void
2344micro_f2u(union tgsi_exec_channel *dst,
2345          const union tgsi_exec_channel *src)
2346{
2347   dst->u[0] = (uint)src->f[0];
2348   dst->u[1] = (uint)src->f[1];
2349   dst->u[2] = (uint)src->f[2];
2350   dst->u[3] = (uint)src->f[3];
2351}
2352
2353static void
2354micro_u2f(union tgsi_exec_channel *dst,
2355          const union tgsi_exec_channel *src)
2356{
2357   dst->f[0] = (float)src->u[0];
2358   dst->f[1] = (float)src->u[1];
2359   dst->f[2] = (float)src->u[2];
2360   dst->f[3] = (float)src->u[3];
2361}
2362
2363static void
2364micro_uadd(union tgsi_exec_channel *dst,
2365           const union tgsi_exec_channel *src0,
2366           const union tgsi_exec_channel *src1)
2367{
2368   dst->u[0] = src0->u[0] + src1->u[0];
2369   dst->u[1] = src0->u[1] + src1->u[1];
2370   dst->u[2] = src0->u[2] + src1->u[2];
2371   dst->u[3] = src0->u[3] + src1->u[3];
2372}
2373
2374static void
2375micro_udiv(union tgsi_exec_channel *dst,
2376           const union tgsi_exec_channel *src0,
2377           const union tgsi_exec_channel *src1)
2378{
2379   dst->u[0] = src0->u[0] / src1->u[0];
2380   dst->u[1] = src0->u[1] / src1->u[1];
2381   dst->u[2] = src0->u[2] / src1->u[2];
2382   dst->u[3] = src0->u[3] / src1->u[3];
2383}
2384
2385static void
2386micro_umad(union tgsi_exec_channel *dst,
2387           const union tgsi_exec_channel *src0,
2388           const union tgsi_exec_channel *src1,
2389           const union tgsi_exec_channel *src2)
2390{
2391   dst->u[0] = src0->u[0] * src1->u[0] + src2->u[0];
2392   dst->u[1] = src0->u[1] * src1->u[1] + src2->u[1];
2393   dst->u[2] = src0->u[2] * src1->u[2] + src2->u[2];
2394   dst->u[3] = src0->u[3] * src1->u[3] + src2->u[3];
2395}
2396
2397static void
2398micro_umax(union tgsi_exec_channel *dst,
2399           const union tgsi_exec_channel *src0,
2400           const union tgsi_exec_channel *src1)
2401{
2402   dst->u[0] = src0->u[0] > src1->u[0] ? src0->u[0] : src1->u[0];
2403   dst->u[1] = src0->u[1] > src1->u[1] ? src0->u[1] : src1->u[1];
2404   dst->u[2] = src0->u[2] > src1->u[2] ? src0->u[2] : src1->u[2];
2405   dst->u[3] = src0->u[3] > src1->u[3] ? src0->u[3] : src1->u[3];
2406}
2407
2408static void
2409micro_umin(union tgsi_exec_channel *dst,
2410           const union tgsi_exec_channel *src0,
2411           const union tgsi_exec_channel *src1)
2412{
2413   dst->u[0] = src0->u[0] < src1->u[0] ? src0->u[0] : src1->u[0];
2414   dst->u[1] = src0->u[1] < src1->u[1] ? src0->u[1] : src1->u[1];
2415   dst->u[2] = src0->u[2] < src1->u[2] ? src0->u[2] : src1->u[2];
2416   dst->u[3] = src0->u[3] < src1->u[3] ? src0->u[3] : src1->u[3];
2417}
2418
2419static void
2420micro_umod(union tgsi_exec_channel *dst,
2421           const union tgsi_exec_channel *src0,
2422           const union tgsi_exec_channel *src1)
2423{
2424   dst->u[0] = src0->u[0] % src1->u[0];
2425   dst->u[1] = src0->u[1] % src1->u[1];
2426   dst->u[2] = src0->u[2] % src1->u[2];
2427   dst->u[3] = src0->u[3] % src1->u[3];
2428}
2429
2430static void
2431micro_umul(union tgsi_exec_channel *dst,
2432           const union tgsi_exec_channel *src0,
2433           const union tgsi_exec_channel *src1)
2434{
2435   dst->u[0] = src0->u[0] * src1->u[0];
2436   dst->u[1] = src0->u[1] * src1->u[1];
2437   dst->u[2] = src0->u[2] * src1->u[2];
2438   dst->u[3] = src0->u[3] * src1->u[3];
2439}
2440
2441static void
2442micro_useq(union tgsi_exec_channel *dst,
2443           const union tgsi_exec_channel *src0,
2444           const union tgsi_exec_channel *src1)
2445{
2446   dst->u[0] = src0->u[0] == src1->u[0] ? ~0 : 0;
2447   dst->u[1] = src0->u[1] == src1->u[1] ? ~0 : 0;
2448   dst->u[2] = src0->u[2] == src1->u[2] ? ~0 : 0;
2449   dst->u[3] = src0->u[3] == src1->u[3] ? ~0 : 0;
2450}
2451
2452static void
2453micro_usge(union tgsi_exec_channel *dst,
2454           const union tgsi_exec_channel *src0,
2455           const union tgsi_exec_channel *src1)
2456{
2457   dst->u[0] = src0->u[0] >= src1->u[0] ? ~0 : 0;
2458   dst->u[1] = src0->u[1] >= src1->u[1] ? ~0 : 0;
2459   dst->u[2] = src0->u[2] >= src1->u[2] ? ~0 : 0;
2460   dst->u[3] = src0->u[3] >= src1->u[3] ? ~0 : 0;
2461}
2462
2463static void
2464micro_ushr(union tgsi_exec_channel *dst,
2465           const union tgsi_exec_channel *src0,
2466           const union tgsi_exec_channel *src1)
2467{
2468   dst->u[0] = src0->u[0] >> src1->u[0];
2469   dst->u[1] = src0->u[1] >> src1->u[1];
2470   dst->u[2] = src0->u[2] >> src1->u[2];
2471   dst->u[3] = src0->u[3] >> src1->u[3];
2472}
2473
2474static void
2475micro_uslt(union tgsi_exec_channel *dst,
2476           const union tgsi_exec_channel *src0,
2477           const union tgsi_exec_channel *src1)
2478{
2479   dst->u[0] = src0->u[0] < src1->u[0] ? ~0 : 0;
2480   dst->u[1] = src0->u[1] < src1->u[1] ? ~0 : 0;
2481   dst->u[2] = src0->u[2] < src1->u[2] ? ~0 : 0;
2482   dst->u[3] = src0->u[3] < src1->u[3] ? ~0 : 0;
2483}
2484
2485static void
2486micro_usne(union tgsi_exec_channel *dst,
2487           const union tgsi_exec_channel *src0,
2488           const union tgsi_exec_channel *src1)
2489{
2490   dst->u[0] = src0->u[0] != src1->u[0] ? ~0 : 0;
2491   dst->u[1] = src0->u[1] != src1->u[1] ? ~0 : 0;
2492   dst->u[2] = src0->u[2] != src1->u[2] ? ~0 : 0;
2493   dst->u[3] = src0->u[3] != src1->u[3] ? ~0 : 0;
2494}
2495
2496static void
2497exec_instruction(
2498   struct tgsi_exec_machine *mach,
2499   const struct tgsi_full_instruction *inst,
2500   int *pc )
2501{
2502   uint chan_index;
2503   union tgsi_exec_channel r[10];
2504   union tgsi_exec_channel d[8];
2505
2506   (*pc)++;
2507
2508   switch (inst->Instruction.Opcode) {
2509   case TGSI_OPCODE_ARL:
2510      exec_vector_unary(mach, inst, micro_arl, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_FLOAT);
2511      break;
2512
2513   case TGSI_OPCODE_MOV:
2514      exec_vector_unary(mach, inst, micro_mov, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_FLOAT);
2515      break;
2516
2517   case TGSI_OPCODE_LIT:
2518      if (IS_CHANNEL_ENABLED( *inst, CHAN_Y ) || IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
2519         FETCH( &r[0], 0, CHAN_X );
2520         if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
2521            micro_max(&d[CHAN_Y], &r[0], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C]);
2522         }
2523
2524         if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
2525            FETCH( &r[1], 0, CHAN_Y );
2526            micro_max( &r[1], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] );
2527
2528            FETCH( &r[2], 0, CHAN_W );
2529            micro_min( &r[2], &r[2], &mach->Temps[TEMP_128_I].xyzw[TEMP_128_C] );
2530            micro_max( &r[2], &r[2], &mach->Temps[TEMP_M128_I].xyzw[TEMP_M128_C] );
2531            micro_pow( &r[1], &r[1], &r[2] );
2532            micro_lt(&d[CHAN_Z], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &r[0], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C]);
2533         }
2534
2535         if (IS_CHANNEL_ENABLED(*inst, CHAN_Y)) {
2536            STORE(&d[CHAN_Y], 0, CHAN_Y);
2537         }
2538         if (IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2539            STORE(&d[CHAN_Z], 0, CHAN_Z);
2540         }
2541      }
2542      if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) {
2543         STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_X );
2544      }
2545      if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
2546         STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
2547      }
2548      break;
2549
2550   case TGSI_OPCODE_RCP:
2551      exec_scalar_unary(mach, inst, micro_rcp, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2552      break;
2553
2554   case TGSI_OPCODE_RSQ:
2555      exec_scalar_unary(mach, inst, micro_rsq, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2556      break;
2557
2558   case TGSI_OPCODE_EXP:
2559      FETCH( &r[0], 0, CHAN_X );
2560      micro_flr( &r[1], &r[0] );  /* r1 = floor(r0) */
2561      if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) {
2562         micro_exp2( &r[2], &r[1] );       /* r2 = 2 ^ r1 */
2563         STORE( &r[2], 0, CHAN_X );        /* store r2 */
2564      }
2565      if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
2566         micro_sub( &r[2], &r[0], &r[1] ); /* r2 = r0 - r1 */
2567         STORE( &r[2], 0, CHAN_Y );        /* store r2 */
2568      }
2569      if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
2570         micro_exp2( &r[2], &r[0] );       /* r2 = 2 ^ r0 */
2571         STORE( &r[2], 0, CHAN_Z );        /* store r2 */
2572      }
2573      if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
2574         STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
2575      }
2576      break;
2577
2578   case TGSI_OPCODE_LOG:
2579      FETCH( &r[0], 0, CHAN_X );
2580      micro_abs( &r[2], &r[0] );  /* r2 = abs(r0) */
2581      micro_lg2( &r[1], &r[2] );  /* r1 = lg2(r2) */
2582      micro_flr( &r[0], &r[1] );  /* r0 = floor(r1) */
2583      if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) {
2584         STORE( &r[0], 0, CHAN_X );
2585      }
2586      if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
2587         micro_exp2( &r[0], &r[0] );       /* r0 = 2 ^ r0 */
2588         micro_div( &r[0], &r[2], &r[0] ); /* r0 = r2 / r0 */
2589         STORE( &r[0], 0, CHAN_Y );
2590      }
2591      if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
2592         STORE( &r[1], 0, CHAN_Z );
2593      }
2594      if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
2595         STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
2596      }
2597      break;
2598
2599   case TGSI_OPCODE_MUL:
2600      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2601         FETCH(&r[0], 0, chan_index);
2602         FETCH(&r[1], 1, chan_index);
2603         micro_mul(&d[chan_index], &r[0], &r[1]);
2604      }
2605      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2606         STORE(&d[chan_index], 0, chan_index);
2607      }
2608      break;
2609
2610   case TGSI_OPCODE_ADD:
2611      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2612         FETCH( &r[0], 0, chan_index );
2613         FETCH( &r[1], 1, chan_index );
2614         micro_add(&d[chan_index], &r[0], &r[1]);
2615      }
2616      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2617         STORE(&d[chan_index], 0, chan_index);
2618      }
2619      break;
2620
2621   case TGSI_OPCODE_DP3:
2622      exec_dp3(mach, inst);
2623      break;
2624
2625   case TGSI_OPCODE_DP4:
2626      exec_dp4(mach, inst);
2627      break;
2628
2629   case TGSI_OPCODE_DST:
2630      if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
2631         FETCH( &r[0], 0, CHAN_Y );
2632         FETCH( &r[1], 1, CHAN_Y);
2633         micro_mul(&d[CHAN_Y], &r[0], &r[1]);
2634      }
2635      if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
2636         FETCH(&d[CHAN_Z], 0, CHAN_Z);
2637      }
2638      if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
2639         FETCH(&d[CHAN_W], 1, CHAN_W);
2640      }
2641
2642      if (IS_CHANNEL_ENABLED(*inst, CHAN_X)) {
2643         STORE(&mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_X);
2644      }
2645      if (IS_CHANNEL_ENABLED(*inst, CHAN_Y)) {
2646         STORE(&d[CHAN_Y], 0, CHAN_Y);
2647      }
2648      if (IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2649         STORE(&d[CHAN_Z], 0, CHAN_Z);
2650      }
2651      if (IS_CHANNEL_ENABLED(*inst, CHAN_W)) {
2652         STORE(&d[CHAN_W], 0, CHAN_W);
2653      }
2654      break;
2655
2656   case TGSI_OPCODE_MIN:
2657      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2658         FETCH(&r[0], 0, chan_index);
2659         FETCH(&r[1], 1, chan_index);
2660
2661         /* XXX use micro_min()?? */
2662         micro_lt(&d[chan_index], &r[0], &r[1], &r[0], &r[1]);
2663      }
2664      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2665         STORE(&d[chan_index], 0, chan_index);
2666      }
2667      break;
2668
2669   case TGSI_OPCODE_MAX:
2670      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2671         FETCH(&r[0], 0, chan_index);
2672         FETCH(&r[1], 1, chan_index);
2673
2674         /* XXX use micro_max()?? */
2675         micro_lt(&d[chan_index], &r[0], &r[1], &r[1], &r[0] );
2676      }
2677      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2678         STORE(&d[chan_index], 0, chan_index);
2679      }
2680      break;
2681
2682   case TGSI_OPCODE_SLT:
2683      exec_vector_binary(mach, inst, micro_slt, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2684      break;
2685
2686   case TGSI_OPCODE_SGE:
2687      exec_vector_binary(mach, inst, micro_sge, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2688      break;
2689
2690   case TGSI_OPCODE_MAD:
2691      exec_vector_trinary(mach, inst, micro_mad, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2692      break;
2693
2694   case TGSI_OPCODE_SUB:
2695      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2696         FETCH(&r[0], 0, chan_index);
2697         FETCH(&r[1], 1, chan_index);
2698         micro_sub(&d[chan_index], &r[0], &r[1]);
2699      }
2700      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2701         STORE(&d[chan_index], 0, chan_index);
2702      }
2703      break;
2704
2705   case TGSI_OPCODE_LRP:
2706      exec_vector_trinary(mach, inst, micro_lrp, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2707      break;
2708
2709   case TGSI_OPCODE_CND:
2710      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2711         FETCH(&r[0], 0, chan_index);
2712         FETCH(&r[1], 1, chan_index);
2713         FETCH(&r[2], 2, chan_index);
2714         micro_lt(&d[chan_index], &mach->Temps[TEMP_HALF_I].xyzw[TEMP_HALF_C], &r[2], &r[0], &r[1]);
2715      }
2716      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2717         STORE(&d[chan_index], 0, chan_index);
2718      }
2719      break;
2720
2721   case TGSI_OPCODE_DP2A:
2722      exec_dp2a(mach, inst);
2723      break;
2724
2725   case TGSI_OPCODE_FRC:
2726      exec_vector_unary(mach, inst, micro_frc, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2727      break;
2728
2729   case TGSI_OPCODE_CLAMP:
2730      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2731         FETCH(&r[0], 0, chan_index);
2732         FETCH(&r[1], 1, chan_index);
2733         micro_max(&r[0], &r[0], &r[1]);
2734         FETCH(&r[1], 2, chan_index);
2735         micro_min(&d[chan_index], &r[0], &r[1]);
2736      }
2737      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2738         STORE(&d[chan_index], 0, chan_index);
2739      }
2740      break;
2741
2742   case TGSI_OPCODE_FLR:
2743      exec_vector_unary(mach, inst, micro_flr, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2744      break;
2745
2746   case TGSI_OPCODE_ROUND:
2747      exec_vector_unary(mach, inst, micro_rnd, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2748      break;
2749
2750   case TGSI_OPCODE_EX2:
2751      exec_scalar_unary(mach, inst, micro_exp2, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2752      break;
2753
2754   case TGSI_OPCODE_LG2:
2755      exec_scalar_unary(mach, inst, micro_lg2, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2756      break;
2757
2758   case TGSI_OPCODE_POW:
2759      FETCH(&r[0], 0, CHAN_X);
2760      FETCH(&r[1], 1, CHAN_X);
2761
2762      micro_pow( &r[0], &r[0], &r[1] );
2763
2764      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2765         STORE( &r[0], 0, chan_index );
2766      }
2767      break;
2768
2769   case TGSI_OPCODE_XPD:
2770      FETCH(&r[0], 0, CHAN_Y);
2771      FETCH(&r[1], 1, CHAN_Z);
2772
2773      micro_mul( &r[2], &r[0], &r[1] );
2774
2775      FETCH(&r[3], 0, CHAN_Z);
2776      FETCH(&r[4], 1, CHAN_Y);
2777
2778      micro_mul( &r[5], &r[3], &r[4] );
2779      micro_sub(&d[CHAN_X], &r[2], &r[5]);
2780
2781      FETCH(&r[2], 1, CHAN_X);
2782
2783      micro_mul( &r[3], &r[3], &r[2] );
2784
2785      FETCH(&r[5], 0, CHAN_X);
2786
2787      micro_mul( &r[1], &r[1], &r[5] );
2788      micro_sub(&d[CHAN_Y], &r[3], &r[1]);
2789
2790      micro_mul( &r[5], &r[5], &r[4] );
2791      micro_mul( &r[0], &r[0], &r[2] );
2792      micro_sub(&d[CHAN_Z], &r[5], &r[0]);
2793
2794      if (IS_CHANNEL_ENABLED(*inst, CHAN_X)) {
2795         STORE(&d[CHAN_X], 0, CHAN_X);
2796      }
2797      if (IS_CHANNEL_ENABLED(*inst, CHAN_Y)) {
2798         STORE(&d[CHAN_Y], 0, CHAN_Y);
2799      }
2800      if (IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2801         STORE(&d[CHAN_Z], 0, CHAN_Z);
2802      }
2803      if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
2804         STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
2805      }
2806      break;
2807
2808   case TGSI_OPCODE_ABS:
2809      exec_vector_unary(mach, inst, micro_abs, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2810      break;
2811
2812   case TGSI_OPCODE_RCC:
2813      FETCH(&r[0], 0, CHAN_X);
2814      micro_div(&r[0], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &r[0]);
2815      micro_float_clamp(&r[0], &r[0]);
2816      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2817         STORE(&r[0], 0, chan_index);
2818      }
2819      break;
2820
2821   case TGSI_OPCODE_DPH:
2822      exec_dph(mach, inst);
2823      break;
2824
2825   case TGSI_OPCODE_COS:
2826      exec_scalar_unary(mach, inst, micro_cos, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2827      break;
2828
2829   case TGSI_OPCODE_DDX:
2830      exec_vector_unary(mach, inst, micro_ddx, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2831      break;
2832
2833   case TGSI_OPCODE_DDY:
2834      exec_vector_unary(mach, inst, micro_ddy, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2835      break;
2836
2837   case TGSI_OPCODE_KILP:
2838      exec_kilp (mach, inst);
2839      break;
2840
2841   case TGSI_OPCODE_KIL:
2842      exec_kil (mach, inst);
2843      break;
2844
2845   case TGSI_OPCODE_PK2H:
2846      assert (0);
2847      break;
2848
2849   case TGSI_OPCODE_PK2US:
2850      assert (0);
2851      break;
2852
2853   case TGSI_OPCODE_PK4B:
2854      assert (0);
2855      break;
2856
2857   case TGSI_OPCODE_PK4UB:
2858      assert (0);
2859      break;
2860
2861   case TGSI_OPCODE_RFL:
2862      if (IS_CHANNEL_ENABLED(*inst, CHAN_X) ||
2863          IS_CHANNEL_ENABLED(*inst, CHAN_Y) ||
2864          IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2865         /* r0 = dp3(src0, src0) */
2866         FETCH(&r[2], 0, CHAN_X);
2867         micro_mul(&r[0], &r[2], &r[2]);
2868         FETCH(&r[4], 0, CHAN_Y);
2869         micro_mul(&r[8], &r[4], &r[4]);
2870         micro_add(&r[0], &r[0], &r[8]);
2871         FETCH(&r[6], 0, CHAN_Z);
2872         micro_mul(&r[8], &r[6], &r[6]);
2873         micro_add(&r[0], &r[0], &r[8]);
2874
2875         /* r1 = dp3(src0, src1) */
2876         FETCH(&r[3], 1, CHAN_X);
2877         micro_mul(&r[1], &r[2], &r[3]);
2878         FETCH(&r[5], 1, CHAN_Y);
2879         micro_mul(&r[8], &r[4], &r[5]);
2880         micro_add(&r[1], &r[1], &r[8]);
2881         FETCH(&r[7], 1, CHAN_Z);
2882         micro_mul(&r[8], &r[6], &r[7]);
2883         micro_add(&r[1], &r[1], &r[8]);
2884
2885         /* r1 = 2 * r1 / r0 */
2886         micro_add(&r[1], &r[1], &r[1]);
2887         micro_div(&r[1], &r[1], &r[0]);
2888
2889         if (IS_CHANNEL_ENABLED(*inst, CHAN_X)) {
2890            micro_mul(&r[2], &r[2], &r[1]);
2891            micro_sub(&r[2], &r[2], &r[3]);
2892            STORE(&r[2], 0, CHAN_X);
2893         }
2894         if (IS_CHANNEL_ENABLED(*inst, CHAN_Y)) {
2895            micro_mul(&r[4], &r[4], &r[1]);
2896            micro_sub(&r[4], &r[4], &r[5]);
2897            STORE(&r[4], 0, CHAN_Y);
2898         }
2899         if (IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2900            micro_mul(&r[6], &r[6], &r[1]);
2901            micro_sub(&r[6], &r[6], &r[7]);
2902            STORE(&r[6], 0, CHAN_Z);
2903         }
2904      }
2905      if (IS_CHANNEL_ENABLED(*inst, CHAN_W)) {
2906         STORE(&mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W);
2907      }
2908      break;
2909
2910   case TGSI_OPCODE_SEQ:
2911      exec_vector_binary(mach, inst, micro_seq, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2912      break;
2913
2914   case TGSI_OPCODE_SFL:
2915      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2916         STORE(&mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], 0, chan_index);
2917      }
2918      break;
2919
2920   case TGSI_OPCODE_SGT:
2921      exec_vector_binary(mach, inst, micro_sgt, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2922      break;
2923
2924   case TGSI_OPCODE_SIN:
2925      exec_scalar_unary(mach, inst, micro_sin, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2926      break;
2927
2928   case TGSI_OPCODE_SLE:
2929      exec_vector_binary(mach, inst, micro_sle, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2930      break;
2931
2932   case TGSI_OPCODE_SNE:
2933      exec_vector_binary(mach, inst, micro_sne, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2934      break;
2935
2936   case TGSI_OPCODE_STR:
2937      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2938         STORE(&mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, chan_index);
2939      }
2940      break;
2941
2942   case TGSI_OPCODE_TEX:
2943      /* simple texture lookup */
2944      /* src[0] = texcoord */
2945      /* src[1] = sampler unit */
2946      exec_tex(mach, inst, TEX_MODIFIER_NONE);
2947      break;
2948
2949   case TGSI_OPCODE_TXB:
2950      /* Texture lookup with lod bias */
2951      /* src[0] = texcoord (src[0].w = LOD bias) */
2952      /* src[1] = sampler unit */
2953      exec_tex(mach, inst, TEX_MODIFIER_LOD_BIAS);
2954      break;
2955
2956   case TGSI_OPCODE_TXD:
2957      /* Texture lookup with explict partial derivatives */
2958      /* src[0] = texcoord */
2959      /* src[1] = d[strq]/dx */
2960      /* src[2] = d[strq]/dy */
2961      /* src[3] = sampler unit */
2962      exec_txd(mach, inst);
2963      break;
2964
2965   case TGSI_OPCODE_TXL:
2966      /* Texture lookup with explit LOD */
2967      /* src[0] = texcoord (src[0].w = LOD) */
2968      /* src[1] = sampler unit */
2969      exec_tex(mach, inst, TEX_MODIFIER_EXPLICIT_LOD);
2970      break;
2971
2972   case TGSI_OPCODE_TXP:
2973      /* Texture lookup with projection */
2974      /* src[0] = texcoord (src[0].w = projection) */
2975      /* src[1] = sampler unit */
2976      exec_tex(mach, inst, TEX_MODIFIER_PROJECTED);
2977      break;
2978
2979   case TGSI_OPCODE_UP2H:
2980      assert (0);
2981      break;
2982
2983   case TGSI_OPCODE_UP2US:
2984      assert (0);
2985      break;
2986
2987   case TGSI_OPCODE_UP4B:
2988      assert (0);
2989      break;
2990
2991   case TGSI_OPCODE_UP4UB:
2992      assert (0);
2993      break;
2994
2995   case TGSI_OPCODE_X2D:
2996      FETCH(&r[0], 1, CHAN_X);
2997      FETCH(&r[1], 1, CHAN_Y);
2998      if (IS_CHANNEL_ENABLED(*inst, CHAN_X) ||
2999          IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
3000         FETCH(&r[2], 2, CHAN_X);
3001         micro_mul(&r[2], &r[2], &r[0]);
3002         FETCH(&r[3], 2, CHAN_Y);
3003         micro_mul(&r[3], &r[3], &r[1]);
3004         micro_add(&r[2], &r[2], &r[3]);
3005         FETCH(&r[3], 0, CHAN_X);
3006         micro_add(&d[CHAN_X], &r[2], &r[3]);
3007
3008      }
3009      if (IS_CHANNEL_ENABLED(*inst, CHAN_Y) ||
3010          IS_CHANNEL_ENABLED(*inst, CHAN_W)) {
3011         FETCH(&r[2], 2, CHAN_Z);
3012         micro_mul(&r[2], &r[2], &r[0]);
3013         FETCH(&r[3], 2, CHAN_W);
3014         micro_mul(&r[3], &r[3], &r[1]);
3015         micro_add(&r[2], &r[2], &r[3]);
3016         FETCH(&r[3], 0, CHAN_Y);
3017         micro_add(&d[CHAN_Y], &r[2], &r[3]);
3018
3019      }
3020      if (IS_CHANNEL_ENABLED(*inst, CHAN_X)) {
3021         STORE(&d[CHAN_X], 0, CHAN_X);
3022      }
3023      if (IS_CHANNEL_ENABLED(*inst, CHAN_Y)) {
3024         STORE(&d[CHAN_Y], 0, CHAN_Y);
3025      }
3026      if (IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
3027         STORE(&d[CHAN_X], 0, CHAN_Z);
3028      }
3029      if (IS_CHANNEL_ENABLED(*inst, CHAN_W)) {
3030         STORE(&d[CHAN_Y], 0, CHAN_W);
3031      }
3032      break;
3033
3034   case TGSI_OPCODE_ARA:
3035      assert (0);
3036      break;
3037
3038   case TGSI_OPCODE_ARR:
3039      exec_vector_unary(mach, inst, micro_arr, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_FLOAT);
3040      break;
3041
3042   case TGSI_OPCODE_BRA:
3043      assert (0);
3044      break;
3045
3046   case TGSI_OPCODE_CAL:
3047      /* skip the call if no execution channels are enabled */
3048      if (mach->ExecMask) {
3049         /* do the call */
3050
3051         /* First, record the depths of the execution stacks.
3052          * This is important for deeply nested/looped return statements.
3053          * We have to unwind the stacks by the correct amount.  For a
3054          * real code generator, we could determine the number of entries
3055          * to pop off each stack with simple static analysis and avoid
3056          * implementing this data structure at run time.
3057          */
3058         mach->CallStack[mach->CallStackTop].CondStackTop = mach->CondStackTop;
3059         mach->CallStack[mach->CallStackTop].LoopStackTop = mach->LoopStackTop;
3060         mach->CallStack[mach->CallStackTop].ContStackTop = mach->ContStackTop;
3061         mach->CallStack[mach->CallStackTop].SwitchStackTop = mach->SwitchStackTop;
3062         mach->CallStack[mach->CallStackTop].BreakStackTop = mach->BreakStackTop;
3063         /* note that PC was already incremented above */
3064         mach->CallStack[mach->CallStackTop].ReturnAddr = *pc;
3065
3066         mach->CallStackTop++;
3067
3068         /* Second, push the Cond, Loop, Cont, Func stacks */
3069         assert(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING);
3070         assert(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
3071         assert(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
3072         assert(mach->SwitchStackTop < TGSI_EXEC_MAX_SWITCH_NESTING);
3073         assert(mach->BreakStackTop < TGSI_EXEC_MAX_BREAK_STACK);
3074         assert(mach->FuncStackTop < TGSI_EXEC_MAX_CALL_NESTING);
3075
3076         mach->CondStack[mach->CondStackTop++] = mach->CondMask;
3077         mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask;
3078         mach->ContStack[mach->ContStackTop++] = mach->ContMask;
3079         mach->SwitchStack[mach->SwitchStackTop++] = mach->Switch;
3080         mach->BreakStack[mach->BreakStackTop++] = mach->BreakType;
3081         mach->FuncStack[mach->FuncStackTop++] = mach->FuncMask;
3082
3083         /* Finally, jump to the subroutine */
3084         *pc = inst->Label.Label;
3085      }
3086      break;
3087
3088   case TGSI_OPCODE_RET:
3089      mach->FuncMask &= ~mach->ExecMask;
3090      UPDATE_EXEC_MASK(mach);
3091
3092      if (mach->FuncMask == 0x0) {
3093         /* really return now (otherwise, keep executing */
3094
3095         if (mach->CallStackTop == 0) {
3096            /* returning from main() */
3097            *pc = -1;
3098            return;
3099         }
3100
3101         assert(mach->CallStackTop > 0);
3102         mach->CallStackTop--;
3103
3104         mach->CondStackTop = mach->CallStack[mach->CallStackTop].CondStackTop;
3105         mach->CondMask = mach->CondStack[mach->CondStackTop];
3106
3107         mach->LoopStackTop = mach->CallStack[mach->CallStackTop].LoopStackTop;
3108         mach->LoopMask = mach->LoopStack[mach->LoopStackTop];
3109
3110         mach->ContStackTop = mach->CallStack[mach->CallStackTop].ContStackTop;
3111         mach->ContMask = mach->ContStack[mach->ContStackTop];
3112
3113         mach->SwitchStackTop = mach->CallStack[mach->CallStackTop].SwitchStackTop;
3114         mach->Switch = mach->SwitchStack[mach->SwitchStackTop];
3115
3116         mach->BreakStackTop = mach->CallStack[mach->CallStackTop].BreakStackTop;
3117         mach->BreakType = mach->BreakStack[mach->BreakStackTop];
3118
3119         assert(mach->FuncStackTop > 0);
3120         mach->FuncMask = mach->FuncStack[--mach->FuncStackTop];
3121
3122         *pc = mach->CallStack[mach->CallStackTop].ReturnAddr;
3123
3124         UPDATE_EXEC_MASK(mach);
3125      }
3126      break;
3127
3128   case TGSI_OPCODE_SSG:
3129      exec_vector_unary(mach, inst, micro_sgn, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3130      break;
3131
3132   case TGSI_OPCODE_CMP:
3133      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
3134         FETCH(&r[0], 0, chan_index);
3135         FETCH(&r[1], 1, chan_index);
3136         FETCH(&r[2], 2, chan_index);
3137         micro_lt(&d[chan_index], &r[0], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &r[1], &r[2]);
3138      }
3139      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
3140         STORE(&d[chan_index], 0, chan_index);
3141      }
3142      break;
3143
3144   case TGSI_OPCODE_SCS:
3145      if( IS_CHANNEL_ENABLED( *inst, CHAN_X ) || IS_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
3146         FETCH( &r[0], 0, CHAN_X );
3147         if (IS_CHANNEL_ENABLED(*inst, CHAN_X)) {
3148            micro_cos(&r[1], &r[0]);
3149            STORE(&r[1], 0, CHAN_X);
3150         }
3151         if (IS_CHANNEL_ENABLED(*inst, CHAN_Y)) {
3152            micro_sin(&r[1], &r[0]);
3153            STORE(&r[1], 0, CHAN_Y);
3154         }
3155      }
3156      if( IS_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
3157         STORE( &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], 0, CHAN_Z );
3158      }
3159      if( IS_CHANNEL_ENABLED( *inst, CHAN_W ) ) {
3160         STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
3161      }
3162      break;
3163
3164   case TGSI_OPCODE_NRM:
3165      exec_nrm3(mach, inst);
3166      break;
3167
3168   case TGSI_OPCODE_NRM4:
3169      exec_nrm4(mach, inst);
3170      break;
3171
3172   case TGSI_OPCODE_DIV:
3173      assert( 0 );
3174      break;
3175
3176   case TGSI_OPCODE_DP2:
3177      exec_dp2(mach, inst);
3178      break;
3179
3180   case TGSI_OPCODE_IF:
3181      /* push CondMask */
3182      assert(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING);
3183      mach->CondStack[mach->CondStackTop++] = mach->CondMask;
3184      FETCH( &r[0], 0, CHAN_X );
3185      /* update CondMask */
3186      if( ! r[0].u[0] ) {
3187         mach->CondMask &= ~0x1;
3188      }
3189      if( ! r[0].u[1] ) {
3190         mach->CondMask &= ~0x2;
3191      }
3192      if( ! r[0].u[2] ) {
3193         mach->CondMask &= ~0x4;
3194      }
3195      if( ! r[0].u[3] ) {
3196         mach->CondMask &= ~0x8;
3197      }
3198      UPDATE_EXEC_MASK(mach);
3199      /* Todo: If CondMask==0, jump to ELSE */
3200      break;
3201
3202   case TGSI_OPCODE_ELSE:
3203      /* invert CondMask wrt previous mask */
3204      {
3205         uint prevMask;
3206         assert(mach->CondStackTop > 0);
3207         prevMask = mach->CondStack[mach->CondStackTop - 1];
3208         mach->CondMask = ~mach->CondMask & prevMask;
3209         UPDATE_EXEC_MASK(mach);
3210         /* Todo: If CondMask==0, jump to ENDIF */
3211      }
3212      break;
3213
3214   case TGSI_OPCODE_ENDIF:
3215      /* pop CondMask */
3216      assert(mach->CondStackTop > 0);
3217      mach->CondMask = mach->CondStack[--mach->CondStackTop];
3218      UPDATE_EXEC_MASK(mach);
3219      break;
3220
3221   case TGSI_OPCODE_END:
3222      /* halt execution */
3223      *pc = -1;
3224      break;
3225
3226   case TGSI_OPCODE_REP:
3227      assert (0);
3228      break;
3229
3230   case TGSI_OPCODE_ENDREP:
3231       assert (0);
3232       break;
3233
3234   case TGSI_OPCODE_PUSHA:
3235      assert (0);
3236      break;
3237
3238   case TGSI_OPCODE_POPA:
3239      assert (0);
3240      break;
3241
3242   case TGSI_OPCODE_CEIL:
3243      exec_vector_unary(mach, inst, micro_ceil, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3244      break;
3245
3246   case TGSI_OPCODE_I2F:
3247      exec_vector_unary(mach, inst, micro_i2f, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_INT);
3248      break;
3249
3250   case TGSI_OPCODE_NOT:
3251      exec_vector_unary(mach, inst, micro_not, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3252      break;
3253
3254   case TGSI_OPCODE_TRUNC:
3255      exec_vector_unary(mach, inst, micro_trunc, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3256      break;
3257
3258   case TGSI_OPCODE_SHL:
3259      exec_vector_binary(mach, inst, micro_shl, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3260      break;
3261
3262   case TGSI_OPCODE_AND:
3263      exec_vector_binary(mach, inst, micro_and, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3264      break;
3265
3266   case TGSI_OPCODE_OR:
3267      exec_vector_binary(mach, inst, micro_or, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3268      break;
3269
3270   case TGSI_OPCODE_MOD:
3271      assert (0);
3272      break;
3273
3274   case TGSI_OPCODE_XOR:
3275      exec_vector_binary(mach, inst, micro_xor, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3276      break;
3277
3278   case TGSI_OPCODE_SAD:
3279      assert (0);
3280      break;
3281
3282   case TGSI_OPCODE_TXF:
3283      assert (0);
3284      break;
3285
3286   case TGSI_OPCODE_TXQ:
3287      assert (0);
3288      break;
3289
3290   case TGSI_OPCODE_EMIT:
3291      emit_vertex(mach);
3292      break;
3293
3294   case TGSI_OPCODE_ENDPRIM:
3295      emit_primitive(mach);
3296      break;
3297
3298   case TGSI_OPCODE_BGNFOR:
3299      assert(mach->LoopCounterStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
3300      for (chan_index = 0; chan_index < 3; chan_index++) {
3301         FETCH( &mach->LoopCounterStack[mach->LoopCounterStackTop].xyzw[chan_index], 0, chan_index );
3302      }
3303      ++mach->LoopCounterStackTop;
3304      STORE(&mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_X], 0, CHAN_X);
3305      /* update LoopMask */
3306      if (mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y].f[0] <= 0.0f) {
3307         mach->LoopMask &= ~0x1;
3308      }
3309      if (mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y].f[1] <= 0.0f) {
3310         mach->LoopMask &= ~0x2;
3311      }
3312      if (mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y].f[2] <= 0.0f) {
3313         mach->LoopMask &= ~0x4;
3314      }
3315      if (mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y].f[3] <= 0.0f) {
3316         mach->LoopMask &= ~0x8;
3317      }
3318      /* TODO: if mach->LoopMask == 0, jump to end of loop */
3319      UPDATE_EXEC_MASK(mach);
3320      /* fall-through (for now) */
3321   case TGSI_OPCODE_BGNLOOP:
3322      /* push LoopMask and ContMasks */
3323      assert(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
3324      assert(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
3325      assert(mach->LoopLabelStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
3326      assert(mach->BreakStackTop < TGSI_EXEC_MAX_BREAK_STACK);
3327
3328      mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask;
3329      mach->ContStack[mach->ContStackTop++] = mach->ContMask;
3330      mach->LoopLabelStack[mach->LoopLabelStackTop++] = *pc - 1;
3331      mach->BreakStack[mach->BreakStackTop++] = mach->BreakType;
3332      mach->BreakType = TGSI_EXEC_BREAK_INSIDE_LOOP;
3333      break;
3334
3335   case TGSI_OPCODE_ENDFOR:
3336      assert(mach->LoopCounterStackTop > 0);
3337      micro_sub(&mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y],
3338                &mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y],
3339                &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C]);
3340      /* update LoopMask */
3341      if (mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y].f[0] <= 0.0f) {
3342         mach->LoopMask &= ~0x1;
3343      }
3344      if (mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y].f[1] <= 0.0f) {
3345         mach->LoopMask &= ~0x2;
3346      }
3347      if (mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y].f[2] <= 0.0f) {
3348         mach->LoopMask &= ~0x4;
3349      }
3350      if (mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y].f[3] <= 0.0f) {
3351         mach->LoopMask &= ~0x8;
3352      }
3353      micro_add(&mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_X],
3354                &mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_X],
3355                &mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Z]);
3356      assert(mach->LoopLabelStackTop > 0);
3357      inst = mach->Instructions + mach->LoopLabelStack[mach->LoopLabelStackTop - 1];
3358      STORE(&mach->LoopCounterStack[mach->LoopCounterStackTop].xyzw[CHAN_X], 0, CHAN_X);
3359      /* Restore ContMask, but don't pop */
3360      assert(mach->ContStackTop > 0);
3361      mach->ContMask = mach->ContStack[mach->ContStackTop - 1];
3362      UPDATE_EXEC_MASK(mach);
3363      if (mach->ExecMask) {
3364         /* repeat loop: jump to instruction just past BGNLOOP */
3365         assert(mach->LoopLabelStackTop > 0);
3366         *pc = mach->LoopLabelStack[mach->LoopLabelStackTop - 1] + 1;
3367      }
3368      else {
3369         /* exit loop: pop LoopMask */
3370         assert(mach->LoopStackTop > 0);
3371         mach->LoopMask = mach->LoopStack[--mach->LoopStackTop];
3372         /* pop ContMask */
3373         assert(mach->ContStackTop > 0);
3374         mach->ContMask = mach->ContStack[--mach->ContStackTop];
3375         assert(mach->LoopLabelStackTop > 0);
3376         --mach->LoopLabelStackTop;
3377         assert(mach->LoopCounterStackTop > 0);
3378         --mach->LoopCounterStackTop;
3379
3380         mach->BreakType = mach->BreakStack[--mach->BreakStackTop];
3381      }
3382      UPDATE_EXEC_MASK(mach);
3383      break;
3384
3385   case TGSI_OPCODE_ENDLOOP:
3386      /* Restore ContMask, but don't pop */
3387      assert(mach->ContStackTop > 0);
3388      mach->ContMask = mach->ContStack[mach->ContStackTop - 1];
3389      UPDATE_EXEC_MASK(mach);
3390      if (mach->ExecMask) {
3391         /* repeat loop: jump to instruction just past BGNLOOP */
3392         assert(mach->LoopLabelStackTop > 0);
3393         *pc = mach->LoopLabelStack[mach->LoopLabelStackTop - 1] + 1;
3394      }
3395      else {
3396         /* exit loop: pop LoopMask */
3397         assert(mach->LoopStackTop > 0);
3398         mach->LoopMask = mach->LoopStack[--mach->LoopStackTop];
3399         /* pop ContMask */
3400         assert(mach->ContStackTop > 0);
3401         mach->ContMask = mach->ContStack[--mach->ContStackTop];
3402         assert(mach->LoopLabelStackTop > 0);
3403         --mach->LoopLabelStackTop;
3404
3405         mach->BreakType = mach->BreakStack[--mach->BreakStackTop];
3406      }
3407      UPDATE_EXEC_MASK(mach);
3408      break;
3409
3410   case TGSI_OPCODE_BRK:
3411      exec_break(mach);
3412      break;
3413
3414   case TGSI_OPCODE_CONT:
3415      /* turn off cont channels for each enabled exec channel */
3416      mach->ContMask &= ~mach->ExecMask;
3417      /* Todo: if mach->LoopMask == 0, jump to end of loop */
3418      UPDATE_EXEC_MASK(mach);
3419      break;
3420
3421   case TGSI_OPCODE_BGNSUB:
3422      /* no-op */
3423      break;
3424
3425   case TGSI_OPCODE_ENDSUB:
3426      /*
3427       * XXX: This really should be a no-op. We should never reach this opcode.
3428       */
3429
3430      assert(mach->CallStackTop > 0);
3431      mach->CallStackTop--;
3432
3433      mach->CondStackTop = mach->CallStack[mach->CallStackTop].CondStackTop;
3434      mach->CondMask = mach->CondStack[mach->CondStackTop];
3435
3436      mach->LoopStackTop = mach->CallStack[mach->CallStackTop].LoopStackTop;
3437      mach->LoopMask = mach->LoopStack[mach->LoopStackTop];
3438
3439      mach->ContStackTop = mach->CallStack[mach->CallStackTop].ContStackTop;
3440      mach->ContMask = mach->ContStack[mach->ContStackTop];
3441
3442      mach->SwitchStackTop = mach->CallStack[mach->CallStackTop].SwitchStackTop;
3443      mach->Switch = mach->SwitchStack[mach->SwitchStackTop];
3444
3445      mach->BreakStackTop = mach->CallStack[mach->CallStackTop].BreakStackTop;
3446      mach->BreakType = mach->BreakStack[mach->BreakStackTop];
3447
3448      assert(mach->FuncStackTop > 0);
3449      mach->FuncMask = mach->FuncStack[--mach->FuncStackTop];
3450
3451      *pc = mach->CallStack[mach->CallStackTop].ReturnAddr;
3452
3453      UPDATE_EXEC_MASK(mach);
3454      break;
3455
3456   case TGSI_OPCODE_NOP:
3457      break;
3458
3459   case TGSI_OPCODE_BREAKC:
3460      FETCH(&r[0], 0, CHAN_X);
3461      /* update CondMask */
3462      if (r[0].u[0] && (mach->ExecMask & 0x1)) {
3463         mach->LoopMask &= ~0x1;
3464      }
3465      if (r[0].u[1] && (mach->ExecMask & 0x2)) {
3466         mach->LoopMask &= ~0x2;
3467      }
3468      if (r[0].u[2] && (mach->ExecMask & 0x4)) {
3469         mach->LoopMask &= ~0x4;
3470      }
3471      if (r[0].u[3] && (mach->ExecMask & 0x8)) {
3472         mach->LoopMask &= ~0x8;
3473      }
3474      /* Todo: if mach->LoopMask == 0, jump to end of loop */
3475      UPDATE_EXEC_MASK(mach);
3476      break;
3477
3478   case TGSI_OPCODE_F2I:
3479      exec_vector_unary(mach, inst, micro_f2i, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_FLOAT);
3480      break;
3481
3482   case TGSI_OPCODE_IDIV:
3483      exec_vector_binary(mach, inst, micro_idiv, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
3484      break;
3485
3486   case TGSI_OPCODE_IMAX:
3487      exec_vector_binary(mach, inst, micro_imax, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
3488      break;
3489
3490   case TGSI_OPCODE_IMIN:
3491      exec_vector_binary(mach, inst, micro_imin, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
3492      break;
3493
3494   case TGSI_OPCODE_INEG:
3495      exec_vector_unary(mach, inst, micro_ineg, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
3496      break;
3497
3498   case TGSI_OPCODE_ISGE:
3499      exec_vector_binary(mach, inst, micro_isge, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
3500      break;
3501
3502   case TGSI_OPCODE_ISHR:
3503      exec_vector_binary(mach, inst, micro_ishr, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
3504      break;
3505
3506   case TGSI_OPCODE_ISLT:
3507      exec_vector_binary(mach, inst, micro_islt, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
3508      break;
3509
3510   case TGSI_OPCODE_F2U:
3511      exec_vector_unary(mach, inst, micro_f2u, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_FLOAT);
3512      break;
3513
3514   case TGSI_OPCODE_U2F:
3515      exec_vector_unary(mach, inst, micro_u2f, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_UINT);
3516      break;
3517
3518   case TGSI_OPCODE_UADD:
3519      exec_vector_binary(mach, inst, micro_uadd, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3520      break;
3521
3522   case TGSI_OPCODE_UDIV:
3523      exec_vector_binary(mach, inst, micro_udiv, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3524      break;
3525
3526   case TGSI_OPCODE_UMAD:
3527      exec_vector_trinary(mach, inst, micro_umad, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3528      break;
3529
3530   case TGSI_OPCODE_UMAX:
3531      exec_vector_binary(mach, inst, micro_umax, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3532      break;
3533
3534   case TGSI_OPCODE_UMIN:
3535      exec_vector_binary(mach, inst, micro_umin, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3536      break;
3537
3538   case TGSI_OPCODE_UMOD:
3539      exec_vector_binary(mach, inst, micro_umod, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3540      break;
3541
3542   case TGSI_OPCODE_UMUL:
3543      exec_vector_binary(mach, inst, micro_umul, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3544      break;
3545
3546   case TGSI_OPCODE_USEQ:
3547      exec_vector_binary(mach, inst, micro_useq, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3548      break;
3549
3550   case TGSI_OPCODE_USGE:
3551      exec_vector_binary(mach, inst, micro_usge, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3552      break;
3553
3554   case TGSI_OPCODE_USHR:
3555      exec_vector_binary(mach, inst, micro_ushr, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3556      break;
3557
3558   case TGSI_OPCODE_USLT:
3559      exec_vector_binary(mach, inst, micro_uslt, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3560      break;
3561
3562   case TGSI_OPCODE_USNE:
3563      exec_vector_binary(mach, inst, micro_usne, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3564      break;
3565
3566   case TGSI_OPCODE_SWITCH:
3567      exec_switch(mach, inst);
3568      break;
3569
3570   case TGSI_OPCODE_CASE:
3571      exec_case(mach, inst);
3572      break;
3573
3574   case TGSI_OPCODE_DEFAULT:
3575      exec_default(mach);
3576      break;
3577
3578   case TGSI_OPCODE_ENDSWITCH:
3579      exec_endswitch(mach);
3580      break;
3581
3582   default:
3583      assert( 0 );
3584   }
3585}
3586
3587
3588#define DEBUG_EXECUTION 0
3589
3590
3591/**
3592 * Run TGSI interpreter.
3593 * \return bitmask of "alive" quad components
3594 */
3595uint
3596tgsi_exec_machine_run( struct tgsi_exec_machine *mach )
3597{
3598   uint i;
3599   int pc = 0;
3600
3601   mach->CondMask = 0xf;
3602   mach->LoopMask = 0xf;
3603   mach->ContMask = 0xf;
3604   mach->FuncMask = 0xf;
3605   mach->ExecMask = 0xf;
3606
3607   mach->Switch.mask = 0xf;
3608
3609   assert(mach->CondStackTop == 0);
3610   assert(mach->LoopStackTop == 0);
3611   assert(mach->ContStackTop == 0);
3612   assert(mach->SwitchStackTop == 0);
3613   assert(mach->BreakStackTop == 0);
3614   assert(mach->CallStackTop == 0);
3615
3616   mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] = 0;
3617   mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] = 0;
3618
3619   if( mach->Processor == TGSI_PROCESSOR_GEOMETRY ) {
3620      mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0] = 0;
3621      mach->Primitives[0] = 0;
3622   }
3623
3624   for (i = 0; i < QUAD_SIZE; i++) {
3625      mach->Temps[TEMP_CC_I].xyzw[TEMP_CC_C].u[i] =
3626         (TGSI_EXEC_CC_EQ << TGSI_EXEC_CC_X_SHIFT) |
3627         (TGSI_EXEC_CC_EQ << TGSI_EXEC_CC_Y_SHIFT) |
3628         (TGSI_EXEC_CC_EQ << TGSI_EXEC_CC_Z_SHIFT) |
3629         (TGSI_EXEC_CC_EQ << TGSI_EXEC_CC_W_SHIFT);
3630   }
3631
3632   /* execute declarations (interpolants) */
3633   for (i = 0; i < mach->NumDeclarations; i++) {
3634      exec_declaration( mach, mach->Declarations+i );
3635   }
3636
3637   {
3638#if DEBUG_EXECUTION
3639      struct tgsi_exec_vector temps[TGSI_EXEC_NUM_TEMPS + TGSI_EXEC_NUM_TEMP_EXTRAS];
3640      struct tgsi_exec_vector outputs[PIPE_MAX_ATTRIBS];
3641      uint inst = 1;
3642
3643      memcpy(temps, mach->Temps, sizeof(temps));
3644      memcpy(outputs, mach->Outputs, sizeof(outputs));
3645#endif
3646
3647      /* execute instructions, until pc is set to -1 */
3648      while (pc != -1) {
3649
3650#if DEBUG_EXECUTION
3651         uint i;
3652
3653         tgsi_dump_instruction(&mach->Instructions[pc], inst++);
3654#endif
3655
3656         assert(pc < (int) mach->NumInstructions);
3657         exec_instruction(mach, mach->Instructions + pc, &pc);
3658
3659#if DEBUG_EXECUTION
3660         for (i = 0; i < TGSI_EXEC_NUM_TEMPS + TGSI_EXEC_NUM_TEMP_EXTRAS; i++) {
3661            if (memcmp(&temps[i], &mach->Temps[i], sizeof(temps[i]))) {
3662               uint j;
3663
3664               memcpy(&temps[i], &mach->Temps[i], sizeof(temps[i]));
3665               debug_printf("TEMP[%2u] = ", i);
3666               for (j = 0; j < 4; j++) {
3667                  if (j > 0) {
3668                     debug_printf("           ");
3669                  }
3670                  debug_printf("(%6f %u, %6f %u, %6f %u, %6f %u)\n",
3671                               temps[i].xyzw[0].f[j], temps[i].xyzw[0].u[j],
3672                               temps[i].xyzw[1].f[j], temps[i].xyzw[1].u[j],
3673                               temps[i].xyzw[2].f[j], temps[i].xyzw[2].u[j],
3674                               temps[i].xyzw[3].f[j], temps[i].xyzw[3].u[j]);
3675               }
3676            }
3677         }
3678         for (i = 0; i < PIPE_MAX_ATTRIBS; i++) {
3679            if (memcmp(&outputs[i], &mach->Outputs[i], sizeof(outputs[i]))) {
3680               uint j;
3681
3682               memcpy(&outputs[i], &mach->Outputs[i], sizeof(outputs[i]));
3683               debug_printf("OUT[%2u] =  ", i);
3684               for (j = 0; j < 4; j++) {
3685                  if (j > 0) {
3686                     debug_printf("           ");
3687                  }
3688                  debug_printf("(%6f %u, %6f %u, %6f %u, %6f %u)\n",
3689                               outputs[i].xyzw[0].f[j], outputs[i].xyzw[0].u[j],
3690                               outputs[i].xyzw[1].f[j], outputs[i].xyzw[1].u[j],
3691                               outputs[i].xyzw[2].f[j], outputs[i].xyzw[2].u[j],
3692                               outputs[i].xyzw[3].f[j], outputs[i].xyzw[3].u[j]);
3693               }
3694            }
3695         }
3696#endif
3697      }
3698   }
3699
3700#if 0
3701   /* we scale from floats in [0,1] to Zbuffer ints in sp_quad_depth_test.c */
3702   if (mach->Processor == TGSI_PROCESSOR_FRAGMENT) {
3703      /*
3704       * Scale back depth component.
3705       */
3706      for (i = 0; i < 4; i++)
3707         mach->Outputs[0].xyzw[2].f[i] *= ctx->DrawBuffer->_DepthMaxF;
3708   }
3709#endif
3710
3711   assert(mach->CondStackTop == 0);
3712   assert(mach->LoopStackTop == 0);
3713   assert(mach->ContStackTop == 0);
3714   assert(mach->SwitchStackTop == 0);
3715   assert(mach->BreakStackTop == 0);
3716   assert(mach->CallStackTop == 0);
3717
3718   return ~mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
3719}
3720