tgsi_exec.c revision d68f024b7dd1891d4939bf56d3065acc225b9c81
1/**************************************************************************
2 *
3 * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas.
4 * All Rights Reserved.
5 * Copyright 2009-2010 VMware, Inc.  All rights Reserved.
6 *
7 * Permission is hereby granted, free of charge, to any person obtaining a
8 * copy of this software and associated documentation files (the
9 * "Software"), to deal in the Software without restriction, including
10 * without limitation the rights to use, copy, modify, merge, publish,
11 * distribute, sub license, and/or sell copies of the Software, and to
12 * permit persons to whom the Software is furnished to do so, subject to
13 * the following conditions:
14 *
15 * The above copyright notice and this permission notice (including the
16 * next paragraph) shall be included in all copies or substantial portions
17 * of the Software.
18 *
19 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
20 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
21 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
22 * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
23 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
24 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
25 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 *
27 **************************************************************************/
28
29/**
30 * TGSI interpreter/executor.
31 *
32 * Flow control information:
33 *
34 * Since we operate on 'quads' (4 pixels or 4 vertices in parallel)
35 * flow control statements (IF/ELSE/ENDIF, LOOP/ENDLOOP) require special
36 * care since a condition may be true for some quad components but false
37 * for other components.
38 *
39 * We basically execute all statements (even if they're in the part of
40 * an IF/ELSE clause that's "not taken") and use a special mask to
41 * control writing to destination registers.  This is the ExecMask.
42 * See store_dest().
43 *
 * The ExecMask is computed from five other masks (CondMask, LoopMask,
 * ContMask, Switch.mask and FuncMask; see UPDATE_EXEC_MASK), which are
 * controlled by the flow control instructions (among them IF/ELSE/ENDIF,
 * LOOP/ENDLOOP and CONT).
47 *
48 *
49 * Authors:
50 *   Michal Krol
51 *   Brian Paul
52 */
53
54#include "pipe/p_compiler.h"
55#include "pipe/p_state.h"
56#include "pipe/p_shader_tokens.h"
57#include "tgsi/tgsi_dump.h"
58#include "tgsi/tgsi_parse.h"
59#include "tgsi/tgsi_util.h"
60#include "tgsi_exec.h"
61#include "util/u_memory.h"
62#include "util/u_math.h"
63
64
65#define FAST_MATH 1
66
67#define TILE_TOP_LEFT     0
68#define TILE_TOP_RIGHT    1
69#define TILE_BOTTOM_LEFT  2
70#define TILE_BOTTOM_RIGHT 3
71
72static void
73micro_abs(union tgsi_exec_channel *dst,
74          const union tgsi_exec_channel *src)
75{
76   dst->f[0] = fabsf(src->f[0]);
77   dst->f[1] = fabsf(src->f[1]);
78   dst->f[2] = fabsf(src->f[2]);
79   dst->f[3] = fabsf(src->f[3]);
80}
81
82static void
83micro_arl(union tgsi_exec_channel *dst,
84          const union tgsi_exec_channel *src)
85{
86   dst->i[0] = (int)floorf(src->f[0]);
87   dst->i[1] = (int)floorf(src->f[1]);
88   dst->i[2] = (int)floorf(src->f[2]);
89   dst->i[3] = (int)floorf(src->f[3]);
90}
91
92static void
93micro_arr(union tgsi_exec_channel *dst,
94          const union tgsi_exec_channel *src)
95{
96   dst->i[0] = (int)floorf(src->f[0] + 0.5f);
97   dst->i[1] = (int)floorf(src->f[1] + 0.5f);
98   dst->i[2] = (int)floorf(src->f[2] + 0.5f);
99   dst->i[3] = (int)floorf(src->f[3] + 0.5f);
100}
101
102static void
103micro_ceil(union tgsi_exec_channel *dst,
104           const union tgsi_exec_channel *src)
105{
106   dst->f[0] = ceilf(src->f[0]);
107   dst->f[1] = ceilf(src->f[1]);
108   dst->f[2] = ceilf(src->f[2]);
109   dst->f[3] = ceilf(src->f[3]);
110}
111
112static void
113micro_cos(union tgsi_exec_channel *dst,
114          const union tgsi_exec_channel *src)
115{
116   dst->f[0] = cosf(src->f[0]);
117   dst->f[1] = cosf(src->f[1]);
118   dst->f[2] = cosf(src->f[2]);
119   dst->f[3] = cosf(src->f[3]);
120}
121
122static void
123micro_ddx(union tgsi_exec_channel *dst,
124          const union tgsi_exec_channel *src)
125{
126   dst->f[0] =
127   dst->f[1] =
128   dst->f[2] =
129   dst->f[3] = src->f[TILE_BOTTOM_RIGHT] - src->f[TILE_BOTTOM_LEFT];
130}
131
132static void
133micro_ddy(union tgsi_exec_channel *dst,
134          const union tgsi_exec_channel *src)
135{
136   dst->f[0] =
137   dst->f[1] =
138   dst->f[2] =
139   dst->f[3] = src->f[TILE_BOTTOM_LEFT] - src->f[TILE_TOP_LEFT];
140}
141
142static void
143micro_exp2(union tgsi_exec_channel *dst,
144           const union tgsi_exec_channel *src)
145{
146#if FAST_MATH
147   dst->f[0] = util_fast_exp2(src->f[0]);
148   dst->f[1] = util_fast_exp2(src->f[1]);
149   dst->f[2] = util_fast_exp2(src->f[2]);
150   dst->f[3] = util_fast_exp2(src->f[3]);
151#else
152#if DEBUG
153   /* Inf is okay for this instruction, so clamp it to silence assertions. */
154   uint i;
155   union tgsi_exec_channel clamped;
156
157   for (i = 0; i < 4; i++) {
158      if (src->f[i] > 127.99999f) {
159         clamped.f[i] = 127.99999f;
160      } else if (src->f[i] < -126.99999f) {
161         clamped.f[i] = -126.99999f;
162      } else {
163         clamped.f[i] = src->f[i];
164      }
165   }
166   src = &clamped;
167#endif /* DEBUG */
168
169   dst->f[0] = powf(2.0f, src->f[0]);
170   dst->f[1] = powf(2.0f, src->f[1]);
171   dst->f[2] = powf(2.0f, src->f[2]);
172   dst->f[3] = powf(2.0f, src->f[3]);
173#endif /* FAST_MATH */
174}
175
176static void
177micro_flr(union tgsi_exec_channel *dst,
178          const union tgsi_exec_channel *src)
179{
180   dst->f[0] = floorf(src->f[0]);
181   dst->f[1] = floorf(src->f[1]);
182   dst->f[2] = floorf(src->f[2]);
183   dst->f[3] = floorf(src->f[3]);
184}
185
186static void
187micro_frc(union tgsi_exec_channel *dst,
188          const union tgsi_exec_channel *src)
189{
190   dst->f[0] = src->f[0] - floorf(src->f[0]);
191   dst->f[1] = src->f[1] - floorf(src->f[1]);
192   dst->f[2] = src->f[2] - floorf(src->f[2]);
193   dst->f[3] = src->f[3] - floorf(src->f[3]);
194}
195
196static void
197micro_iabs(union tgsi_exec_channel *dst,
198           const union tgsi_exec_channel *src)
199{
200   dst->i[0] = src->i[0] >= 0 ? src->i[0] : -src->i[0];
201   dst->i[1] = src->i[1] >= 0 ? src->i[1] : -src->i[1];
202   dst->i[2] = src->i[2] >= 0 ? src->i[2] : -src->i[2];
203   dst->i[3] = src->i[3] >= 0 ? src->i[3] : -src->i[3];
204}
205
206static void
207micro_ineg(union tgsi_exec_channel *dst,
208           const union tgsi_exec_channel *src)
209{
210   dst->i[0] = -src->i[0];
211   dst->i[1] = -src->i[1];
212   dst->i[2] = -src->i[2];
213   dst->i[3] = -src->i[3];
214}
215
216static void
217micro_lg2(union tgsi_exec_channel *dst,
218          const union tgsi_exec_channel *src)
219{
220#if FAST_MATH
221   dst->f[0] = util_fast_log2(src->f[0]);
222   dst->f[1] = util_fast_log2(src->f[1]);
223   dst->f[2] = util_fast_log2(src->f[2]);
224   dst->f[3] = util_fast_log2(src->f[3]);
225#else
226   dst->f[0] = logf(src->f[0]) * 1.442695f;
227   dst->f[1] = logf(src->f[1]) * 1.442695f;
228   dst->f[2] = logf(src->f[2]) * 1.442695f;
229   dst->f[3] = logf(src->f[3]) * 1.442695f;
230#endif
231}
232
233static void
234micro_lrp(union tgsi_exec_channel *dst,
235          const union tgsi_exec_channel *src)
236{
237   dst->f[0] = src[0].f[0] * (src[1].f[0] - src[2].f[0]) + src[2].f[0];
238   dst->f[1] = src[0].f[1] * (src[1].f[1] - src[2].f[1]) + src[2].f[1];
239   dst->f[2] = src[0].f[2] * (src[1].f[2] - src[2].f[2]) + src[2].f[2];
240   dst->f[3] = src[0].f[3] * (src[1].f[3] - src[2].f[3]) + src[2].f[3];
241}
242
243static void
244micro_mad(union tgsi_exec_channel *dst,
245          const union tgsi_exec_channel *src)
246{
247   dst->f[0] = src[0].f[0] * src[1].f[0] + src[2].f[0];
248   dst->f[1] = src[0].f[1] * src[1].f[1] + src[2].f[1];
249   dst->f[2] = src[0].f[2] * src[1].f[2] + src[2].f[2];
250   dst->f[3] = src[0].f[3] * src[1].f[3] + src[2].f[3];
251}
252
253static void
254micro_mov(union tgsi_exec_channel *dst,
255          const union tgsi_exec_channel *src)
256{
257   dst->u[0] = src->u[0];
258   dst->u[1] = src->u[1];
259   dst->u[2] = src->u[2];
260   dst->u[3] = src->u[3];
261}
262
263static void
264micro_rcp(union tgsi_exec_channel *dst,
265          const union tgsi_exec_channel *src)
266{
267#if 0 /* for debugging */
268   assert(src->f[0] != 0.0f);
269   assert(src->f[1] != 0.0f);
270   assert(src->f[2] != 0.0f);
271   assert(src->f[3] != 0.0f);
272#endif
273   dst->f[0] = 1.0f / src->f[0];
274   dst->f[1] = 1.0f / src->f[1];
275   dst->f[2] = 1.0f / src->f[2];
276   dst->f[3] = 1.0f / src->f[3];
277}
278
279static void
280micro_rnd(union tgsi_exec_channel *dst,
281          const union tgsi_exec_channel *src)
282{
283   dst->f[0] = floorf(src->f[0] + 0.5f);
284   dst->f[1] = floorf(src->f[1] + 0.5f);
285   dst->f[2] = floorf(src->f[2] + 0.5f);
286   dst->f[3] = floorf(src->f[3] + 0.5f);
287}
288
289static void
290micro_rsq(union tgsi_exec_channel *dst,
291          const union tgsi_exec_channel *src)
292{
293#if 0 /* for debugging */
294   assert(src->f[0] != 0.0f);
295   assert(src->f[1] != 0.0f);
296   assert(src->f[2] != 0.0f);
297   assert(src->f[3] != 0.0f);
298#endif
299   dst->f[0] = 1.0f / sqrtf(fabsf(src->f[0]));
300   dst->f[1] = 1.0f / sqrtf(fabsf(src->f[1]));
301   dst->f[2] = 1.0f / sqrtf(fabsf(src->f[2]));
302   dst->f[3] = 1.0f / sqrtf(fabsf(src->f[3]));
303}
304
305static void
306micro_seq(union tgsi_exec_channel *dst,
307          const union tgsi_exec_channel *src)
308{
309   dst->f[0] = src[0].f[0] == src[1].f[0] ? 1.0f : 0.0f;
310   dst->f[1] = src[0].f[1] == src[1].f[1] ? 1.0f : 0.0f;
311   dst->f[2] = src[0].f[2] == src[1].f[2] ? 1.0f : 0.0f;
312   dst->f[3] = src[0].f[3] == src[1].f[3] ? 1.0f : 0.0f;
313}
314
315static void
316micro_sge(union tgsi_exec_channel *dst,
317          const union tgsi_exec_channel *src)
318{
319   dst->f[0] = src[0].f[0] >= src[1].f[0] ? 1.0f : 0.0f;
320   dst->f[1] = src[0].f[1] >= src[1].f[1] ? 1.0f : 0.0f;
321   dst->f[2] = src[0].f[2] >= src[1].f[2] ? 1.0f : 0.0f;
322   dst->f[3] = src[0].f[3] >= src[1].f[3] ? 1.0f : 0.0f;
323}
324
325static void
326micro_sgn(union tgsi_exec_channel *dst,
327          const union tgsi_exec_channel *src)
328{
329   dst->f[0] = src->f[0] < 0.0f ? -1.0f : src->f[0] > 0.0f ? 1.0f : 0.0f;
330   dst->f[1] = src->f[1] < 0.0f ? -1.0f : src->f[1] > 0.0f ? 1.0f : 0.0f;
331   dst->f[2] = src->f[2] < 0.0f ? -1.0f : src->f[2] > 0.0f ? 1.0f : 0.0f;
332   dst->f[3] = src->f[3] < 0.0f ? -1.0f : src->f[3] > 0.0f ? 1.0f : 0.0f;
333}
334
335static void
336micro_sgt(union tgsi_exec_channel *dst,
337          const union tgsi_exec_channel *src)
338{
339   dst->f[0] = src[0].f[0] > src[1].f[0] ? 1.0f : 0.0f;
340   dst->f[1] = src[0].f[1] > src[1].f[1] ? 1.0f : 0.0f;
341   dst->f[2] = src[0].f[2] > src[1].f[2] ? 1.0f : 0.0f;
342   dst->f[3] = src[0].f[3] > src[1].f[3] ? 1.0f : 0.0f;
343}
344
345static void
346micro_sin(union tgsi_exec_channel *dst,
347          const union tgsi_exec_channel *src)
348{
349   dst->f[0] = sinf(src->f[0]);
350   dst->f[1] = sinf(src->f[1]);
351   dst->f[2] = sinf(src->f[2]);
352   dst->f[3] = sinf(src->f[3]);
353}
354
355static void
356micro_sle(union tgsi_exec_channel *dst,
357          const union tgsi_exec_channel *src)
358{
359   dst->f[0] = src[0].f[0] <= src[1].f[0] ? 1.0f : 0.0f;
360   dst->f[1] = src[0].f[1] <= src[1].f[1] ? 1.0f : 0.0f;
361   dst->f[2] = src[0].f[2] <= src[1].f[2] ? 1.0f : 0.0f;
362   dst->f[3] = src[0].f[3] <= src[1].f[3] ? 1.0f : 0.0f;
363}
364
365static void
366micro_slt(union tgsi_exec_channel *dst,
367          const union tgsi_exec_channel *src)
368{
369   dst->f[0] = src[0].f[0] < src[1].f[0] ? 1.0f : 0.0f;
370   dst->f[1] = src[0].f[1] < src[1].f[1] ? 1.0f : 0.0f;
371   dst->f[2] = src[0].f[2] < src[1].f[2] ? 1.0f : 0.0f;
372   dst->f[3] = src[0].f[3] < src[1].f[3] ? 1.0f : 0.0f;
373}
374
375static void
376micro_sne(union tgsi_exec_channel *dst,
377          const union tgsi_exec_channel *src)
378{
379   dst->f[0] = src[0].f[0] != src[1].f[0] ? 1.0f : 0.0f;
380   dst->f[1] = src[0].f[1] != src[1].f[1] ? 1.0f : 0.0f;
381   dst->f[2] = src[0].f[2] != src[1].f[2] ? 1.0f : 0.0f;
382   dst->f[3] = src[0].f[3] != src[1].f[3] ? 1.0f : 0.0f;
383}
384
385static void
386micro_trunc(union tgsi_exec_channel *dst,
387            const union tgsi_exec_channel *src)
388{
389   dst->f[0] = (float)(int)src->f[0];
390   dst->f[1] = (float)(int)src->f[1];
391   dst->f[2] = (float)(int)src->f[2];
392   dst->f[3] = (float)(int)src->f[3];
393}
394
395
/** Indices of the X, Y, Z and W channels. */
#define CHAN_X 0
#define CHAN_Y 1
#define CHAN_Z 2
#define CHAN_W 3

/** Data type selector: float, signed integer or unsigned integer. */
enum tgsi_exec_datatype {
   TGSI_EXEC_DATA_FLOAT,
   TGSI_EXEC_DATA_INT,
   TGSI_EXEC_DATA_UINT
};
406
/*
 * Short aliases for the utility registers kept in the machine's Temps[]
 * array.  Each register is addressed by a pair: the _I name is the index
 * into Temps[], the _C name is the channel within that register.
 */

/* Constants filled in by tgsi_exec_machine_create(). */
#define TEMP_0_I         TGSI_EXEC_TEMP_00000000_I   /* 0x00000000 */
#define TEMP_0_C         TGSI_EXEC_TEMP_00000000_C
#define TEMP_7F_I        TGSI_EXEC_TEMP_7FFFFFFF_I   /* 0x7FFFFFFF */
#define TEMP_7F_C        TGSI_EXEC_TEMP_7FFFFFFF_C
#define TEMP_80_I        TGSI_EXEC_TEMP_80000000_I   /* 0x80000000 */
#define TEMP_80_C        TGSI_EXEC_TEMP_80000000_C
#define TEMP_FF_I        TGSI_EXEC_TEMP_FFFFFFFF_I   /* 0xFFFFFFFF */
#define TEMP_FF_C        TGSI_EXEC_TEMP_FFFFFFFF_C
#define TEMP_1_I         TGSI_EXEC_TEMP_ONE_I        /* 1.0f */
#define TEMP_1_C         TGSI_EXEC_TEMP_ONE_C
#define TEMP_2_I         TGSI_EXEC_TEMP_TWO_I        /* 2.0f */
#define TEMP_2_C         TGSI_EXEC_TEMP_TWO_C
#define TEMP_128_I       TGSI_EXEC_TEMP_128_I        /* 128.0f */
#define TEMP_128_C       TGSI_EXEC_TEMP_128_C
#define TEMP_M128_I      TGSI_EXEC_TEMP_MINUS_128_I  /* -128.0f */
#define TEMP_M128_C      TGSI_EXEC_TEMP_MINUS_128_C

/* Execution state registers. */
#define TEMP_KILMASK_I   TGSI_EXEC_TEMP_KILMASK_I
#define TEMP_KILMASK_C   TGSI_EXEC_TEMP_KILMASK_C
#define TEMP_OUTPUT_I    TGSI_EXEC_TEMP_OUTPUT_I
#define TEMP_OUTPUT_C    TGSI_EXEC_TEMP_OUTPUT_C
#define TEMP_PRIMITIVE_I TGSI_EXEC_TEMP_PRIMITIVE_I
#define TEMP_PRIMITIVE_C TGSI_EXEC_TEMP_PRIMITIVE_C
#define TEMP_CC_I        TGSI_EXEC_TEMP_CC_I
#define TEMP_CC_C        TGSI_EXEC_TEMP_CC_C

/* More constants filled in by tgsi_exec_machine_create(). */
#define TEMP_3_I         TGSI_EXEC_TEMP_THREE_I      /* 3.0f */
#define TEMP_3_C         TGSI_EXEC_TEMP_THREE_C
#define TEMP_HALF_I      TGSI_EXEC_TEMP_HALF_I       /* 0.5f */
#define TEMP_HALF_C      TGSI_EXEC_TEMP_HALF_C

/* Whole-register aliases (no channel component). */
#define TEMP_R0          TGSI_EXEC_TEMP_R0
#define TEMP_P0          TGSI_EXEC_TEMP_P0
440
/** Non-zero if channel CHAN is set in the write mask of INST's first dest. */
#define IS_CHANNEL_ENABLED(INST, CHAN) \
   ((INST).Dst[0].Register.WriteMask & (1 << (CHAN)))

/** Same test against the write mask of INST's second destination. */
#define IS_CHANNEL_ENABLED2(INST, CHAN) \
   ((INST).Dst[1].Register.WriteMask & (1 << (CHAN)))

/**
 * Loop header: runs the following statement once for every channel enabled
 * in the first destination's write mask.  Expands to for + if, so take
 * care when the loop body is followed by an 'else'.
 */
#define FOR_EACH_ENABLED_CHANNEL(INST, CHAN) \
   for (CHAN = 0; CHAN < NUM_CHANNELS; CHAN++) \
      if (IS_CHANNEL_ENABLED(INST, CHAN))

/** As FOR_EACH_ENABLED_CHANNEL, but for the second destination. */
#define FOR_EACH_ENABLED_CHANNEL2(INST, CHAN) \
   for (CHAN = 0; CHAN < NUM_CHANNELS; CHAN++) \
      if (IS_CHANNEL_ENABLED2(INST, CHAN))
454
455
/**
 * Recompute the execution mask of machine MACH (a pointer expression).
 *
 * ExecMask is the bitwise AND of CondMask, LoopMask, ContMask, Switch.mask
 * and FuncMask.  The argument and the whole expansion are parenthesised so
 * that any pointer expression (e.g. &machine) can be passed and the macro
 * can be used wherever an expression is allowed.
 */
#define UPDATE_EXEC_MASK(MACH) \
   ((MACH)->ExecMask = (MACH)->CondMask & (MACH)->LoopMask & \
                       (MACH)->ContMask & (MACH)->Switch.mask & \
                       (MACH)->FuncMask)
459
460
461static const union tgsi_exec_channel ZeroVec =
462   { { 0.0, 0.0, 0.0, 0.0 } };
463
464static const union tgsi_exec_channel OneVec = {
465   {1.0f, 1.0f, 1.0f, 1.0f}
466};
467
468
469/**
470 * Assert that none of the float values in 'chan' are infinite or NaN.
471 * NaN and Inf may occur normally during program execution and should
472 * not lead to crashes, etc.  But when debugging, it's helpful to catch
473 * them.
474 */
475static INLINE void
476check_inf_or_nan(const union tgsi_exec_channel *chan)
477{
478   assert(!util_is_inf_or_nan((chan)->f[0]));
479   assert(!util_is_inf_or_nan((chan)->f[1]));
480   assert(!util_is_inf_or_nan((chan)->f[2]));
481   assert(!util_is_inf_or_nan((chan)->f[3]));
482}
483
484
#ifdef DEBUG
/** Debug helper: print 'msg' followed by the four float lanes of 'chan'. */
static void
print_chan(const char *msg, const union tgsi_exec_channel *chan)
{
   const float *lanes = chan->f;

   debug_printf("%s = {%f, %f, %f, %f}\n",
                msg, lanes[0], lanes[1], lanes[2], lanes[3]);
}
#endif
493
494
#ifdef DEBUG
/**
 * Debug helper: print temporary register 'index' of 'mach', one line per
 * channel (X, Y, Z, W) with the channel's four float lanes.
 */
static void
print_temp(const struct tgsi_exec_machine *mach, uint index)
{
   const struct tgsi_exec_vector *reg = &mach->Temps[index];
   int chan = 0;

   debug_printf("Temp[%u] =\n", index);
   while (chan < 4) {
      const float *lanes = reg->xyzw[chan].f;

      debug_printf("  %c: { %f, %f, %f, %f }\n",
                   "XYZW"[chan],
                   lanes[0], lanes[1], lanes[2], lanes[3]);
      chan++;
   }
}
#endif
512
513
514/**
515 * Check if there's a potential src/dst register data dependency when
516 * using SOA execution.
517 * Example:
518 *   MOV T, T.yxwz;
519 * This would expand into:
520 *   MOV t0, t1;
521 *   MOV t1, t0;
522 *   MOV t2, t3;
523 *   MOV t3, t2;
524 * The second instruction will have the wrong value for t0 if executed as-is.
525 */
526boolean
527tgsi_check_soa_dependencies(const struct tgsi_full_instruction *inst)
528{
529   uint i, chan;
530
531   uint writemask = inst->Dst[0].Register.WriteMask;
532   if (writemask == TGSI_WRITEMASK_X ||
533       writemask == TGSI_WRITEMASK_Y ||
534       writemask == TGSI_WRITEMASK_Z ||
535       writemask == TGSI_WRITEMASK_W ||
536       writemask == TGSI_WRITEMASK_NONE) {
537      /* no chance of data dependency */
538      return FALSE;
539   }
540
541   /* loop over src regs */
542   for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
543      if ((inst->Src[i].Register.File ==
544           inst->Dst[0].Register.File) &&
545          (inst->Src[i].Register.Index ==
546           inst->Dst[0].Register.Index)) {
547         /* loop over dest channels */
548         uint channelsWritten = 0x0;
549         FOR_EACH_ENABLED_CHANNEL(*inst, chan) {
550            /* check if we're reading a channel that's been written */
551            uint swizzle = tgsi_util_get_full_src_register_swizzle(&inst->Src[i], chan);
552            if (channelsWritten & (1 << swizzle)) {
553               return TRUE;
554            }
555
556            channelsWritten |= (1 << chan);
557         }
558      }
559   }
560   return FALSE;
561}
562
563
564/**
565 * Initialize machine state by expanding tokens to full instructions,
566 * allocating temporary storage, setting up constants, etc.
567 * After this, we can call tgsi_exec_machine_run() many times.
568 */
569void
570tgsi_exec_machine_bind_shader(
571   struct tgsi_exec_machine *mach,
572   const struct tgsi_token *tokens,
573   uint numSamplers,
574   struct tgsi_sampler **samplers)
575{
576   uint k;
577   struct tgsi_parse_context parse;
578   struct tgsi_exec_labels *labels = &mach->Labels;
579   struct tgsi_full_instruction *instructions;
580   struct tgsi_full_declaration *declarations;
581   uint maxInstructions = 10, numInstructions = 0;
582   uint maxDeclarations = 10, numDeclarations = 0;
583   uint instno = 0;
584
585#if 0
586   tgsi_dump(tokens, 0);
587#endif
588
589   util_init_math();
590
591   mach->Tokens = tokens;
592   mach->Samplers = samplers;
593
594   k = tgsi_parse_init (&parse, mach->Tokens);
595   if (k != TGSI_PARSE_OK) {
596      debug_printf( "Problem parsing!\n" );
597      return;
598   }
599
600   mach->Processor = parse.FullHeader.Processor.Processor;
601   mach->ImmLimit = 0;
602   labels->count = 0;
603
604   declarations = (struct tgsi_full_declaration *)
605      MALLOC( maxDeclarations * sizeof(struct tgsi_full_declaration) );
606
607   if (!declarations) {
608      return;
609   }
610
611   instructions = (struct tgsi_full_instruction *)
612      MALLOC( maxInstructions * sizeof(struct tgsi_full_instruction) );
613
614   if (!instructions) {
615      FREE( declarations );
616      return;
617   }
618
619   while( !tgsi_parse_end_of_tokens( &parse ) ) {
620      uint pointer = parse.Position;
621      uint i;
622
623      tgsi_parse_token( &parse );
624      switch( parse.FullToken.Token.Type ) {
625      case TGSI_TOKEN_TYPE_DECLARATION:
626         /* save expanded declaration */
627         if (numDeclarations == maxDeclarations) {
628            declarations = REALLOC(declarations,
629                                   maxDeclarations
630                                   * sizeof(struct tgsi_full_declaration),
631                                   (maxDeclarations + 10)
632                                   * sizeof(struct tgsi_full_declaration));
633            maxDeclarations += 10;
634         }
635         if (parse.FullToken.FullDeclaration.Declaration.File == TGSI_FILE_OUTPUT) {
636            unsigned reg;
637            for (reg = parse.FullToken.FullDeclaration.Range.First;
638                 reg <= parse.FullToken.FullDeclaration.Range.Last;
639                 ++reg) {
640               ++mach->NumOutputs;
641            }
642         }
643         memcpy(declarations + numDeclarations,
644                &parse.FullToken.FullDeclaration,
645                sizeof(declarations[0]));
646         numDeclarations++;
647         break;
648
649      case TGSI_TOKEN_TYPE_IMMEDIATE:
650         {
651            uint size = parse.FullToken.FullImmediate.Immediate.NrTokens - 1;
652            assert( size <= 4 );
653            assert( mach->ImmLimit + 1 <= TGSI_EXEC_NUM_IMMEDIATES );
654
655            for( i = 0; i < size; i++ ) {
656               mach->Imms[mach->ImmLimit][i] =
657		  parse.FullToken.FullImmediate.u[i].Float;
658            }
659            mach->ImmLimit += 1;
660         }
661         break;
662
663      case TGSI_TOKEN_TYPE_INSTRUCTION:
664         assert( labels->count < MAX_LABELS );
665
666         labels->labels[labels->count][0] = instno;
667         labels->labels[labels->count][1] = pointer;
668         labels->count++;
669
670         /* save expanded instruction */
671         if (numInstructions == maxInstructions) {
672            instructions = REALLOC(instructions,
673                                   maxInstructions
674                                   * sizeof(struct tgsi_full_instruction),
675                                   (maxInstructions + 10)
676                                   * sizeof(struct tgsi_full_instruction));
677            maxInstructions += 10;
678         }
679
680         memcpy(instructions + numInstructions,
681                &parse.FullToken.FullInstruction,
682                sizeof(instructions[0]));
683
684         numInstructions++;
685         break;
686
687      case TGSI_TOKEN_TYPE_PROPERTY:
688         break;
689
690      default:
691         assert( 0 );
692      }
693   }
694   tgsi_parse_free (&parse);
695
696   if (mach->Declarations) {
697      FREE( mach->Declarations );
698   }
699   mach->Declarations = declarations;
700   mach->NumDeclarations = numDeclarations;
701
702   if (mach->Instructions) {
703      FREE( mach->Instructions );
704   }
705   mach->Instructions = instructions;
706   mach->NumInstructions = numInstructions;
707}
708
709
710struct tgsi_exec_machine *
711tgsi_exec_machine_create( void )
712{
713   struct tgsi_exec_machine *mach;
714   uint i;
715
716   mach = align_malloc( sizeof *mach, 16 );
717   if (!mach)
718      goto fail;
719
720   memset(mach, 0, sizeof(*mach));
721
722   mach->Addrs = &mach->Temps[TGSI_EXEC_TEMP_ADDR];
723   mach->MaxGeometryShaderOutputs = TGSI_MAX_TOTAL_VERTICES;
724   mach->Predicates = &mach->Temps[TGSI_EXEC_TEMP_P0];
725
726   /* Setup constants. */
727   for( i = 0; i < 4; i++ ) {
728      mach->Temps[TEMP_0_I].xyzw[TEMP_0_C].u[i] = 0x00000000;
729      mach->Temps[TEMP_7F_I].xyzw[TEMP_7F_C].u[i] = 0x7FFFFFFF;
730      mach->Temps[TEMP_80_I].xyzw[TEMP_80_C].u[i] = 0x80000000;
731      mach->Temps[TEMP_FF_I].xyzw[TEMP_FF_C].u[i] = 0xFFFFFFFF;
732      mach->Temps[TEMP_1_I].xyzw[TEMP_1_C].f[i] = 1.0f;
733      mach->Temps[TEMP_2_I].xyzw[TEMP_2_C].f[i] = 2.0f;
734      mach->Temps[TEMP_128_I].xyzw[TEMP_128_C].f[i] = 128.0f;
735      mach->Temps[TEMP_M128_I].xyzw[TEMP_M128_C].f[i] = -128.0f;
736      mach->Temps[TEMP_3_I].xyzw[TEMP_3_C].f[i] = 3.0f;
737      mach->Temps[TEMP_HALF_I].xyzw[TEMP_HALF_C].f[i] = 0.5f;
738   }
739
740#ifdef DEBUG
741   /* silence warnings */
742   (void) print_chan;
743   (void) print_temp;
744#endif
745
746   return mach;
747
748fail:
749   align_free(mach);
750   return NULL;
751}
752
753
754void
755tgsi_exec_machine_destroy(struct tgsi_exec_machine *mach)
756{
757   if (mach) {
758      FREE(mach->Instructions);
759      FREE(mach->Declarations);
760   }
761
762   align_free(mach);
763}
764
765static void
766micro_add(
767   union tgsi_exec_channel *dst,
768   const union tgsi_exec_channel *src0,
769   const union tgsi_exec_channel *src1 )
770{
771   dst->f[0] = src0->f[0] + src1->f[0];
772   dst->f[1] = src0->f[1] + src1->f[1];
773   dst->f[2] = src0->f[2] + src1->f[2];
774   dst->f[3] = src0->f[3] + src1->f[3];
775}
776
777static void
778micro_div(
779   union tgsi_exec_channel *dst,
780   const union tgsi_exec_channel *src0,
781   const union tgsi_exec_channel *src1 )
782{
783   if (src1->f[0] != 0) {
784      dst->f[0] = src0->f[0] / src1->f[0];
785   }
786   if (src1->f[1] != 0) {
787      dst->f[1] = src0->f[1] / src1->f[1];
788   }
789   if (src1->f[2] != 0) {
790      dst->f[2] = src0->f[2] / src1->f[2];
791   }
792   if (src1->f[3] != 0) {
793      dst->f[3] = src0->f[3] / src1->f[3];
794   }
795}
796
797static void
798micro_float_clamp(union tgsi_exec_channel *dst,
799                  const union tgsi_exec_channel *src)
800{
801   uint i;
802
803   for (i = 0; i < 4; i++) {
804      if (src->f[i] > 0.0f) {
805         if (src->f[i] > 1.884467e+019f)
806            dst->f[i] = 1.884467e+019f;
807         else if (src->f[i] < 5.42101e-020f)
808            dst->f[i] = 5.42101e-020f;
809         else
810            dst->f[i] = src->f[i];
811      }
812      else {
813         if (src->f[i] < -1.884467e+019f)
814            dst->f[i] = -1.884467e+019f;
815         else if (src->f[i] > -5.42101e-020f)
816            dst->f[i] = -5.42101e-020f;
817         else
818            dst->f[i] = src->f[i];
819      }
820   }
821}
822
823static void
824micro_lt(
825   union tgsi_exec_channel *dst,
826   const union tgsi_exec_channel *src0,
827   const union tgsi_exec_channel *src1,
828   const union tgsi_exec_channel *src2,
829   const union tgsi_exec_channel *src3 )
830{
831   dst->f[0] = src0->f[0] < src1->f[0] ? src2->f[0] : src3->f[0];
832   dst->f[1] = src0->f[1] < src1->f[1] ? src2->f[1] : src3->f[1];
833   dst->f[2] = src0->f[2] < src1->f[2] ? src2->f[2] : src3->f[2];
834   dst->f[3] = src0->f[3] < src1->f[3] ? src2->f[3] : src3->f[3];
835}
836
837static void
838micro_max(
839   union tgsi_exec_channel *dst,
840   const union tgsi_exec_channel *src0,
841   const union tgsi_exec_channel *src1 )
842{
843   dst->f[0] = src0->f[0] > src1->f[0] ? src0->f[0] : src1->f[0];
844   dst->f[1] = src0->f[1] > src1->f[1] ? src0->f[1] : src1->f[1];
845   dst->f[2] = src0->f[2] > src1->f[2] ? src0->f[2] : src1->f[2];
846   dst->f[3] = src0->f[3] > src1->f[3] ? src0->f[3] : src1->f[3];
847}
848
849static void
850micro_min(
851   union tgsi_exec_channel *dst,
852   const union tgsi_exec_channel *src0,
853   const union tgsi_exec_channel *src1 )
854{
855   dst->f[0] = src0->f[0] < src1->f[0] ? src0->f[0] : src1->f[0];
856   dst->f[1] = src0->f[1] < src1->f[1] ? src0->f[1] : src1->f[1];
857   dst->f[2] = src0->f[2] < src1->f[2] ? src0->f[2] : src1->f[2];
858   dst->f[3] = src0->f[3] < src1->f[3] ? src0->f[3] : src1->f[3];
859}
860
861static void
862micro_mul(
863   union tgsi_exec_channel *dst,
864   const union tgsi_exec_channel *src0,
865   const union tgsi_exec_channel *src1 )
866{
867   dst->f[0] = src0->f[0] * src1->f[0];
868   dst->f[1] = src0->f[1] * src1->f[1];
869   dst->f[2] = src0->f[2] * src1->f[2];
870   dst->f[3] = src0->f[3] * src1->f[3];
871}
872
#if 0
/**
 * Disabled.  Signed multiply with two result channels: dst1 receives the
 * per-lane int product, dst0 is simply zeroed (no high word is computed).
 */
static void
micro_imul64(
   union tgsi_exec_channel *dst0,
   union tgsi_exec_channel *dst1,
   const union tgsi_exec_channel *src0,
   const union tgsi_exec_channel *src1 )
{
   unsigned lane;

   for (lane = 0; lane < 4; lane++) {
      dst1->i[lane] = src0->i[lane] * src1->i[lane];
   }
   for (lane = 0; lane < 4; lane++) {
      dst0->i[lane] = 0;
   }
}
#endif
891
#if 0
/**
 * Disabled.  Unsigned multiply with two result channels: dst1 receives the
 * per-lane unsigned product, dst0 is simply zeroed (no high word is
 * computed).
 */
static void
micro_umul64(
   union tgsi_exec_channel *dst0,
   union tgsi_exec_channel *dst1,
   const union tgsi_exec_channel *src0,
   const union tgsi_exec_channel *src1 )
{
   unsigned lane;

   for (lane = 0; lane < 4; lane++) {
      dst1->u[lane] = src0->u[lane] * src1->u[lane];
   }
   for (lane = 0; lane < 4; lane++) {
      dst0->u[lane] = 0;
   }
}
#endif
910
911
#if 0
/**
 * Disabled.  Per lane conditional move on the unsigned view:
 * dst = src0 ? src1 : src2.
 */
static void
micro_movc(
   union tgsi_exec_channel *dst,
   const union tgsi_exec_channel *src0,
   const union tgsi_exec_channel *src1,
   const union tgsi_exec_channel *src2 )
{
   unsigned lane;

   for (lane = 0; lane < 4; lane++) {
      if (src0->u[lane]) {
         dst->u[lane] = src1->u[lane];
      } else {
         dst->u[lane] = src2->u[lane];
      }
   }
}
#endif
926
927static void
928micro_neg(
929   union tgsi_exec_channel *dst,
930   const union tgsi_exec_channel *src )
931{
932   dst->f[0] = -src->f[0];
933   dst->f[1] = -src->f[1];
934   dst->f[2] = -src->f[2];
935   dst->f[3] = -src->f[3];
936}
937
938static void
939micro_pow(
940   union tgsi_exec_channel *dst,
941   const union tgsi_exec_channel *src0,
942   const union tgsi_exec_channel *src1 )
943{
944#if FAST_MATH
945   dst->f[0] = util_fast_pow( src0->f[0], src1->f[0] );
946   dst->f[1] = util_fast_pow( src0->f[1], src1->f[1] );
947   dst->f[2] = util_fast_pow( src0->f[2], src1->f[2] );
948   dst->f[3] = util_fast_pow( src0->f[3], src1->f[3] );
949#else
950   dst->f[0] = powf( src0->f[0], src1->f[0] );
951   dst->f[1] = powf( src0->f[1], src1->f[1] );
952   dst->f[2] = powf( src0->f[2], src1->f[2] );
953   dst->f[3] = powf( src0->f[3], src1->f[3] );
954#endif
955}
956
957static void
958micro_sqrt( union tgsi_exec_channel *dst,
959            const union tgsi_exec_channel *src )
960{
961   dst->f[0] = sqrtf( src->f[0] );
962   dst->f[1] = sqrtf( src->f[1] );
963   dst->f[2] = sqrtf( src->f[2] );
964   dst->f[3] = sqrtf( src->f[3] );
965}
966
967static void
968micro_sub(
969   union tgsi_exec_channel *dst,
970   const union tgsi_exec_channel *src0,
971   const union tgsi_exec_channel *src1 )
972{
973   dst->f[0] = src0->f[0] - src1->f[0];
974   dst->f[1] = src0->f[1] - src1->f[1];
975   dst->f[2] = src0->f[2] - src1->f[2];
976   dst->f[3] = src0->f[3] - src1->f[3];
977}
978
979static void
980fetch_src_file_channel(const struct tgsi_exec_machine *mach,
981                       const uint file,
982                       const uint swizzle,
983                       const union tgsi_exec_channel *index,
984                       const union tgsi_exec_channel *index2D,
985                       union tgsi_exec_channel *chan)
986{
987   uint i;
988
989   switch (file) {
990   case TGSI_FILE_CONSTANT:
991      for (i = 0; i < QUAD_SIZE; i++) {
992         assert(index2D->i[i] >= 0 && index2D->i[i] < PIPE_MAX_CONSTANT_BUFFERS);
993         assert(mach->Consts[index2D->i[i]]);
994
995         if (index->i[i] < 0) {
996            chan->u[i] = 0;
997         } else {
998            const uint *p = (const uint *)mach->Consts[index2D->i[i]];
999
1000            chan->u[i] = p[index->i[i] * 4 + swizzle];
1001         }
1002      }
1003      break;
1004
1005   case TGSI_FILE_INPUT:
1006   case TGSI_FILE_SYSTEM_VALUE:
1007      for (i = 0; i < QUAD_SIZE; i++) {
1008         /* XXX: 2D indexing */
1009         chan->u[i] = mach->Inputs[index2D->i[i] * TGSI_EXEC_MAX_INPUT_ATTRIBS + index->i[i]].xyzw[swizzle].u[i];
1010      }
1011      break;
1012
1013   case TGSI_FILE_TEMPORARY:
1014      for (i = 0; i < QUAD_SIZE; i++) {
1015         assert(index->i[i] < TGSI_EXEC_NUM_TEMPS);
1016         assert(index2D->i[i] == 0);
1017
1018         chan->u[i] = mach->Temps[index->i[i]].xyzw[swizzle].u[i];
1019      }
1020      break;
1021
1022   case TGSI_FILE_IMMEDIATE:
1023      for (i = 0; i < QUAD_SIZE; i++) {
1024         assert(index->i[i] >= 0 && index->i[i] < (int)mach->ImmLimit);
1025         assert(index2D->i[i] == 0);
1026
1027         chan->f[i] = mach->Imms[index->i[i]][swizzle];
1028      }
1029      break;
1030
1031   case TGSI_FILE_ADDRESS:
1032      for (i = 0; i < QUAD_SIZE; i++) {
1033         assert(index->i[i] >= 0);
1034         assert(index2D->i[i] == 0);
1035
1036         chan->u[i] = mach->Addrs[index->i[i]].xyzw[swizzle].u[i];
1037      }
1038      break;
1039
1040   case TGSI_FILE_PREDICATE:
1041      for (i = 0; i < QUAD_SIZE; i++) {
1042         assert(index->i[i] >= 0 && index->i[i] < TGSI_EXEC_NUM_PREDS);
1043         assert(index2D->i[i] == 0);
1044
1045         chan->u[i] = mach->Predicates[0].xyzw[swizzle].u[i];
1046      }
1047      break;
1048
1049   case TGSI_FILE_OUTPUT:
1050      /* vertex/fragment output vars can be read too */
1051      for (i = 0; i < QUAD_SIZE; i++) {
1052         assert(index->i[i] >= 0);
1053         assert(index2D->i[i] == 0);
1054
1055         chan->u[i] = mach->Outputs[index->i[i]].xyzw[swizzle].u[i];
1056      }
1057      break;
1058
1059   default:
1060      assert(0);
1061      for (i = 0; i < QUAD_SIZE; i++) {
1062         chan->u[i] = 0;
1063      }
1064   }
1065}
1066
1067static void
1068fetch_source(const struct tgsi_exec_machine *mach,
1069             union tgsi_exec_channel *chan,
1070             const struct tgsi_full_src_register *reg,
1071             const uint chan_index,
1072             enum tgsi_exec_datatype src_datatype)
1073{
1074   union tgsi_exec_channel index;
1075   union tgsi_exec_channel index2D;
1076   uint swizzle;
1077
1078   /* We start with a direct index into a register file.
1079    *
1080    *    file[1],
1081    *    where:
1082    *       file = Register.File
1083    *       [1] = Register.Index
1084    */
1085   index.i[0] =
1086   index.i[1] =
1087   index.i[2] =
1088   index.i[3] = reg->Register.Index;
1089
1090   /* There is an extra source register that indirectly subscripts
1091    * a register file. The direct index now becomes an offset
1092    * that is being added to the indirect register.
1093    *
1094    *    file[ind[2].x+1],
1095    *    where:
1096    *       ind = Indirect.File
1097    *       [2] = Indirect.Index
1098    *       .x = Indirect.SwizzleX
1099    */
1100   if (reg->Register.Indirect) {
1101      union tgsi_exec_channel index2;
1102      union tgsi_exec_channel indir_index;
1103      const uint execmask = mach->ExecMask;
1104      uint i;
1105
1106      /* which address register (always zero now) */
1107      index2.i[0] =
1108      index2.i[1] =
1109      index2.i[2] =
1110      index2.i[3] = reg->Indirect.Index;
1111
1112      /* get current value of address register[swizzle] */
1113      swizzle = tgsi_util_get_src_register_swizzle( &reg->Indirect, CHAN_X );
1114      fetch_src_file_channel(mach,
1115                             reg->Indirect.File,
1116                             swizzle,
1117                             &index2,
1118                             &ZeroVec,
1119                             &indir_index);
1120
1121      /* add value of address register to the offset */
1122      index.i[0] += indir_index.i[0];
1123      index.i[1] += indir_index.i[1];
1124      index.i[2] += indir_index.i[2];
1125      index.i[3] += indir_index.i[3];
1126
1127      /* for disabled execution channels, zero-out the index to
1128       * avoid using a potential garbage value.
1129       */
1130      for (i = 0; i < QUAD_SIZE; i++) {
1131         if ((execmask & (1 << i)) == 0)
1132            index.i[i] = 0;
1133      }
1134   }
1135
1136   /* There is an extra source register that is a second
1137    * subscript to a register file. Effectively it means that
1138    * the register file is actually a 2D array of registers.
1139    *
1140    *    file[3][1],
1141    *    where:
1142    *       [3] = Dimension.Index
1143    */
1144   if (reg->Register.Dimension) {
1145      index2D.i[0] =
1146      index2D.i[1] =
1147      index2D.i[2] =
1148      index2D.i[3] = reg->Dimension.Index;
1149
1150      /* Again, the second subscript index can be addressed indirectly
1151       * identically to the first one.
1152       * Nothing stops us from indirectly addressing the indirect register,
1153       * but there is no need for that, so we won't exercise it.
1154       *
1155       *    file[ind[4].y+3][1],
1156       *    where:
1157       *       ind = DimIndirect.File
1158       *       [4] = DimIndirect.Index
1159       *       .y = DimIndirect.SwizzleX
1160       */
1161      if (reg->Dimension.Indirect) {
1162         union tgsi_exec_channel index2;
1163         union tgsi_exec_channel indir_index;
1164         const uint execmask = mach->ExecMask;
1165         uint i;
1166
1167         index2.i[0] =
1168         index2.i[1] =
1169         index2.i[2] =
1170         index2.i[3] = reg->DimIndirect.Index;
1171
1172         swizzle = tgsi_util_get_src_register_swizzle( &reg->DimIndirect, CHAN_X );
1173         fetch_src_file_channel(mach,
1174                                reg->DimIndirect.File,
1175                                swizzle,
1176                                &index2,
1177                                &ZeroVec,
1178                                &indir_index);
1179
1180         index2D.i[0] += indir_index.i[0];
1181         index2D.i[1] += indir_index.i[1];
1182         index2D.i[2] += indir_index.i[2];
1183         index2D.i[3] += indir_index.i[3];
1184
1185         /* for disabled execution channels, zero-out the index to
1186          * avoid using a potential garbage value.
1187          */
1188         for (i = 0; i < QUAD_SIZE; i++) {
1189            if ((execmask & (1 << i)) == 0) {
1190               index2D.i[i] = 0;
1191            }
1192         }
1193      }
1194
1195      /* If by any chance there was a need for a 3D array of register
1196       * files, we would have to check whether Dimension is followed
1197       * by a dimension register and continue the saga.
1198       */
1199   } else {
1200      index2D.i[0] =
1201      index2D.i[1] =
1202      index2D.i[2] =
1203      index2D.i[3] = 0;
1204   }
1205
1206   swizzle = tgsi_util_get_full_src_register_swizzle( reg, chan_index );
1207   fetch_src_file_channel(mach,
1208                          reg->Register.File,
1209                          swizzle,
1210                          &index,
1211                          &index2D,
1212                          chan);
1213
1214   if (reg->Register.Absolute) {
1215      if (src_datatype == TGSI_EXEC_DATA_FLOAT) {
1216         micro_abs(chan, chan);
1217      } else {
1218         micro_iabs(chan, chan);
1219      }
1220   }
1221
1222   if (reg->Register.Negate) {
1223      if (src_datatype == TGSI_EXEC_DATA_FLOAT) {
1224         micro_neg(chan, chan);
1225      } else {
1226         micro_ineg(chan, chan);
1227      }
1228   }
1229}
1230
1231static void
1232store_dest(struct tgsi_exec_machine *mach,
1233           const union tgsi_exec_channel *chan,
1234           const struct tgsi_full_dst_register *reg,
1235           const struct tgsi_full_instruction *inst,
1236           uint chan_index,
1237           enum tgsi_exec_datatype dst_datatype)
1238{
1239   uint i;
1240   union tgsi_exec_channel null;
1241   union tgsi_exec_channel *dst;
1242   uint execmask = mach->ExecMask;
1243   int offset = 0;  /* indirection offset */
1244   int index;
1245
1246   /* for debugging */
1247   if (0 && dst_datatype == TGSI_EXEC_DATA_FLOAT) {
1248      check_inf_or_nan(chan);
1249   }
1250
1251   /* There is an extra source register that indirectly subscripts
1252    * a register file. The direct index now becomes an offset
1253    * that is being added to the indirect register.
1254    *
1255    *    file[ind[2].x+1],
1256    *    where:
1257    *       ind = Indirect.File
1258    *       [2] = Indirect.Index
1259    *       .x = Indirect.SwizzleX
1260    */
1261   if (reg->Register.Indirect) {
1262      union tgsi_exec_channel index;
1263      union tgsi_exec_channel indir_index;
1264      uint swizzle;
1265
1266      /* which address register (always zero for now) */
1267      index.i[0] =
1268      index.i[1] =
1269      index.i[2] =
1270      index.i[3] = reg->Indirect.Index;
1271
1272      /* get current value of address register[swizzle] */
1273      swizzle = tgsi_util_get_src_register_swizzle( &reg->Indirect, CHAN_X );
1274
1275      /* fetch values from the address/indirection register */
1276      fetch_src_file_channel(mach,
1277                             reg->Indirect.File,
1278                             swizzle,
1279                             &index,
1280                             &ZeroVec,
1281                             &indir_index);
1282
1283      /* save indirection offset */
1284      offset = indir_index.i[0];
1285   }
1286
1287   switch (reg->Register.File) {
1288   case TGSI_FILE_NULL:
1289      dst = &null;
1290      break;
1291
1292   case TGSI_FILE_OUTPUT:
1293      index = mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0]
1294         + reg->Register.Index;
1295      dst = &mach->Outputs[offset + index].xyzw[chan_index];
1296#if 0
1297      if (TGSI_PROCESSOR_GEOMETRY == mach->Processor) {
1298         fprintf(stderr, "STORING OUT[%d] mask(%d), = (", offset + index, execmask);
1299         for (i = 0; i < QUAD_SIZE; i++)
1300            if (execmask & (1 << i))
1301               fprintf(stderr, "%f, ", chan->f[i]);
1302         fprintf(stderr, ")\n");
1303      }
1304#endif
1305      break;
1306
1307   case TGSI_FILE_TEMPORARY:
1308      index = reg->Register.Index;
1309      assert( index < TGSI_EXEC_NUM_TEMPS );
1310      dst = &mach->Temps[offset + index].xyzw[chan_index];
1311      break;
1312
1313   case TGSI_FILE_ADDRESS:
1314      index = reg->Register.Index;
1315      dst = &mach->Addrs[index].xyzw[chan_index];
1316      break;
1317
1318   case TGSI_FILE_LOOP:
1319      assert(reg->Register.Index == 0);
1320      assert(mach->LoopCounterStackTop > 0);
1321      assert(chan_index == CHAN_X);
1322      dst = &mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[chan_index];
1323      break;
1324
1325   case TGSI_FILE_PREDICATE:
1326      index = reg->Register.Index;
1327      assert(index < TGSI_EXEC_NUM_PREDS);
1328      dst = &mach->Predicates[index].xyzw[chan_index];
1329      break;
1330
1331   default:
1332      assert( 0 );
1333      return;
1334   }
1335
1336   if (inst->Instruction.Predicate) {
1337      uint swizzle;
1338      union tgsi_exec_channel *pred;
1339
1340      switch (chan_index) {
1341      case CHAN_X:
1342         swizzle = inst->Predicate.SwizzleX;
1343         break;
1344      case CHAN_Y:
1345         swizzle = inst->Predicate.SwizzleY;
1346         break;
1347      case CHAN_Z:
1348         swizzle = inst->Predicate.SwizzleZ;
1349         break;
1350      case CHAN_W:
1351         swizzle = inst->Predicate.SwizzleW;
1352         break;
1353      default:
1354         assert(0);
1355         return;
1356      }
1357
1358      assert(inst->Predicate.Index == 0);
1359
1360      pred = &mach->Predicates[inst->Predicate.Index].xyzw[swizzle];
1361
1362      if (inst->Predicate.Negate) {
1363         for (i = 0; i < QUAD_SIZE; i++) {
1364            if (pred->u[i]) {
1365               execmask &= ~(1 << i);
1366            }
1367         }
1368      } else {
1369         for (i = 0; i < QUAD_SIZE; i++) {
1370            if (!pred->u[i]) {
1371               execmask &= ~(1 << i);
1372            }
1373         }
1374      }
1375   }
1376
1377   switch (inst->Instruction.Saturate) {
1378   case TGSI_SAT_NONE:
1379      for (i = 0; i < QUAD_SIZE; i++)
1380         if (execmask & (1 << i))
1381            dst->i[i] = chan->i[i];
1382      break;
1383
1384   case TGSI_SAT_ZERO_ONE:
1385      for (i = 0; i < QUAD_SIZE; i++)
1386         if (execmask & (1 << i)) {
1387            if (chan->f[i] < 0.0f)
1388               dst->f[i] = 0.0f;
1389            else if (chan->f[i] > 1.0f)
1390               dst->f[i] = 1.0f;
1391            else
1392               dst->i[i] = chan->i[i];
1393         }
1394      break;
1395
1396   case TGSI_SAT_MINUS_PLUS_ONE:
1397      for (i = 0; i < QUAD_SIZE; i++)
1398         if (execmask & (1 << i)) {
1399            if (chan->f[i] < -1.0f)
1400               dst->f[i] = -1.0f;
1401            else if (chan->f[i] > 1.0f)
1402               dst->f[i] = 1.0f;
1403            else
1404               dst->i[i] = chan->i[i];
1405         }
1406      break;
1407
1408   default:
1409      assert( 0 );
1410   }
1411}
1412
/**
 * Shorthands for reading a source operand / writing a destination operand
 * of the current instruction as float data.
 *
 * Both macros expand to code that refers to variables named `mach` and
 * `inst`, so those must be in scope wherever the macros are used.
 *
 *    VAL    pointer to the channel to fill (FETCH) or to store (STORE)
 *    INDEX  index into inst->Src[] (FETCH) or inst->Dst[] (STORE)
 *    CHAN   channel index passed through to fetch_source / store_dest
 */
#define FETCH(VAL,INDEX,CHAN)\
    fetch_source(mach, VAL, &inst->Src[INDEX], CHAN, TGSI_EXEC_DATA_FLOAT)

#define STORE(VAL,INDEX,CHAN)\
   store_dest(mach, VAL, &inst->Dst[INDEX], inst, CHAN, TGSI_EXEC_DATA_FLOAT)
1418
1419
1420/**
1421 * Execute ARB-style KIL which is predicated by a src register.
1422 * Kill fragment if any of the four values is less than zero.
1423 */
1424static void
1425exec_kil(struct tgsi_exec_machine *mach,
1426         const struct tgsi_full_instruction *inst)
1427{
1428   uint uniquemask;
1429   uint chan_index;
1430   uint kilmask = 0; /* bit 0 = pixel 0, bit 1 = pixel 1, etc */
1431   union tgsi_exec_channel r[1];
1432
1433   /* This mask stores component bits that were already tested. */
1434   uniquemask = 0;
1435
1436   for (chan_index = 0; chan_index < 4; chan_index++)
1437   {
1438      uint swizzle;
1439      uint i;
1440
1441      /* unswizzle channel */
1442      swizzle = tgsi_util_get_full_src_register_swizzle (
1443                        &inst->Src[0],
1444                        chan_index);
1445
1446      /* check if the component has not been already tested */
1447      if (uniquemask & (1 << swizzle))
1448         continue;
1449      uniquemask |= 1 << swizzle;
1450
1451      FETCH(&r[0], 0, chan_index);
1452      for (i = 0; i < 4; i++)
1453         if (r[0].f[i] < 0.0f)
1454            kilmask |= 1 << i;
1455   }
1456
1457   mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] |= kilmask;
1458}
1459
1460/**
1461 * Execute NVIDIA-style KIL which is predicated by a condition code.
1462 * Kill fragment if the condition code is TRUE.
1463 */
1464static void
1465exec_kilp(struct tgsi_exec_machine *mach,
1466          const struct tgsi_full_instruction *inst)
1467{
1468   uint kilmask; /* bit 0 = pixel 0, bit 1 = pixel 1, etc */
1469
1470   /* "unconditional" kil */
1471   kilmask = mach->ExecMask;
1472   mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] |= kilmask;
1473}
1474
1475static void
1476emit_vertex(struct tgsi_exec_machine *mach)
1477{
1478   /* FIXME: check for exec mask correctly
1479   unsigned i;
1480   for (i = 0; i < QUAD_SIZE; ++i) {
1481         if ((mach->ExecMask & (1 << i)))
1482   */
1483   if (mach->ExecMask) {
1484      mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] += mach->NumOutputs;
1485      mach->Primitives[mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]]++;
1486   }
1487}
1488
1489static void
1490emit_primitive(struct tgsi_exec_machine *mach)
1491{
1492   unsigned *prim_count = &mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0];
1493   /* FIXME: check for exec mask correctly
1494   unsigned i;
1495   for (i = 0; i < QUAD_SIZE; ++i) {
1496         if ((mach->ExecMask & (1 << i)))
1497   */
1498   if (mach->ExecMask) {
1499      ++(*prim_count);
1500      debug_assert((*prim_count * mach->NumOutputs) < mach->MaxGeometryShaderOutputs);
1501      mach->Primitives[*prim_count] = 0;
1502   }
1503}
1504
1505/*
1506 * Fetch four texture samples using STR texture coordinates.
1507 */
1508static void
1509fetch_texel( struct tgsi_sampler *sampler,
1510             const union tgsi_exec_channel *s,
1511             const union tgsi_exec_channel *t,
1512             const union tgsi_exec_channel *p,
1513             const union tgsi_exec_channel *c0,
1514             enum tgsi_sampler_control control,
1515             union tgsi_exec_channel *r,
1516             union tgsi_exec_channel *g,
1517             union tgsi_exec_channel *b,
1518             union tgsi_exec_channel *a )
1519{
1520   uint j;
1521   float rgba[NUM_CHANNELS][QUAD_SIZE];
1522
1523   sampler->get_samples(sampler, s->f, t->f, p->f, c0->f, control, rgba);
1524
1525   for (j = 0; j < 4; j++) {
1526      r->f[j] = rgba[0][j];
1527      g->f[j] = rgba[1][j];
1528      b->f[j] = rgba[2][j];
1529      a->f[j] = rgba[3][j];
1530   }
1531}
1532
1533
/*
 * Values for the `modifier` argument of exec_tex(), describing how the W
 * component of the texture coordinate operand is used:
 *
 *    NONE          W is not fetched
 *    PROJECTED     the fetched coordinates are divided by W
 *    LOD_BIAS      W is passed to the sampler together with the
 *                  tgsi_sampler_lod_bias control
 *    EXPLICIT_LOD  W is passed to the sampler together with the
 *                  tgsi_sampler_lod_explicit control
 */
#define TEX_MODIFIER_NONE           0
#define TEX_MODIFIER_PROJECTED      1
#define TEX_MODIFIER_LOD_BIAS       2
#define TEX_MODIFIER_EXPLICIT_LOD   3
1538
1539
1540static void
1541exec_tex(struct tgsi_exec_machine *mach,
1542         const struct tgsi_full_instruction *inst,
1543         uint modifier)
1544{
1545   const uint unit = inst->Src[1].Register.Index;
1546   union tgsi_exec_channel r[4];
1547   const union tgsi_exec_channel *lod = &ZeroVec;
1548   enum tgsi_sampler_control control;
1549   uint chan_index;
1550
1551   if (modifier != TEX_MODIFIER_NONE) {
1552      FETCH(&r[3], 0, CHAN_W);
1553      if (modifier != TEX_MODIFIER_PROJECTED) {
1554         lod = &r[3];
1555      }
1556   }
1557
1558   if (modifier == TEX_MODIFIER_EXPLICIT_LOD) {
1559      control = tgsi_sampler_lod_explicit;
1560   } else {
1561      control = tgsi_sampler_lod_bias;
1562   }
1563
1564   switch (inst->Texture.Texture) {
1565   case TGSI_TEXTURE_1D:
1566   case TGSI_TEXTURE_SHADOW1D:
1567      FETCH(&r[0], 0, CHAN_X);
1568
1569      if (modifier == TEX_MODIFIER_PROJECTED) {
1570         micro_div(&r[0], &r[0], &r[3]);
1571      }
1572
1573      fetch_texel(mach->Samplers[unit],
1574                  &r[0], &ZeroVec, &ZeroVec, lod,  /* S, T, P, LOD */
1575                  control,
1576                  &r[0], &r[1], &r[2], &r[3]);     /* R, G, B, A */
1577      break;
1578
1579   case TGSI_TEXTURE_2D:
1580   case TGSI_TEXTURE_RECT:
1581   case TGSI_TEXTURE_SHADOW2D:
1582   case TGSI_TEXTURE_SHADOWRECT:
1583      FETCH(&r[0], 0, CHAN_X);
1584      FETCH(&r[1], 0, CHAN_Y);
1585      FETCH(&r[2], 0, CHAN_Z);
1586
1587      if (modifier == TEX_MODIFIER_PROJECTED) {
1588         micro_div(&r[0], &r[0], &r[3]);
1589         micro_div(&r[1], &r[1], &r[3]);
1590         micro_div(&r[2], &r[2], &r[3]);
1591      }
1592
1593      fetch_texel(mach->Samplers[unit],
1594                  &r[0], &r[1], &r[2], lod,     /* S, T, P, LOD */
1595                  control,
1596                  &r[0], &r[1], &r[2], &r[3]);  /* outputs */
1597      break;
1598
1599   case TGSI_TEXTURE_3D:
1600   case TGSI_TEXTURE_CUBE:
1601      FETCH(&r[0], 0, CHAN_X);
1602      FETCH(&r[1], 0, CHAN_Y);
1603      FETCH(&r[2], 0, CHAN_Z);
1604
1605      if (modifier == TEX_MODIFIER_PROJECTED) {
1606         micro_div(&r[0], &r[0], &r[3]);
1607         micro_div(&r[1], &r[1], &r[3]);
1608         micro_div(&r[2], &r[2], &r[3]);
1609      }
1610
1611      fetch_texel(mach->Samplers[unit],
1612                  &r[0], &r[1], &r[2], lod,
1613                  control,
1614                  &r[0], &r[1], &r[2], &r[3]);
1615      break;
1616
1617   default:
1618      assert(0);
1619   }
1620
1621   FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
1622      STORE(&r[chan_index], 0, chan_index);
1623   }
1624}
1625
1626static void
1627exec_txd(struct tgsi_exec_machine *mach,
1628         const struct tgsi_full_instruction *inst)
1629{
1630   const uint unit = inst->Src[3].Register.Index;
1631   union tgsi_exec_channel r[4];
1632   uint chan_index;
1633
1634   /*
1635    * XXX: This is fake TXD -- the derivatives are not taken into account, yet.
1636    */
1637
1638   switch (inst->Texture.Texture) {
1639   case TGSI_TEXTURE_1D:
1640   case TGSI_TEXTURE_SHADOW1D:
1641
1642      FETCH(&r[0], 0, CHAN_X);
1643
1644      fetch_texel(mach->Samplers[unit],
1645                  &r[0], &ZeroVec, &ZeroVec, &ZeroVec,   /* S, T, P, BIAS */
1646                  tgsi_sampler_lod_bias,
1647                  &r[0], &r[1], &r[2], &r[3]);           /* R, G, B, A */
1648      break;
1649
1650   case TGSI_TEXTURE_2D:
1651   case TGSI_TEXTURE_RECT:
1652   case TGSI_TEXTURE_SHADOW2D:
1653   case TGSI_TEXTURE_SHADOWRECT:
1654
1655      FETCH(&r[0], 0, CHAN_X);
1656      FETCH(&r[1], 0, CHAN_Y);
1657      FETCH(&r[2], 0, CHAN_Z);
1658
1659      fetch_texel(mach->Samplers[unit],
1660                  &r[0], &r[1], &r[2], &ZeroVec,   /* inputs */
1661                  tgsi_sampler_lod_bias,
1662                  &r[0], &r[1], &r[2], &r[3]);     /* outputs */
1663      break;
1664
1665   case TGSI_TEXTURE_3D:
1666   case TGSI_TEXTURE_CUBE:
1667
1668      FETCH(&r[0], 0, CHAN_X);
1669      FETCH(&r[1], 0, CHAN_Y);
1670      FETCH(&r[2], 0, CHAN_Z);
1671
1672      fetch_texel(mach->Samplers[unit],
1673                  &r[0], &r[1], &r[2], &ZeroVec,
1674                  tgsi_sampler_lod_bias,
1675                  &r[0], &r[1], &r[2], &r[3]);
1676      break;
1677
1678   default:
1679      assert(0);
1680   }
1681
1682   FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
1683      STORE(&r[chan_index], 0, chan_index);
1684   }
1685}
1686
1687
1688/**
1689 * Evaluate a constant-valued coefficient at the position of the
1690 * current quad.
1691 */
1692static void
1693eval_constant_coef(
1694   struct tgsi_exec_machine *mach,
1695   unsigned attrib,
1696   unsigned chan )
1697{
1698   unsigned i;
1699
1700   for( i = 0; i < QUAD_SIZE; i++ ) {
1701      mach->Inputs[attrib].xyzw[chan].f[i] = mach->InterpCoefs[attrib].a0[chan];
1702   }
1703}
1704
1705/**
1706 * Evaluate a linear-valued coefficient at the position of the
1707 * current quad.
1708 */
1709static void
1710eval_linear_coef(
1711   struct tgsi_exec_machine *mach,
1712   unsigned attrib,
1713   unsigned chan )
1714{
1715   const float x = mach->QuadPos.xyzw[0].f[0];
1716   const float y = mach->QuadPos.xyzw[1].f[0];
1717   const float dadx = mach->InterpCoefs[attrib].dadx[chan];
1718   const float dady = mach->InterpCoefs[attrib].dady[chan];
1719   const float a0 = mach->InterpCoefs[attrib].a0[chan] + dadx * x + dady * y;
1720   mach->Inputs[attrib].xyzw[chan].f[0] = a0;
1721   mach->Inputs[attrib].xyzw[chan].f[1] = a0 + dadx;
1722   mach->Inputs[attrib].xyzw[chan].f[2] = a0 + dady;
1723   mach->Inputs[attrib].xyzw[chan].f[3] = a0 + dadx + dady;
1724}
1725
1726/**
1727 * Evaluate a perspective-valued coefficient at the position of the
1728 * current quad.
1729 */
1730static void
1731eval_perspective_coef(
1732   struct tgsi_exec_machine *mach,
1733   unsigned attrib,
1734   unsigned chan )
1735{
1736   const float x = mach->QuadPos.xyzw[0].f[0];
1737   const float y = mach->QuadPos.xyzw[1].f[0];
1738   const float dadx = mach->InterpCoefs[attrib].dadx[chan];
1739   const float dady = mach->InterpCoefs[attrib].dady[chan];
1740   const float a0 = mach->InterpCoefs[attrib].a0[chan] + dadx * x + dady * y;
1741   const float *w = mach->QuadPos.xyzw[3].f;
1742   /* divide by W here */
1743   mach->Inputs[attrib].xyzw[chan].f[0] = a0 / w[0];
1744   mach->Inputs[attrib].xyzw[chan].f[1] = (a0 + dadx) / w[1];
1745   mach->Inputs[attrib].xyzw[chan].f[2] = (a0 + dady) / w[2];
1746   mach->Inputs[attrib].xyzw[chan].f[3] = (a0 + dadx + dady) / w[3];
1747}
1748
1749
/**
 * Signature shared by the eval_*_coef() interpolation helpers: evaluate
 * channel `chan` of input attribute `attrib` for the current quad and
 * store the result in the machine's input registers.
 */
typedef void (* eval_coef_func)(
   struct tgsi_exec_machine *mach,
   unsigned attrib,
   unsigned chan );
1754
1755static void
1756exec_declaration(struct tgsi_exec_machine *mach,
1757                 const struct tgsi_full_declaration *decl)
1758{
1759   if (mach->Processor == TGSI_PROCESSOR_FRAGMENT) {
1760      if (decl->Declaration.File == TGSI_FILE_INPUT ||
1761          decl->Declaration.File == TGSI_FILE_SYSTEM_VALUE) {
1762         uint first, last, mask;
1763
1764         first = decl->Range.First;
1765         last = decl->Range.Last;
1766         mask = decl->Declaration.UsageMask;
1767
1768         if (decl->Semantic.Name == TGSI_SEMANTIC_FACE) {
1769            uint i;
1770
1771            assert(decl->Semantic.Index == 0);
1772            assert(first == last);
1773
1774            for (i = 0; i < QUAD_SIZE; i++) {
1775               mach->Inputs[first].xyzw[0].f[i] = mach->Face;
1776            }
1777         } else {
1778            eval_coef_func eval;
1779            uint i, j;
1780
1781            switch (decl->Declaration.Interpolate) {
1782            case TGSI_INTERPOLATE_CONSTANT:
1783               eval = eval_constant_coef;
1784               break;
1785
1786            case TGSI_INTERPOLATE_LINEAR:
1787               eval = eval_linear_coef;
1788               break;
1789
1790            case TGSI_INTERPOLATE_PERSPECTIVE:
1791               eval = eval_perspective_coef;
1792               break;
1793
1794            default:
1795               assert(0);
1796               return;
1797            }
1798
1799            for (j = 0; j < NUM_CHANNELS; j++) {
1800               if (mask & (1 << j)) {
1801                  for (i = first; i <= last; i++) {
1802                     eval(mach, i, j);
1803                  }
1804               }
1805            }
1806         }
1807      }
1808   }
1809}
1810
/**
 * Per-channel operation callback used by the exec_scalar_unary() and
 * exec_vector_*() helpers.
 *
 * For unary helpers `src` points to a single channel; the binary and
 * trinary helpers pass a pointer to the first element of an array of
 * two or three fetched source channels.
 */
typedef void (* micro_op)(union tgsi_exec_channel *dst,
                          const union tgsi_exec_channel *src);
1813
1814static void
1815exec_scalar_unary(struct tgsi_exec_machine *mach,
1816                  const struct tgsi_full_instruction *inst,
1817                  micro_op op,
1818                  enum tgsi_exec_datatype dst_datatype,
1819                  enum tgsi_exec_datatype src_datatype)
1820{
1821   unsigned int chan;
1822   union tgsi_exec_channel src;
1823   union tgsi_exec_channel dst;
1824
1825   fetch_source(mach, &src, &inst->Src[0], CHAN_X, src_datatype);
1826   op(&dst, &src);
1827   for (chan = 0; chan < NUM_CHANNELS; chan++) {
1828      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
1829         store_dest(mach, &dst, &inst->Dst[0], inst, chan, dst_datatype);
1830      }
1831   }
1832}
1833
1834static void
1835exec_vector_unary(struct tgsi_exec_machine *mach,
1836                  const struct tgsi_full_instruction *inst,
1837                  micro_op op,
1838                  enum tgsi_exec_datatype dst_datatype,
1839                  enum tgsi_exec_datatype src_datatype)
1840{
1841   unsigned int chan;
1842   struct tgsi_exec_vector dst;
1843
1844   for (chan = 0; chan < NUM_CHANNELS; chan++) {
1845      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
1846         union tgsi_exec_channel src;
1847
1848         fetch_source(mach, &src, &inst->Src[0], chan, src_datatype);
1849         op(&dst.xyzw[chan], &src);
1850      }
1851   }
1852   for (chan = 0; chan < NUM_CHANNELS; chan++) {
1853      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
1854         store_dest(mach, &dst.xyzw[chan], &inst->Dst[0], inst, chan, dst_datatype);
1855      }
1856   }
1857}
1858
1859static void
1860exec_vector_binary(struct tgsi_exec_machine *mach,
1861                   const struct tgsi_full_instruction *inst,
1862                   micro_op op,
1863                   enum tgsi_exec_datatype dst_datatype,
1864                   enum tgsi_exec_datatype src_datatype)
1865{
1866   unsigned int chan;
1867   struct tgsi_exec_vector dst;
1868
1869   for (chan = 0; chan < NUM_CHANNELS; chan++) {
1870      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
1871         union tgsi_exec_channel src[2];
1872
1873         fetch_source(mach, &src[0], &inst->Src[0], chan, src_datatype);
1874         fetch_source(mach, &src[1], &inst->Src[1], chan, src_datatype);
1875         op(&dst.xyzw[chan], src);
1876      }
1877   }
1878   for (chan = 0; chan < NUM_CHANNELS; chan++) {
1879      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
1880         store_dest(mach, &dst.xyzw[chan], &inst->Dst[0], inst, chan, dst_datatype);
1881      }
1882   }
1883}
1884
1885static void
1886exec_vector_trinary(struct tgsi_exec_machine *mach,
1887                    const struct tgsi_full_instruction *inst,
1888                    micro_op op,
1889                    enum tgsi_exec_datatype dst_datatype,
1890                    enum tgsi_exec_datatype src_datatype)
1891{
1892   unsigned int chan;
1893   struct tgsi_exec_vector dst;
1894
1895   for (chan = 0; chan < NUM_CHANNELS; chan++) {
1896      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
1897         union tgsi_exec_channel src[3];
1898
1899         fetch_source(mach, &src[0], &inst->Src[0], chan, src_datatype);
1900         fetch_source(mach, &src[1], &inst->Src[1], chan, src_datatype);
1901         fetch_source(mach, &src[2], &inst->Src[2], chan, src_datatype);
1902         op(&dst.xyzw[chan], src);
1903      }
1904   }
1905   for (chan = 0; chan < NUM_CHANNELS; chan++) {
1906      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
1907         store_dest(mach, &dst.xyzw[chan], &inst->Dst[0], inst, chan, dst_datatype);
1908      }
1909   }
1910}
1911
1912static void
1913exec_dp3(struct tgsi_exec_machine *mach,
1914         const struct tgsi_full_instruction *inst)
1915{
1916   unsigned int chan;
1917   union tgsi_exec_channel arg[3];
1918
1919   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
1920   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_X, TGSI_EXEC_DATA_FLOAT);
1921   micro_mul(&arg[2], &arg[0], &arg[1]);
1922
1923   for (chan = CHAN_Y; chan <= CHAN_Z; chan++) {
1924      fetch_source(mach, &arg[0], &inst->Src[0], chan, TGSI_EXEC_DATA_FLOAT);
1925      fetch_source(mach, &arg[1], &inst->Src[1], chan, TGSI_EXEC_DATA_FLOAT);
1926      micro_mad(&arg[2], arg);
1927   }
1928
1929   for (chan = 0; chan < NUM_CHANNELS; chan++) {
1930      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
1931         store_dest(mach, &arg[2], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
1932      }
1933   }
1934}
1935
1936static void
1937exec_dp4(struct tgsi_exec_machine *mach,
1938         const struct tgsi_full_instruction *inst)
1939{
1940   unsigned int chan;
1941   union tgsi_exec_channel arg[3];
1942
1943   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
1944   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_X, TGSI_EXEC_DATA_FLOAT);
1945   micro_mul(&arg[2], &arg[0], &arg[1]);
1946
1947   for (chan = CHAN_Y; chan <= CHAN_W; chan++) {
1948      fetch_source(mach, &arg[0], &inst->Src[0], chan, TGSI_EXEC_DATA_FLOAT);
1949      fetch_source(mach, &arg[1], &inst->Src[1], chan, TGSI_EXEC_DATA_FLOAT);
1950      micro_mad(&arg[2], arg);
1951   }
1952
1953   for (chan = 0; chan < NUM_CHANNELS; chan++) {
1954      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
1955         store_dest(mach, &arg[2], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
1956      }
1957   }
1958}
1959
1960static void
1961exec_dp2a(struct tgsi_exec_machine *mach,
1962          const struct tgsi_full_instruction *inst)
1963{
1964   unsigned int chan;
1965   union tgsi_exec_channel arg[3];
1966
1967   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
1968   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_X, TGSI_EXEC_DATA_FLOAT);
1969   micro_mul(&arg[2], &arg[0], &arg[1]);
1970
1971   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
1972   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
1973   micro_mad(&arg[0], arg);
1974
1975   fetch_source(mach, &arg[1], &inst->Src[2], CHAN_X, TGSI_EXEC_DATA_FLOAT);
1976   micro_add(&arg[0], &arg[0], &arg[1]);
1977
1978   for (chan = 0; chan < NUM_CHANNELS; chan++) {
1979      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
1980         store_dest(mach, &arg[0], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
1981      }
1982   }
1983}
1984
1985static void
1986exec_dph(struct tgsi_exec_machine *mach,
1987         const struct tgsi_full_instruction *inst)
1988{
1989   unsigned int chan;
1990   union tgsi_exec_channel arg[3];
1991
1992   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
1993   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_X, TGSI_EXEC_DATA_FLOAT);
1994   micro_mul(&arg[2], &arg[0], &arg[1]);
1995
1996   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
1997   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
1998   micro_mad(&arg[2], arg);
1999
2000   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2001   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2002   micro_mad(&arg[0], arg);
2003
2004   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_W, TGSI_EXEC_DATA_FLOAT);
2005   micro_add(&arg[0], &arg[0], &arg[1]);
2006
2007   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2008      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2009         store_dest(mach, &arg[0], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2010      }
2011   }
2012}
2013
2014static void
2015exec_dp2(struct tgsi_exec_machine *mach,
2016         const struct tgsi_full_instruction *inst)
2017{
2018   unsigned int chan;
2019   union tgsi_exec_channel arg[3];
2020
2021   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2022   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2023   micro_mul(&arg[2], &arg[0], &arg[1]);
2024
2025   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2026   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2027   micro_mad(&arg[2], arg);
2028
2029   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2030      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2031         store_dest(mach, &arg[2], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2032      }
2033   }
2034}
2035
2036static void
2037exec_nrm4(struct tgsi_exec_machine *mach,
2038          const struct tgsi_full_instruction *inst)
2039{
2040   unsigned int chan;
2041   union tgsi_exec_channel arg[4];
2042   union tgsi_exec_channel scale;
2043
2044   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2045   micro_mul(&scale, &arg[0], &arg[0]);
2046
2047   for (chan = CHAN_Y; chan <= CHAN_W; chan++) {
2048      union tgsi_exec_channel product;
2049
2050      fetch_source(mach, &arg[chan], &inst->Src[0], chan, TGSI_EXEC_DATA_FLOAT);
2051      micro_mul(&product, &arg[chan], &arg[chan]);
2052      micro_add(&scale, &scale, &product);
2053   }
2054
2055   micro_rsq(&scale, &scale);
2056
2057   for (chan = CHAN_X; chan <= CHAN_W; chan++) {
2058      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2059         micro_mul(&arg[chan], &arg[chan], &scale);
2060         store_dest(mach, &arg[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2061      }
2062   }
2063}
2064
2065static void
2066exec_nrm3(struct tgsi_exec_machine *mach,
2067          const struct tgsi_full_instruction *inst)
2068{
2069   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_XYZ) {
2070      unsigned int chan;
2071      union tgsi_exec_channel arg[3];
2072      union tgsi_exec_channel scale;
2073
2074      fetch_source(mach, &arg[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2075      micro_mul(&scale, &arg[0], &arg[0]);
2076
2077      for (chan = CHAN_Y; chan <= CHAN_Z; chan++) {
2078         union tgsi_exec_channel product;
2079
2080         fetch_source(mach, &arg[chan], &inst->Src[0], chan, TGSI_EXEC_DATA_FLOAT);
2081         micro_mul(&product, &arg[chan], &arg[chan]);
2082         micro_add(&scale, &scale, &product);
2083      }
2084
2085      micro_rsq(&scale, &scale);
2086
2087      for (chan = CHAN_X; chan <= CHAN_Z; chan++) {
2088         if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2089            micro_mul(&arg[chan], &arg[chan], &scale);
2090            store_dest(mach, &arg[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2091         }
2092      }
2093   }
2094
2095   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
2096      store_dest(mach, &OneVec, &inst->Dst[0], inst, CHAN_W, TGSI_EXEC_DATA_FLOAT);
2097   }
2098}
2099
2100static void
2101exec_break(struct tgsi_exec_machine *mach)
2102{
2103   if (mach->BreakType == TGSI_EXEC_BREAK_INSIDE_LOOP) {
2104      /* turn off loop channels for each enabled exec channel */
2105      mach->LoopMask &= ~mach->ExecMask;
2106      /* Todo: if mach->LoopMask == 0, jump to end of loop */
2107      UPDATE_EXEC_MASK(mach);
2108   } else {
2109      assert(mach->BreakType == TGSI_EXEC_BREAK_INSIDE_SWITCH);
2110
2111      mach->Switch.mask = 0x0;
2112
2113      UPDATE_EXEC_MASK(mach);
2114   }
2115}
2116
2117static void
2118exec_switch(struct tgsi_exec_machine *mach,
2119            const struct tgsi_full_instruction *inst)
2120{
2121   assert(mach->SwitchStackTop < TGSI_EXEC_MAX_SWITCH_NESTING);
2122   assert(mach->BreakStackTop < TGSI_EXEC_MAX_BREAK_STACK);
2123
2124   mach->SwitchStack[mach->SwitchStackTop++] = mach->Switch;
2125   fetch_source(mach, &mach->Switch.selector, &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_UINT);
2126   mach->Switch.mask = 0x0;
2127   mach->Switch.defaultMask = 0x0;
2128
2129   mach->BreakStack[mach->BreakStackTop++] = mach->BreakType;
2130   mach->BreakType = TGSI_EXEC_BREAK_INSIDE_SWITCH;
2131
2132   UPDATE_EXEC_MASK(mach);
2133}
2134
2135static void
2136exec_case(struct tgsi_exec_machine *mach,
2137          const struct tgsi_full_instruction *inst)
2138{
2139   uint prevMask = mach->SwitchStack[mach->SwitchStackTop - 1].mask;
2140   union tgsi_exec_channel src;
2141   uint mask = 0;
2142
2143   fetch_source(mach, &src, &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_UINT);
2144
2145   if (mach->Switch.selector.u[0] == src.u[0]) {
2146      mask |= 0x1;
2147   }
2148   if (mach->Switch.selector.u[1] == src.u[1]) {
2149      mask |= 0x2;
2150   }
2151   if (mach->Switch.selector.u[2] == src.u[2]) {
2152      mask |= 0x4;
2153   }
2154   if (mach->Switch.selector.u[3] == src.u[3]) {
2155      mask |= 0x8;
2156   }
2157
2158   mach->Switch.defaultMask |= mask;
2159
2160   mach->Switch.mask |= mask & prevMask;
2161
2162   UPDATE_EXEC_MASK(mach);
2163}
2164
2165static void
2166exec_default(struct tgsi_exec_machine *mach)
2167{
2168   uint prevMask = mach->SwitchStack[mach->SwitchStackTop - 1].mask;
2169
2170   mach->Switch.mask |= ~mach->Switch.defaultMask & prevMask;
2171
2172   UPDATE_EXEC_MASK(mach);
2173}
2174
2175static void
2176exec_endswitch(struct tgsi_exec_machine *mach)
2177{
2178   mach->Switch = mach->SwitchStack[--mach->SwitchStackTop];
2179   mach->BreakType = mach->BreakStack[--mach->BreakStackTop];
2180
2181   UPDATE_EXEC_MASK(mach);
2182}
2183
2184static void
2185micro_i2f(union tgsi_exec_channel *dst,
2186          const union tgsi_exec_channel *src)
2187{
2188   dst->f[0] = (float)src->i[0];
2189   dst->f[1] = (float)src->i[1];
2190   dst->f[2] = (float)src->i[2];
2191   dst->f[3] = (float)src->i[3];
2192}
2193
2194static void
2195micro_not(union tgsi_exec_channel *dst,
2196          const union tgsi_exec_channel *src)
2197{
2198   dst->u[0] = ~src->u[0];
2199   dst->u[1] = ~src->u[1];
2200   dst->u[2] = ~src->u[2];
2201   dst->u[3] = ~src->u[3];
2202}
2203
2204static void
2205micro_shl(union tgsi_exec_channel *dst,
2206          const union tgsi_exec_channel *src)
2207{
2208   dst->u[0] = src[0].u[0] << src[1].u[0];
2209   dst->u[1] = src[0].u[1] << src[1].u[1];
2210   dst->u[2] = src[0].u[2] << src[1].u[2];
2211   dst->u[3] = src[0].u[3] << src[1].u[3];
2212}
2213
2214static void
2215micro_and(union tgsi_exec_channel *dst,
2216          const union tgsi_exec_channel *src)
2217{
2218   dst->u[0] = src[0].u[0] & src[1].u[0];
2219   dst->u[1] = src[0].u[1] & src[1].u[1];
2220   dst->u[2] = src[0].u[2] & src[1].u[2];
2221   dst->u[3] = src[0].u[3] & src[1].u[3];
2222}
2223
2224static void
2225micro_or(union tgsi_exec_channel *dst,
2226         const union tgsi_exec_channel *src)
2227{
2228   dst->u[0] = src[0].u[0] | src[1].u[0];
2229   dst->u[1] = src[0].u[1] | src[1].u[1];
2230   dst->u[2] = src[0].u[2] | src[1].u[2];
2231   dst->u[3] = src[0].u[3] | src[1].u[3];
2232}
2233
2234static void
2235micro_xor(union tgsi_exec_channel *dst,
2236          const union tgsi_exec_channel *src)
2237{
2238   dst->u[0] = src[0].u[0] ^ src[1].u[0];
2239   dst->u[1] = src[0].u[1] ^ src[1].u[1];
2240   dst->u[2] = src[0].u[2] ^ src[1].u[2];
2241   dst->u[3] = src[0].u[3] ^ src[1].u[3];
2242}
2243
2244static void
2245micro_f2i(union tgsi_exec_channel *dst,
2246          const union tgsi_exec_channel *src)
2247{
2248   dst->i[0] = (int)src->f[0];
2249   dst->i[1] = (int)src->f[1];
2250   dst->i[2] = (int)src->f[2];
2251   dst->i[3] = (int)src->f[3];
2252}
2253
2254static void
2255micro_idiv(union tgsi_exec_channel *dst,
2256           const union tgsi_exec_channel *src)
2257{
2258   dst->i[0] = src[0].i[0] / src[1].i[0];
2259   dst->i[1] = src[0].i[1] / src[1].i[1];
2260   dst->i[2] = src[0].i[2] / src[1].i[2];
2261   dst->i[3] = src[0].i[3] / src[1].i[3];
2262}
2263
2264static void
2265micro_imax(union tgsi_exec_channel *dst,
2266           const union tgsi_exec_channel *src)
2267{
2268   dst->i[0] = src[0].i[0] > src[1].i[0] ? src[0].i[0] : src[1].i[0];
2269   dst->i[1] = src[0].i[1] > src[1].i[1] ? src[0].i[1] : src[1].i[1];
2270   dst->i[2] = src[0].i[2] > src[1].i[2] ? src[0].i[2] : src[1].i[2];
2271   dst->i[3] = src[0].i[3] > src[1].i[3] ? src[0].i[3] : src[1].i[3];
2272}
2273
2274static void
2275micro_imin(union tgsi_exec_channel *dst,
2276           const union tgsi_exec_channel *src)
2277{
2278   dst->i[0] = src[0].i[0] < src[1].i[0] ? src[0].i[0] : src[1].i[0];
2279   dst->i[1] = src[0].i[1] < src[1].i[1] ? src[0].i[1] : src[1].i[1];
2280   dst->i[2] = src[0].i[2] < src[1].i[2] ? src[0].i[2] : src[1].i[2];
2281   dst->i[3] = src[0].i[3] < src[1].i[3] ? src[0].i[3] : src[1].i[3];
2282}
2283
2284static void
2285micro_isge(union tgsi_exec_channel *dst,
2286           const union tgsi_exec_channel *src)
2287{
2288   dst->i[0] = src[0].i[0] >= src[1].i[0] ? -1 : 0;
2289   dst->i[1] = src[0].i[1] >= src[1].i[1] ? -1 : 0;
2290   dst->i[2] = src[0].i[2] >= src[1].i[2] ? -1 : 0;
2291   dst->i[3] = src[0].i[3] >= src[1].i[3] ? -1 : 0;
2292}
2293
2294static void
2295micro_ishr(union tgsi_exec_channel *dst,
2296           const union tgsi_exec_channel *src)
2297{
2298   dst->i[0] = src[0].i[0] >> src[1].i[0];
2299   dst->i[1] = src[0].i[1] >> src[1].i[1];
2300   dst->i[2] = src[0].i[2] >> src[1].i[2];
2301   dst->i[3] = src[0].i[3] >> src[1].i[3];
2302}
2303
2304static void
2305micro_islt(union tgsi_exec_channel *dst,
2306           const union tgsi_exec_channel *src)
2307{
2308   dst->i[0] = src[0].i[0] < src[1].i[0] ? -1 : 0;
2309   dst->i[1] = src[0].i[1] < src[1].i[1] ? -1 : 0;
2310   dst->i[2] = src[0].i[2] < src[1].i[2] ? -1 : 0;
2311   dst->i[3] = src[0].i[3] < src[1].i[3] ? -1 : 0;
2312}
2313
2314static void
2315micro_f2u(union tgsi_exec_channel *dst,
2316          const union tgsi_exec_channel *src)
2317{
2318   dst->u[0] = (uint)src->f[0];
2319   dst->u[1] = (uint)src->f[1];
2320   dst->u[2] = (uint)src->f[2];
2321   dst->u[3] = (uint)src->f[3];
2322}
2323
2324static void
2325micro_u2f(union tgsi_exec_channel *dst,
2326          const union tgsi_exec_channel *src)
2327{
2328   dst->f[0] = (float)src->u[0];
2329   dst->f[1] = (float)src->u[1];
2330   dst->f[2] = (float)src->u[2];
2331   dst->f[3] = (float)src->u[3];
2332}
2333
2334static void
2335micro_uadd(union tgsi_exec_channel *dst,
2336           const union tgsi_exec_channel *src)
2337{
2338   dst->u[0] = src[0].u[0] + src[1].u[0];
2339   dst->u[1] = src[0].u[1] + src[1].u[1];
2340   dst->u[2] = src[0].u[2] + src[1].u[2];
2341   dst->u[3] = src[0].u[3] + src[1].u[3];
2342}
2343
2344static void
2345micro_udiv(union tgsi_exec_channel *dst,
2346           const union tgsi_exec_channel *src)
2347{
2348   dst->u[0] = src[0].u[0] / src[1].u[0];
2349   dst->u[1] = src[0].u[1] / src[1].u[1];
2350   dst->u[2] = src[0].u[2] / src[1].u[2];
2351   dst->u[3] = src[0].u[3] / src[1].u[3];
2352}
2353
2354static void
2355micro_umad(union tgsi_exec_channel *dst,
2356           const union tgsi_exec_channel *src)
2357{
2358   dst->u[0] = src[0].u[0] * src[1].u[0] + src[2].u[0];
2359   dst->u[1] = src[0].u[1] * src[1].u[1] + src[2].u[1];
2360   dst->u[2] = src[0].u[2] * src[1].u[2] + src[2].u[2];
2361   dst->u[3] = src[0].u[3] * src[1].u[3] + src[2].u[3];
2362}
2363
2364static void
2365micro_umax(union tgsi_exec_channel *dst,
2366           const union tgsi_exec_channel *src)
2367{
2368   dst->u[0] = src[0].u[0] > src[1].u[0] ? src[0].u[0] : src[1].u[0];
2369   dst->u[1] = src[0].u[1] > src[1].u[1] ? src[0].u[1] : src[1].u[1];
2370   dst->u[2] = src[0].u[2] > src[1].u[2] ? src[0].u[2] : src[1].u[2];
2371   dst->u[3] = src[0].u[3] > src[1].u[3] ? src[0].u[3] : src[1].u[3];
2372}
2373
2374static void
2375micro_umin(union tgsi_exec_channel *dst,
2376           const union tgsi_exec_channel *src)
2377{
2378   dst->u[0] = src[0].u[0] < src[1].u[0] ? src[0].u[0] : src[1].u[0];
2379   dst->u[1] = src[0].u[1] < src[1].u[1] ? src[0].u[1] : src[1].u[1];
2380   dst->u[2] = src[0].u[2] < src[1].u[2] ? src[0].u[2] : src[1].u[2];
2381   dst->u[3] = src[0].u[3] < src[1].u[3] ? src[0].u[3] : src[1].u[3];
2382}
2383
2384static void
2385micro_umod(union tgsi_exec_channel *dst,
2386           const union tgsi_exec_channel *src)
2387{
2388   dst->u[0] = src[0].u[0] % src[1].u[0];
2389   dst->u[1] = src[0].u[1] % src[1].u[1];
2390   dst->u[2] = src[0].u[2] % src[1].u[2];
2391   dst->u[3] = src[0].u[3] % src[1].u[3];
2392}
2393
2394static void
2395micro_umul(union tgsi_exec_channel *dst,
2396           const union tgsi_exec_channel *src)
2397{
2398   dst->u[0] = src[0].u[0] * src[1].u[0];
2399   dst->u[1] = src[0].u[1] * src[1].u[1];
2400   dst->u[2] = src[0].u[2] * src[1].u[2];
2401   dst->u[3] = src[0].u[3] * src[1].u[3];
2402}
2403
2404static void
2405micro_useq(union tgsi_exec_channel *dst,
2406           const union tgsi_exec_channel *src)
2407{
2408   dst->u[0] = src[0].u[0] == src[1].u[0] ? ~0 : 0;
2409   dst->u[1] = src[0].u[1] == src[1].u[1] ? ~0 : 0;
2410   dst->u[2] = src[0].u[2] == src[1].u[2] ? ~0 : 0;
2411   dst->u[3] = src[0].u[3] == src[1].u[3] ? ~0 : 0;
2412}
2413
2414static void
2415micro_usge(union tgsi_exec_channel *dst,
2416           const union tgsi_exec_channel *src)
2417{
2418   dst->u[0] = src[0].u[0] >= src[1].u[0] ? ~0 : 0;
2419   dst->u[1] = src[0].u[1] >= src[1].u[1] ? ~0 : 0;
2420   dst->u[2] = src[0].u[2] >= src[1].u[2] ? ~0 : 0;
2421   dst->u[3] = src[0].u[3] >= src[1].u[3] ? ~0 : 0;
2422}
2423
2424static void
2425micro_ushr(union tgsi_exec_channel *dst,
2426           const union tgsi_exec_channel *src)
2427{
2428   dst->u[0] = src[0].u[0] >> src[1].u[0];
2429   dst->u[1] = src[0].u[1] >> src[1].u[1];
2430   dst->u[2] = src[0].u[2] >> src[1].u[2];
2431   dst->u[3] = src[0].u[3] >> src[1].u[3];
2432}
2433
2434static void
2435micro_uslt(union tgsi_exec_channel *dst,
2436           const union tgsi_exec_channel *src)
2437{
2438   dst->u[0] = src[0].u[0] < src[1].u[0] ? ~0 : 0;
2439   dst->u[1] = src[0].u[1] < src[1].u[1] ? ~0 : 0;
2440   dst->u[2] = src[0].u[2] < src[1].u[2] ? ~0 : 0;
2441   dst->u[3] = src[0].u[3] < src[1].u[3] ? ~0 : 0;
2442}
2443
2444static void
2445micro_usne(union tgsi_exec_channel *dst,
2446           const union tgsi_exec_channel *src)
2447{
2448   dst->u[0] = src[0].u[0] != src[1].u[0] ? ~0 : 0;
2449   dst->u[1] = src[0].u[1] != src[1].u[1] ? ~0 : 0;
2450   dst->u[2] = src[0].u[2] != src[1].u[2] ? ~0 : 0;
2451   dst->u[3] = src[0].u[3] != src[1].u[3] ? ~0 : 0;
2452}
2453
2454static void
2455exec_instruction(
2456   struct tgsi_exec_machine *mach,
2457   const struct tgsi_full_instruction *inst,
2458   int *pc )
2459{
2460   uint chan_index;
2461   union tgsi_exec_channel r[10];
2462   union tgsi_exec_channel d[8];
2463
2464   (*pc)++;
2465
2466   switch (inst->Instruction.Opcode) {
2467   case TGSI_OPCODE_ARL:
2468      exec_vector_unary(mach, inst, micro_arl, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_FLOAT);
2469      break;
2470
2471   case TGSI_OPCODE_MOV:
2472      exec_vector_unary(mach, inst, micro_mov, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_FLOAT);
2473      break;
2474
2475   case TGSI_OPCODE_LIT:
2476      if (IS_CHANNEL_ENABLED( *inst, CHAN_Y ) || IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
2477         FETCH( &r[0], 0, CHAN_X );
2478         if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
2479            micro_max(&d[CHAN_Y], &r[0], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C]);
2480         }
2481
2482         if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
2483            FETCH( &r[1], 0, CHAN_Y );
2484            micro_max( &r[1], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] );
2485
2486            FETCH( &r[2], 0, CHAN_W );
2487            micro_min( &r[2], &r[2], &mach->Temps[TEMP_128_I].xyzw[TEMP_128_C] );
2488            micro_max( &r[2], &r[2], &mach->Temps[TEMP_M128_I].xyzw[TEMP_M128_C] );
2489            micro_pow( &r[1], &r[1], &r[2] );
2490            micro_lt(&d[CHAN_Z], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &r[0], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C]);
2491         }
2492
2493         if (IS_CHANNEL_ENABLED(*inst, CHAN_Y)) {
2494            STORE(&d[CHAN_Y], 0, CHAN_Y);
2495         }
2496         if (IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2497            STORE(&d[CHAN_Z], 0, CHAN_Z);
2498         }
2499      }
2500      if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) {
2501         STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_X );
2502      }
2503      if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
2504         STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
2505      }
2506      break;
2507
2508   case TGSI_OPCODE_RCP:
2509      exec_scalar_unary(mach, inst, micro_rcp, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2510      break;
2511
2512   case TGSI_OPCODE_RSQ:
2513      exec_scalar_unary(mach, inst, micro_rsq, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2514      break;
2515
2516   case TGSI_OPCODE_EXP:
2517      FETCH( &r[0], 0, CHAN_X );
2518      micro_flr( &r[1], &r[0] );  /* r1 = floor(r0) */
2519      if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) {
2520         micro_exp2( &r[2], &r[1] );       /* r2 = 2 ^ r1 */
2521         STORE( &r[2], 0, CHAN_X );        /* store r2 */
2522      }
2523      if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
2524         micro_sub( &r[2], &r[0], &r[1] ); /* r2 = r0 - r1 */
2525         STORE( &r[2], 0, CHAN_Y );        /* store r2 */
2526      }
2527      if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
2528         micro_exp2( &r[2], &r[0] );       /* r2 = 2 ^ r0 */
2529         STORE( &r[2], 0, CHAN_Z );        /* store r2 */
2530      }
2531      if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
2532         STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
2533      }
2534      break;
2535
2536   case TGSI_OPCODE_LOG:
2537      FETCH( &r[0], 0, CHAN_X );
2538      micro_abs( &r[2], &r[0] );  /* r2 = abs(r0) */
2539      micro_lg2( &r[1], &r[2] );  /* r1 = lg2(r2) */
2540      micro_flr( &r[0], &r[1] );  /* r0 = floor(r1) */
2541      if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) {
2542         STORE( &r[0], 0, CHAN_X );
2543      }
2544      if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
2545         micro_exp2( &r[0], &r[0] );       /* r0 = 2 ^ r0 */
2546         micro_div( &r[0], &r[2], &r[0] ); /* r0 = r2 / r0 */
2547         STORE( &r[0], 0, CHAN_Y );
2548      }
2549      if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
2550         STORE( &r[1], 0, CHAN_Z );
2551      }
2552      if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
2553         STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
2554      }
2555      break;
2556
2557   case TGSI_OPCODE_MUL:
2558      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2559         FETCH(&r[0], 0, chan_index);
2560         FETCH(&r[1], 1, chan_index);
2561         micro_mul(&d[chan_index], &r[0], &r[1]);
2562      }
2563      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2564         STORE(&d[chan_index], 0, chan_index);
2565      }
2566      break;
2567
2568   case TGSI_OPCODE_ADD:
2569      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2570         FETCH( &r[0], 0, chan_index );
2571         FETCH( &r[1], 1, chan_index );
2572         micro_add(&d[chan_index], &r[0], &r[1]);
2573      }
2574      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2575         STORE(&d[chan_index], 0, chan_index);
2576      }
2577      break;
2578
2579   case TGSI_OPCODE_DP3:
2580      exec_dp3(mach, inst);
2581      break;
2582
2583   case TGSI_OPCODE_DP4:
2584      exec_dp4(mach, inst);
2585      break;
2586
2587   case TGSI_OPCODE_DST:
2588      if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
2589         FETCH( &r[0], 0, CHAN_Y );
2590         FETCH( &r[1], 1, CHAN_Y);
2591         micro_mul(&d[CHAN_Y], &r[0], &r[1]);
2592      }
2593      if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
2594         FETCH(&d[CHAN_Z], 0, CHAN_Z);
2595      }
2596      if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
2597         FETCH(&d[CHAN_W], 1, CHAN_W);
2598      }
2599
2600      if (IS_CHANNEL_ENABLED(*inst, CHAN_X)) {
2601         STORE(&mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_X);
2602      }
2603      if (IS_CHANNEL_ENABLED(*inst, CHAN_Y)) {
2604         STORE(&d[CHAN_Y], 0, CHAN_Y);
2605      }
2606      if (IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2607         STORE(&d[CHAN_Z], 0, CHAN_Z);
2608      }
2609      if (IS_CHANNEL_ENABLED(*inst, CHAN_W)) {
2610         STORE(&d[CHAN_W], 0, CHAN_W);
2611      }
2612      break;
2613
2614   case TGSI_OPCODE_MIN:
2615      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2616         FETCH(&r[0], 0, chan_index);
2617         FETCH(&r[1], 1, chan_index);
2618
2619         /* XXX use micro_min()?? */
2620         micro_lt(&d[chan_index], &r[0], &r[1], &r[0], &r[1]);
2621      }
2622      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2623         STORE(&d[chan_index], 0, chan_index);
2624      }
2625      break;
2626
2627   case TGSI_OPCODE_MAX:
2628      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2629         FETCH(&r[0], 0, chan_index);
2630         FETCH(&r[1], 1, chan_index);
2631
2632         /* XXX use micro_max()?? */
2633         micro_lt(&d[chan_index], &r[0], &r[1], &r[1], &r[0] );
2634      }
2635      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2636         STORE(&d[chan_index], 0, chan_index);
2637      }
2638      break;
2639
2640   case TGSI_OPCODE_SLT:
2641      exec_vector_binary(mach, inst, micro_slt, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2642      break;
2643
2644   case TGSI_OPCODE_SGE:
2645      exec_vector_binary(mach, inst, micro_sge, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2646      break;
2647
2648   case TGSI_OPCODE_MAD:
2649      exec_vector_trinary(mach, inst, micro_mad, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2650      break;
2651
2652   case TGSI_OPCODE_SUB:
2653      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2654         FETCH(&r[0], 0, chan_index);
2655         FETCH(&r[1], 1, chan_index);
2656         micro_sub(&d[chan_index], &r[0], &r[1]);
2657      }
2658      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2659         STORE(&d[chan_index], 0, chan_index);
2660      }
2661      break;
2662
2663   case TGSI_OPCODE_LRP:
2664      exec_vector_trinary(mach, inst, micro_lrp, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2665      break;
2666
2667   case TGSI_OPCODE_CND:
2668      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2669         FETCH(&r[0], 0, chan_index);
2670         FETCH(&r[1], 1, chan_index);
2671         FETCH(&r[2], 2, chan_index);
2672         micro_lt(&d[chan_index], &mach->Temps[TEMP_HALF_I].xyzw[TEMP_HALF_C], &r[2], &r[0], &r[1]);
2673      }
2674      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2675         STORE(&d[chan_index], 0, chan_index);
2676      }
2677      break;
2678
2679   case TGSI_OPCODE_DP2A:
2680      exec_dp2a(mach, inst);
2681      break;
2682
2683   case TGSI_OPCODE_FRC:
2684      exec_vector_unary(mach, inst, micro_frc, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2685      break;
2686
2687   case TGSI_OPCODE_CLAMP:
2688      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2689         FETCH(&r[0], 0, chan_index);
2690         FETCH(&r[1], 1, chan_index);
2691         micro_max(&r[0], &r[0], &r[1]);
2692         FETCH(&r[1], 2, chan_index);
2693         micro_min(&d[chan_index], &r[0], &r[1]);
2694      }
2695      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2696         STORE(&d[chan_index], 0, chan_index);
2697      }
2698      break;
2699
2700   case TGSI_OPCODE_FLR:
2701      exec_vector_unary(mach, inst, micro_flr, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2702      break;
2703
2704   case TGSI_OPCODE_ROUND:
2705      exec_vector_unary(mach, inst, micro_rnd, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2706      break;
2707
2708   case TGSI_OPCODE_EX2:
2709      exec_scalar_unary(mach, inst, micro_exp2, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2710      break;
2711
2712   case TGSI_OPCODE_LG2:
2713      exec_scalar_unary(mach, inst, micro_lg2, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2714      break;
2715
2716   case TGSI_OPCODE_POW:
2717      FETCH(&r[0], 0, CHAN_X);
2718      FETCH(&r[1], 1, CHAN_X);
2719
2720      micro_pow( &r[0], &r[0], &r[1] );
2721
2722      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2723         STORE( &r[0], 0, chan_index );
2724      }
2725      break;
2726
2727   case TGSI_OPCODE_XPD:
2728      FETCH(&r[0], 0, CHAN_Y);
2729      FETCH(&r[1], 1, CHAN_Z);
2730
2731      micro_mul( &r[2], &r[0], &r[1] );
2732
2733      FETCH(&r[3], 0, CHAN_Z);
2734      FETCH(&r[4], 1, CHAN_Y);
2735
2736      micro_mul( &r[5], &r[3], &r[4] );
2737      micro_sub(&d[CHAN_X], &r[2], &r[5]);
2738
2739      FETCH(&r[2], 1, CHAN_X);
2740
2741      micro_mul( &r[3], &r[3], &r[2] );
2742
2743      FETCH(&r[5], 0, CHAN_X);
2744
2745      micro_mul( &r[1], &r[1], &r[5] );
2746      micro_sub(&d[CHAN_Y], &r[3], &r[1]);
2747
2748      micro_mul( &r[5], &r[5], &r[4] );
2749      micro_mul( &r[0], &r[0], &r[2] );
2750      micro_sub(&d[CHAN_Z], &r[5], &r[0]);
2751
2752      if (IS_CHANNEL_ENABLED(*inst, CHAN_X)) {
2753         STORE(&d[CHAN_X], 0, CHAN_X);
2754      }
2755      if (IS_CHANNEL_ENABLED(*inst, CHAN_Y)) {
2756         STORE(&d[CHAN_Y], 0, CHAN_Y);
2757      }
2758      if (IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2759         STORE(&d[CHAN_Z], 0, CHAN_Z);
2760      }
2761      if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
2762         STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
2763      }
2764      break;
2765
2766   case TGSI_OPCODE_ABS:
2767      exec_vector_unary(mach, inst, micro_abs, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2768      break;
2769
2770   case TGSI_OPCODE_RCC:
2771      FETCH(&r[0], 0, CHAN_X);
2772      micro_div(&r[0], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &r[0]);
2773      micro_float_clamp(&r[0], &r[0]);
2774      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2775         STORE(&r[0], 0, chan_index);
2776      }
2777      break;
2778
2779   case TGSI_OPCODE_DPH:
2780      exec_dph(mach, inst);
2781      break;
2782
2783   case TGSI_OPCODE_COS:
2784      exec_scalar_unary(mach, inst, micro_cos, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2785      break;
2786
2787   case TGSI_OPCODE_DDX:
2788      exec_vector_unary(mach, inst, micro_ddx, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2789      break;
2790
2791   case TGSI_OPCODE_DDY:
2792      exec_vector_unary(mach, inst, micro_ddy, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2793      break;
2794
2795   case TGSI_OPCODE_KILP:
2796      exec_kilp (mach, inst);
2797      break;
2798
2799   case TGSI_OPCODE_KIL:
2800      exec_kil (mach, inst);
2801      break;
2802
2803   case TGSI_OPCODE_PK2H:
2804      assert (0);
2805      break;
2806
2807   case TGSI_OPCODE_PK2US:
2808      assert (0);
2809      break;
2810
2811   case TGSI_OPCODE_PK4B:
2812      assert (0);
2813      break;
2814
2815   case TGSI_OPCODE_PK4UB:
2816      assert (0);
2817      break;
2818
2819   case TGSI_OPCODE_RFL:
2820      if (IS_CHANNEL_ENABLED(*inst, CHAN_X) ||
2821          IS_CHANNEL_ENABLED(*inst, CHAN_Y) ||
2822          IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2823         /* r0 = dp3(src0, src0) */
2824         FETCH(&r[2], 0, CHAN_X);
2825         micro_mul(&r[0], &r[2], &r[2]);
2826         FETCH(&r[4], 0, CHAN_Y);
2827         micro_mul(&r[8], &r[4], &r[4]);
2828         micro_add(&r[0], &r[0], &r[8]);
2829         FETCH(&r[6], 0, CHAN_Z);
2830         micro_mul(&r[8], &r[6], &r[6]);
2831         micro_add(&r[0], &r[0], &r[8]);
2832
2833         /* r1 = dp3(src0, src1) */
2834         FETCH(&r[3], 1, CHAN_X);
2835         micro_mul(&r[1], &r[2], &r[3]);
2836         FETCH(&r[5], 1, CHAN_Y);
2837         micro_mul(&r[8], &r[4], &r[5]);
2838         micro_add(&r[1], &r[1], &r[8]);
2839         FETCH(&r[7], 1, CHAN_Z);
2840         micro_mul(&r[8], &r[6], &r[7]);
2841         micro_add(&r[1], &r[1], &r[8]);
2842
2843         /* r1 = 2 * r1 / r0 */
2844         micro_add(&r[1], &r[1], &r[1]);
2845         micro_div(&r[1], &r[1], &r[0]);
2846
2847         if (IS_CHANNEL_ENABLED(*inst, CHAN_X)) {
2848            micro_mul(&r[2], &r[2], &r[1]);
2849            micro_sub(&r[2], &r[2], &r[3]);
2850            STORE(&r[2], 0, CHAN_X);
2851         }
2852         if (IS_CHANNEL_ENABLED(*inst, CHAN_Y)) {
2853            micro_mul(&r[4], &r[4], &r[1]);
2854            micro_sub(&r[4], &r[4], &r[5]);
2855            STORE(&r[4], 0, CHAN_Y);
2856         }
2857         if (IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2858            micro_mul(&r[6], &r[6], &r[1]);
2859            micro_sub(&r[6], &r[6], &r[7]);
2860            STORE(&r[6], 0, CHAN_Z);
2861         }
2862      }
2863      if (IS_CHANNEL_ENABLED(*inst, CHAN_W)) {
2864         STORE(&mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W);
2865      }
2866      break;
2867
2868   case TGSI_OPCODE_SEQ:
2869      exec_vector_binary(mach, inst, micro_seq, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2870      break;
2871
2872   case TGSI_OPCODE_SFL:
2873      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2874         STORE(&mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], 0, chan_index);
2875      }
2876      break;
2877
2878   case TGSI_OPCODE_SGT:
2879      exec_vector_binary(mach, inst, micro_sgt, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2880      break;
2881
2882   case TGSI_OPCODE_SIN:
2883      exec_scalar_unary(mach, inst, micro_sin, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2884      break;
2885
2886   case TGSI_OPCODE_SLE:
2887      exec_vector_binary(mach, inst, micro_sle, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2888      break;
2889
2890   case TGSI_OPCODE_SNE:
2891      exec_vector_binary(mach, inst, micro_sne, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2892      break;
2893
2894   case TGSI_OPCODE_STR:
2895      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2896         STORE(&mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, chan_index);
2897      }
2898      break;
2899
2900   case TGSI_OPCODE_TEX:
2901      /* simple texture lookup */
2902      /* src[0] = texcoord */
2903      /* src[1] = sampler unit */
2904      exec_tex(mach, inst, TEX_MODIFIER_NONE);
2905      break;
2906
2907   case TGSI_OPCODE_TXB:
2908      /* Texture lookup with lod bias */
2909      /* src[0] = texcoord (src[0].w = LOD bias) */
2910      /* src[1] = sampler unit */
2911      exec_tex(mach, inst, TEX_MODIFIER_LOD_BIAS);
2912      break;
2913
2914   case TGSI_OPCODE_TXD:
2915      /* Texture lookup with explict partial derivatives */
2916      /* src[0] = texcoord */
2917      /* src[1] = d[strq]/dx */
2918      /* src[2] = d[strq]/dy */
2919      /* src[3] = sampler unit */
2920      exec_txd(mach, inst);
2921      break;
2922
2923   case TGSI_OPCODE_TXL:
2924      /* Texture lookup with explit LOD */
2925      /* src[0] = texcoord (src[0].w = LOD) */
2926      /* src[1] = sampler unit */
2927      exec_tex(mach, inst, TEX_MODIFIER_EXPLICIT_LOD);
2928      break;
2929
2930   case TGSI_OPCODE_TXP:
2931      /* Texture lookup with projection */
2932      /* src[0] = texcoord (src[0].w = projection) */
2933      /* src[1] = sampler unit */
2934      exec_tex(mach, inst, TEX_MODIFIER_PROJECTED);
2935      break;
2936
2937   case TGSI_OPCODE_UP2H:
2938      assert (0);
2939      break;
2940
2941   case TGSI_OPCODE_UP2US:
2942      assert (0);
2943      break;
2944
2945   case TGSI_OPCODE_UP4B:
2946      assert (0);
2947      break;
2948
2949   case TGSI_OPCODE_UP4UB:
2950      assert (0);
2951      break;
2952
2953   case TGSI_OPCODE_X2D:
2954      FETCH(&r[0], 1, CHAN_X);
2955      FETCH(&r[1], 1, CHAN_Y);
2956      if (IS_CHANNEL_ENABLED(*inst, CHAN_X) ||
2957          IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2958         FETCH(&r[2], 2, CHAN_X);
2959         micro_mul(&r[2], &r[2], &r[0]);
2960         FETCH(&r[3], 2, CHAN_Y);
2961         micro_mul(&r[3], &r[3], &r[1]);
2962         micro_add(&r[2], &r[2], &r[3]);
2963         FETCH(&r[3], 0, CHAN_X);
2964         micro_add(&d[CHAN_X], &r[2], &r[3]);
2965
2966      }
2967      if (IS_CHANNEL_ENABLED(*inst, CHAN_Y) ||
2968          IS_CHANNEL_ENABLED(*inst, CHAN_W)) {
2969         FETCH(&r[2], 2, CHAN_Z);
2970         micro_mul(&r[2], &r[2], &r[0]);
2971         FETCH(&r[3], 2, CHAN_W);
2972         micro_mul(&r[3], &r[3], &r[1]);
2973         micro_add(&r[2], &r[2], &r[3]);
2974         FETCH(&r[3], 0, CHAN_Y);
2975         micro_add(&d[CHAN_Y], &r[2], &r[3]);
2976
2977      }
2978      if (IS_CHANNEL_ENABLED(*inst, CHAN_X)) {
2979         STORE(&d[CHAN_X], 0, CHAN_X);
2980      }
2981      if (IS_CHANNEL_ENABLED(*inst, CHAN_Y)) {
2982         STORE(&d[CHAN_Y], 0, CHAN_Y);
2983      }
2984      if (IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2985         STORE(&d[CHAN_X], 0, CHAN_Z);
2986      }
2987      if (IS_CHANNEL_ENABLED(*inst, CHAN_W)) {
2988         STORE(&d[CHAN_Y], 0, CHAN_W);
2989      }
2990      break;
2991
2992   case TGSI_OPCODE_ARA:
2993      assert (0);
2994      break;
2995
2996   case TGSI_OPCODE_ARR:
2997      exec_vector_unary(mach, inst, micro_arr, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_FLOAT);
2998      break;
2999
3000   case TGSI_OPCODE_BRA:
3001      assert (0);
3002      break;
3003
3004   case TGSI_OPCODE_CAL:
3005      /* skip the call if no execution channels are enabled */
3006      if (mach->ExecMask) {
3007         /* do the call */
3008
3009         /* First, record the depths of the execution stacks.
3010          * This is important for deeply nested/looped return statements.
3011          * We have to unwind the stacks by the correct amount.  For a
3012          * real code generator, we could determine the number of entries
3013          * to pop off each stack with simple static analysis and avoid
3014          * implementing this data structure at run time.
3015          */
3016         mach->CallStack[mach->CallStackTop].CondStackTop = mach->CondStackTop;
3017         mach->CallStack[mach->CallStackTop].LoopStackTop = mach->LoopStackTop;
3018         mach->CallStack[mach->CallStackTop].ContStackTop = mach->ContStackTop;
3019         mach->CallStack[mach->CallStackTop].SwitchStackTop = mach->SwitchStackTop;
3020         mach->CallStack[mach->CallStackTop].BreakStackTop = mach->BreakStackTop;
3021         /* note that PC was already incremented above */
3022         mach->CallStack[mach->CallStackTop].ReturnAddr = *pc;
3023
3024         mach->CallStackTop++;
3025
3026         /* Second, push the Cond, Loop, Cont, Func stacks */
3027         assert(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING);
3028         assert(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
3029         assert(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
3030         assert(mach->SwitchStackTop < TGSI_EXEC_MAX_SWITCH_NESTING);
3031         assert(mach->BreakStackTop < TGSI_EXEC_MAX_BREAK_STACK);
3032         assert(mach->FuncStackTop < TGSI_EXEC_MAX_CALL_NESTING);
3033
3034         mach->CondStack[mach->CondStackTop++] = mach->CondMask;
3035         mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask;
3036         mach->ContStack[mach->ContStackTop++] = mach->ContMask;
3037         mach->SwitchStack[mach->SwitchStackTop++] = mach->Switch;
3038         mach->BreakStack[mach->BreakStackTop++] = mach->BreakType;
3039         mach->FuncStack[mach->FuncStackTop++] = mach->FuncMask;
3040
3041         /* Finally, jump to the subroutine */
3042         *pc = inst->Label.Label;
3043      }
3044      break;
3045
3046   case TGSI_OPCODE_RET:
3047      mach->FuncMask &= ~mach->ExecMask;
3048      UPDATE_EXEC_MASK(mach);
3049
3050      if (mach->FuncMask == 0x0) {
3051         /* really return now (otherwise, keep executing */
3052
3053         if (mach->CallStackTop == 0) {
3054            /* returning from main() */
3055            *pc = -1;
3056            return;
3057         }
3058
3059         assert(mach->CallStackTop > 0);
3060         mach->CallStackTop--;
3061
3062         mach->CondStackTop = mach->CallStack[mach->CallStackTop].CondStackTop;
3063         mach->CondMask = mach->CondStack[mach->CondStackTop];
3064
3065         mach->LoopStackTop = mach->CallStack[mach->CallStackTop].LoopStackTop;
3066         mach->LoopMask = mach->LoopStack[mach->LoopStackTop];
3067
3068         mach->ContStackTop = mach->CallStack[mach->CallStackTop].ContStackTop;
3069         mach->ContMask = mach->ContStack[mach->ContStackTop];
3070
3071         mach->SwitchStackTop = mach->CallStack[mach->CallStackTop].SwitchStackTop;
3072         mach->Switch = mach->SwitchStack[mach->SwitchStackTop];
3073
3074         mach->BreakStackTop = mach->CallStack[mach->CallStackTop].BreakStackTop;
3075         mach->BreakType = mach->BreakStack[mach->BreakStackTop];
3076
3077         assert(mach->FuncStackTop > 0);
3078         mach->FuncMask = mach->FuncStack[--mach->FuncStackTop];
3079
3080         *pc = mach->CallStack[mach->CallStackTop].ReturnAddr;
3081
3082         UPDATE_EXEC_MASK(mach);
3083      }
3084      break;
3085
3086   case TGSI_OPCODE_SSG:
3087      exec_vector_unary(mach, inst, micro_sgn, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3088      break;
3089
3090   case TGSI_OPCODE_CMP:
3091      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
3092         FETCH(&r[0], 0, chan_index);
3093         FETCH(&r[1], 1, chan_index);
3094         FETCH(&r[2], 2, chan_index);
3095         micro_lt(&d[chan_index], &r[0], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &r[1], &r[2]);
3096      }
3097      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
3098         STORE(&d[chan_index], 0, chan_index);
3099      }
3100      break;
3101
3102   case TGSI_OPCODE_SCS:
3103      if( IS_CHANNEL_ENABLED( *inst, CHAN_X ) || IS_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
3104         FETCH( &r[0], 0, CHAN_X );
3105         if (IS_CHANNEL_ENABLED(*inst, CHAN_X)) {
3106            micro_cos(&r[1], &r[0]);
3107            STORE(&r[1], 0, CHAN_X);
3108         }
3109         if (IS_CHANNEL_ENABLED(*inst, CHAN_Y)) {
3110            micro_sin(&r[1], &r[0]);
3111            STORE(&r[1], 0, CHAN_Y);
3112         }
3113      }
3114      if( IS_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
3115         STORE( &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], 0, CHAN_Z );
3116      }
3117      if( IS_CHANNEL_ENABLED( *inst, CHAN_W ) ) {
3118         STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
3119      }
3120      break;
3121
3122   case TGSI_OPCODE_NRM:
3123      exec_nrm3(mach, inst);
3124      break;
3125
3126   case TGSI_OPCODE_NRM4:
3127      exec_nrm4(mach, inst);
3128      break;
3129
3130   case TGSI_OPCODE_DIV:
3131      assert( 0 );
3132      break;
3133
3134   case TGSI_OPCODE_DP2:
3135      exec_dp2(mach, inst);
3136      break;
3137
3138   case TGSI_OPCODE_IF:
3139      /* push CondMask */
3140      assert(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING);
3141      mach->CondStack[mach->CondStackTop++] = mach->CondMask;
3142      FETCH( &r[0], 0, CHAN_X );
3143      /* update CondMask */
3144      if( ! r[0].u[0] ) {
3145         mach->CondMask &= ~0x1;
3146      }
3147      if( ! r[0].u[1] ) {
3148         mach->CondMask &= ~0x2;
3149      }
3150      if( ! r[0].u[2] ) {
3151         mach->CondMask &= ~0x4;
3152      }
3153      if( ! r[0].u[3] ) {
3154         mach->CondMask &= ~0x8;
3155      }
3156      UPDATE_EXEC_MASK(mach);
3157      /* Todo: If CondMask==0, jump to ELSE */
3158      break;
3159
3160   case TGSI_OPCODE_ELSE:
3161      /* invert CondMask wrt previous mask */
3162      {
3163         uint prevMask;
3164         assert(mach->CondStackTop > 0);
3165         prevMask = mach->CondStack[mach->CondStackTop - 1];
3166         mach->CondMask = ~mach->CondMask & prevMask;
3167         UPDATE_EXEC_MASK(mach);
3168         /* Todo: If CondMask==0, jump to ENDIF */
3169      }
3170      break;
3171
3172   case TGSI_OPCODE_ENDIF:
3173      /* pop CondMask */
3174      assert(mach->CondStackTop > 0);
3175      mach->CondMask = mach->CondStack[--mach->CondStackTop];
3176      UPDATE_EXEC_MASK(mach);
3177      break;
3178
3179   case TGSI_OPCODE_END:
3180      /* halt execution */
3181      *pc = -1;
3182      break;
3183
3184   case TGSI_OPCODE_REP:
3185      assert (0);
3186      break;
3187
3188   case TGSI_OPCODE_ENDREP:
3189       assert (0);
3190       break;
3191
3192   case TGSI_OPCODE_PUSHA:
3193      assert (0);
3194      break;
3195
3196   case TGSI_OPCODE_POPA:
3197      assert (0);
3198      break;
3199
3200   case TGSI_OPCODE_CEIL:
3201      exec_vector_unary(mach, inst, micro_ceil, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3202      break;
3203
3204   case TGSI_OPCODE_I2F:
3205      exec_vector_unary(mach, inst, micro_i2f, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_INT);
3206      break;
3207
3208   case TGSI_OPCODE_NOT:
3209      exec_vector_unary(mach, inst, micro_not, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3210      break;
3211
3212   case TGSI_OPCODE_TRUNC:
3213      exec_vector_unary(mach, inst, micro_trunc, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3214      break;
3215
3216   case TGSI_OPCODE_SHL:
3217      exec_vector_binary(mach, inst, micro_shl, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3218      break;
3219
3220   case TGSI_OPCODE_AND:
3221      exec_vector_binary(mach, inst, micro_and, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3222      break;
3223
3224   case TGSI_OPCODE_OR:
3225      exec_vector_binary(mach, inst, micro_or, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3226      break;
3227
3228   case TGSI_OPCODE_MOD:
3229      assert (0);
3230      break;
3231
3232   case TGSI_OPCODE_XOR:
3233      exec_vector_binary(mach, inst, micro_xor, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3234      break;
3235
3236   case TGSI_OPCODE_SAD:
3237      assert (0);
3238      break;
3239
3240   case TGSI_OPCODE_TXF:
3241      assert (0);
3242      break;
3243
3244   case TGSI_OPCODE_TXQ:
3245      assert (0);
3246      break;
3247
3248   case TGSI_OPCODE_EMIT:
3249      emit_vertex(mach);
3250      break;
3251
3252   case TGSI_OPCODE_ENDPRIM:
3253      emit_primitive(mach);
3254      break;
3255
3256   case TGSI_OPCODE_BGNFOR:
3257      assert(mach->LoopCounterStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
3258      for (chan_index = 0; chan_index < 3; chan_index++) {
3259         FETCH( &mach->LoopCounterStack[mach->LoopCounterStackTop].xyzw[chan_index], 0, chan_index );
3260      }
3261      ++mach->LoopCounterStackTop;
3262      STORE(&mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_X], 0, CHAN_X);
3263      /* update LoopMask */
3264      if (mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y].f[0] <= 0.0f) {
3265         mach->LoopMask &= ~0x1;
3266      }
3267      if (mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y].f[1] <= 0.0f) {
3268         mach->LoopMask &= ~0x2;
3269      }
3270      if (mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y].f[2] <= 0.0f) {
3271         mach->LoopMask &= ~0x4;
3272      }
3273      if (mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y].f[3] <= 0.0f) {
3274         mach->LoopMask &= ~0x8;
3275      }
3276      /* TODO: if mach->LoopMask == 0, jump to end of loop */
3277      UPDATE_EXEC_MASK(mach);
3278      /* fall-through (for now) */
3279   case TGSI_OPCODE_BGNLOOP:
3280      /* push LoopMask and ContMasks */
3281      assert(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
3282      assert(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
3283      assert(mach->LoopLabelStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
3284      assert(mach->BreakStackTop < TGSI_EXEC_MAX_BREAK_STACK);
3285
3286      mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask;
3287      mach->ContStack[mach->ContStackTop++] = mach->ContMask;
3288      mach->LoopLabelStack[mach->LoopLabelStackTop++] = *pc - 1;
3289      mach->BreakStack[mach->BreakStackTop++] = mach->BreakType;
3290      mach->BreakType = TGSI_EXEC_BREAK_INSIDE_LOOP;
3291      break;
3292
3293   case TGSI_OPCODE_ENDFOR:
3294      assert(mach->LoopCounterStackTop > 0);
3295      micro_sub(&mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y],
3296                &mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y],
3297                &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C]);
3298      /* update LoopMask */
3299      if (mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y].f[0] <= 0.0f) {
3300         mach->LoopMask &= ~0x1;
3301      }
3302      if (mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y].f[1] <= 0.0f) {
3303         mach->LoopMask &= ~0x2;
3304      }
3305      if (mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y].f[2] <= 0.0f) {
3306         mach->LoopMask &= ~0x4;
3307      }
3308      if (mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y].f[3] <= 0.0f) {
3309         mach->LoopMask &= ~0x8;
3310      }
3311      micro_add(&mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_X],
3312                &mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_X],
3313                &mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Z]);
3314      assert(mach->LoopLabelStackTop > 0);
3315      inst = mach->Instructions + mach->LoopLabelStack[mach->LoopLabelStackTop - 1];
3316      STORE(&mach->LoopCounterStack[mach->LoopCounterStackTop].xyzw[CHAN_X], 0, CHAN_X);
3317      /* Restore ContMask, but don't pop */
3318      assert(mach->ContStackTop > 0);
3319      mach->ContMask = mach->ContStack[mach->ContStackTop - 1];
3320      UPDATE_EXEC_MASK(mach);
3321      if (mach->ExecMask) {
3322         /* repeat loop: jump to instruction just past BGNLOOP */
3323         assert(mach->LoopLabelStackTop > 0);
3324         *pc = mach->LoopLabelStack[mach->LoopLabelStackTop - 1] + 1;
3325      }
3326      else {
3327         /* exit loop: pop LoopMask */
3328         assert(mach->LoopStackTop > 0);
3329         mach->LoopMask = mach->LoopStack[--mach->LoopStackTop];
3330         /* pop ContMask */
3331         assert(mach->ContStackTop > 0);
3332         mach->ContMask = mach->ContStack[--mach->ContStackTop];
3333         assert(mach->LoopLabelStackTop > 0);
3334         --mach->LoopLabelStackTop;
3335         assert(mach->LoopCounterStackTop > 0);
3336         --mach->LoopCounterStackTop;
3337
3338         mach->BreakType = mach->BreakStack[--mach->BreakStackTop];
3339      }
3340      UPDATE_EXEC_MASK(mach);
3341      break;
3342
3343   case TGSI_OPCODE_ENDLOOP:
3344      /* Restore ContMask, but don't pop */
3345      assert(mach->ContStackTop > 0);
3346      mach->ContMask = mach->ContStack[mach->ContStackTop - 1];
3347      UPDATE_EXEC_MASK(mach);
3348      if (mach->ExecMask) {
3349         /* repeat loop: jump to instruction just past BGNLOOP */
3350         assert(mach->LoopLabelStackTop > 0);
3351         *pc = mach->LoopLabelStack[mach->LoopLabelStackTop - 1] + 1;
3352      }
3353      else {
3354         /* exit loop: pop LoopMask */
3355         assert(mach->LoopStackTop > 0);
3356         mach->LoopMask = mach->LoopStack[--mach->LoopStackTop];
3357         /* pop ContMask */
3358         assert(mach->ContStackTop > 0);
3359         mach->ContMask = mach->ContStack[--mach->ContStackTop];
3360         assert(mach->LoopLabelStackTop > 0);
3361         --mach->LoopLabelStackTop;
3362
3363         mach->BreakType = mach->BreakStack[--mach->BreakStackTop];
3364      }
3365      UPDATE_EXEC_MASK(mach);
3366      break;
3367
3368   case TGSI_OPCODE_BRK:
3369      exec_break(mach);
3370      break;
3371
3372   case TGSI_OPCODE_CONT:
3373      /* turn off cont channels for each enabled exec channel */
3374      mach->ContMask &= ~mach->ExecMask;
3375      /* Todo: if mach->LoopMask == 0, jump to end of loop */
3376      UPDATE_EXEC_MASK(mach);
3377      break;
3378
3379   case TGSI_OPCODE_BGNSUB:
3380      /* no-op */
3381      break;
3382
3383   case TGSI_OPCODE_ENDSUB:
3384      /*
3385       * XXX: This really should be a no-op. We should never reach this opcode.
3386       */
3387
3388      assert(mach->CallStackTop > 0);
3389      mach->CallStackTop--;
3390
3391      mach->CondStackTop = mach->CallStack[mach->CallStackTop].CondStackTop;
3392      mach->CondMask = mach->CondStack[mach->CondStackTop];
3393
3394      mach->LoopStackTop = mach->CallStack[mach->CallStackTop].LoopStackTop;
3395      mach->LoopMask = mach->LoopStack[mach->LoopStackTop];
3396
3397      mach->ContStackTop = mach->CallStack[mach->CallStackTop].ContStackTop;
3398      mach->ContMask = mach->ContStack[mach->ContStackTop];
3399
3400      mach->SwitchStackTop = mach->CallStack[mach->CallStackTop].SwitchStackTop;
3401      mach->Switch = mach->SwitchStack[mach->SwitchStackTop];
3402
3403      mach->BreakStackTop = mach->CallStack[mach->CallStackTop].BreakStackTop;
3404      mach->BreakType = mach->BreakStack[mach->BreakStackTop];
3405
3406      assert(mach->FuncStackTop > 0);
3407      mach->FuncMask = mach->FuncStack[--mach->FuncStackTop];
3408
3409      *pc = mach->CallStack[mach->CallStackTop].ReturnAddr;
3410
3411      UPDATE_EXEC_MASK(mach);
3412      break;
3413
3414   case TGSI_OPCODE_NOP:
3415      break;
3416
3417   case TGSI_OPCODE_BREAKC:
3418      FETCH(&r[0], 0, CHAN_X);
3419      /* update CondMask */
3420      if (r[0].u[0] && (mach->ExecMask & 0x1)) {
3421         mach->LoopMask &= ~0x1;
3422      }
3423      if (r[0].u[1] && (mach->ExecMask & 0x2)) {
3424         mach->LoopMask &= ~0x2;
3425      }
3426      if (r[0].u[2] && (mach->ExecMask & 0x4)) {
3427         mach->LoopMask &= ~0x4;
3428      }
3429      if (r[0].u[3] && (mach->ExecMask & 0x8)) {
3430         mach->LoopMask &= ~0x8;
3431      }
3432      /* Todo: if mach->LoopMask == 0, jump to end of loop */
3433      UPDATE_EXEC_MASK(mach);
3434      break;
3435
3436   case TGSI_OPCODE_F2I:
3437      exec_vector_unary(mach, inst, micro_f2i, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_FLOAT);
3438      break;
3439
3440   case TGSI_OPCODE_IDIV:
3441      exec_vector_binary(mach, inst, micro_idiv, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
3442      break;
3443
3444   case TGSI_OPCODE_IMAX:
3445      exec_vector_binary(mach, inst, micro_imax, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
3446      break;
3447
3448   case TGSI_OPCODE_IMIN:
3449      exec_vector_binary(mach, inst, micro_imin, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
3450      break;
3451
3452   case TGSI_OPCODE_INEG:
3453      exec_vector_unary(mach, inst, micro_ineg, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
3454      break;
3455
3456   case TGSI_OPCODE_ISGE:
3457      exec_vector_binary(mach, inst, micro_isge, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
3458      break;
3459
3460   case TGSI_OPCODE_ISHR:
3461      exec_vector_binary(mach, inst, micro_ishr, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
3462      break;
3463
3464   case TGSI_OPCODE_ISLT:
3465      exec_vector_binary(mach, inst, micro_islt, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
3466      break;
3467
3468   case TGSI_OPCODE_F2U:
3469      exec_vector_unary(mach, inst, micro_f2u, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_FLOAT);
3470      break;
3471
3472   case TGSI_OPCODE_U2F:
3473      exec_vector_unary(mach, inst, micro_u2f, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_UINT);
3474      break;
3475
3476   case TGSI_OPCODE_UADD:
3477      exec_vector_binary(mach, inst, micro_uadd, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3478      break;
3479
3480   case TGSI_OPCODE_UDIV:
3481      exec_vector_binary(mach, inst, micro_udiv, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3482      break;
3483
3484   case TGSI_OPCODE_UMAD:
3485      exec_vector_trinary(mach, inst, micro_umad, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3486      break;
3487
3488   case TGSI_OPCODE_UMAX:
3489      exec_vector_binary(mach, inst, micro_umax, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3490      break;
3491
3492   case TGSI_OPCODE_UMIN:
3493      exec_vector_binary(mach, inst, micro_umin, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3494      break;
3495
3496   case TGSI_OPCODE_UMOD:
3497      exec_vector_binary(mach, inst, micro_umod, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3498      break;
3499
3500   case TGSI_OPCODE_UMUL:
3501      exec_vector_binary(mach, inst, micro_umul, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3502      break;
3503
3504   case TGSI_OPCODE_USEQ:
3505      exec_vector_binary(mach, inst, micro_useq, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3506      break;
3507
3508   case TGSI_OPCODE_USGE:
3509      exec_vector_binary(mach, inst, micro_usge, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3510      break;
3511
3512   case TGSI_OPCODE_USHR:
3513      exec_vector_binary(mach, inst, micro_ushr, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3514      break;
3515
3516   case TGSI_OPCODE_USLT:
3517      exec_vector_binary(mach, inst, micro_uslt, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3518      break;
3519
3520   case TGSI_OPCODE_USNE:
3521      exec_vector_binary(mach, inst, micro_usne, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3522      break;
3523
3524   case TGSI_OPCODE_SWITCH:
3525      exec_switch(mach, inst);
3526      break;
3527
3528   case TGSI_OPCODE_CASE:
3529      exec_case(mach, inst);
3530      break;
3531
3532   case TGSI_OPCODE_DEFAULT:
3533      exec_default(mach);
3534      break;
3535
3536   case TGSI_OPCODE_ENDSWITCH:
3537      exec_endswitch(mach);
3538      break;
3539
3540   default:
3541      assert( 0 );
3542   }
3543}
3544
3545
3546#define DEBUG_EXECUTION 0
3547
3548
3549/**
3550 * Run TGSI interpreter.
3551 * \return bitmask of "alive" quad components
3552 */
3553uint
3554tgsi_exec_machine_run( struct tgsi_exec_machine *mach )
3555{
3556   uint i;
3557   int pc = 0;
3558
3559   mach->CondMask = 0xf;
3560   mach->LoopMask = 0xf;
3561   mach->ContMask = 0xf;
3562   mach->FuncMask = 0xf;
3563   mach->ExecMask = 0xf;
3564
3565   mach->Switch.mask = 0xf;
3566
3567   assert(mach->CondStackTop == 0);
3568   assert(mach->LoopStackTop == 0);
3569   assert(mach->ContStackTop == 0);
3570   assert(mach->SwitchStackTop == 0);
3571   assert(mach->BreakStackTop == 0);
3572   assert(mach->CallStackTop == 0);
3573
3574   mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] = 0;
3575   mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] = 0;
3576
3577   if( mach->Processor == TGSI_PROCESSOR_GEOMETRY ) {
3578      mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0] = 0;
3579      mach->Primitives[0] = 0;
3580   }
3581
3582   for (i = 0; i < QUAD_SIZE; i++) {
3583      mach->Temps[TEMP_CC_I].xyzw[TEMP_CC_C].u[i] =
3584         (TGSI_EXEC_CC_EQ << TGSI_EXEC_CC_X_SHIFT) |
3585         (TGSI_EXEC_CC_EQ << TGSI_EXEC_CC_Y_SHIFT) |
3586         (TGSI_EXEC_CC_EQ << TGSI_EXEC_CC_Z_SHIFT) |
3587         (TGSI_EXEC_CC_EQ << TGSI_EXEC_CC_W_SHIFT);
3588   }
3589
3590   /* execute declarations (interpolants) */
3591   for (i = 0; i < mach->NumDeclarations; i++) {
3592      exec_declaration( mach, mach->Declarations+i );
3593   }
3594
3595   {
3596#if DEBUG_EXECUTION
3597      struct tgsi_exec_vector temps[TGSI_EXEC_NUM_TEMPS + TGSI_EXEC_NUM_TEMP_EXTRAS];
3598      struct tgsi_exec_vector outputs[PIPE_MAX_ATTRIBS];
3599      uint inst = 1;
3600
3601      memcpy(temps, mach->Temps, sizeof(temps));
3602      memcpy(outputs, mach->Outputs, sizeof(outputs));
3603#endif
3604
3605      /* execute instructions, until pc is set to -1 */
3606      while (pc != -1) {
3607
3608#if DEBUG_EXECUTION
3609         uint i;
3610
3611         tgsi_dump_instruction(&mach->Instructions[pc], inst++);
3612#endif
3613
3614         assert(pc < (int) mach->NumInstructions);
3615         exec_instruction(mach, mach->Instructions + pc, &pc);
3616
3617#if DEBUG_EXECUTION
3618         for (i = 0; i < TGSI_EXEC_NUM_TEMPS + TGSI_EXEC_NUM_TEMP_EXTRAS; i++) {
3619            if (memcmp(&temps[i], &mach->Temps[i], sizeof(temps[i]))) {
3620               uint j;
3621
3622               memcpy(&temps[i], &mach->Temps[i], sizeof(temps[i]));
3623               debug_printf("TEMP[%2u] = ", i);
3624               for (j = 0; j < 4; j++) {
3625                  if (j > 0) {
3626                     debug_printf("           ");
3627                  }
3628                  debug_printf("(%6f %u, %6f %u, %6f %u, %6f %u)\n",
3629                               temps[i].xyzw[0].f[j], temps[i].xyzw[0].u[j],
3630                               temps[i].xyzw[1].f[j], temps[i].xyzw[1].u[j],
3631                               temps[i].xyzw[2].f[j], temps[i].xyzw[2].u[j],
3632                               temps[i].xyzw[3].f[j], temps[i].xyzw[3].u[j]);
3633               }
3634            }
3635         }
3636         for (i = 0; i < PIPE_MAX_ATTRIBS; i++) {
3637            if (memcmp(&outputs[i], &mach->Outputs[i], sizeof(outputs[i]))) {
3638               uint j;
3639
3640               memcpy(&outputs[i], &mach->Outputs[i], sizeof(outputs[i]));
3641               debug_printf("OUT[%2u] =  ", i);
3642               for (j = 0; j < 4; j++) {
3643                  if (j > 0) {
3644                     debug_printf("           ");
3645                  }
3646                  debug_printf("(%6f %u, %6f %u, %6f %u, %6f %u)\n",
3647                               outputs[i].xyzw[0].f[j], outputs[i].xyzw[0].u[j],
3648                               outputs[i].xyzw[1].f[j], outputs[i].xyzw[1].u[j],
3649                               outputs[i].xyzw[2].f[j], outputs[i].xyzw[2].u[j],
3650                               outputs[i].xyzw[3].f[j], outputs[i].xyzw[3].u[j]);
3651               }
3652            }
3653         }
3654#endif
3655      }
3656   }
3657
3658#if 0
3659   /* we scale from floats in [0,1] to Zbuffer ints in sp_quad_depth_test.c */
3660   if (mach->Processor == TGSI_PROCESSOR_FRAGMENT) {
3661      /*
3662       * Scale back depth component.
3663       */
3664      for (i = 0; i < 4; i++)
3665         mach->Outputs[0].xyzw[2].f[i] *= ctx->DrawBuffer->_DepthMaxF;
3666   }
3667#endif
3668
3669   assert(mach->CondStackTop == 0);
3670   assert(mach->LoopStackTop == 0);
3671   assert(mach->ContStackTop == 0);
3672   assert(mach->SwitchStackTop == 0);
3673   assert(mach->BreakStackTop == 0);
3674   assert(mach->CallStackTop == 0);
3675
3676   return ~mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
3677}
3678