tgsi_exec.c revision 89d8577fb3036547ef0b47498cc8dc5c77f886e0
1/**************************************************************************
2 *
3 * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28/**
29 * TGSI interpreter/executor.
30 *
31 * Flow control information:
32 *
33 * Since we operate on 'quads' (4 pixels or 4 vertices in parallel)
34 * flow control statements (IF/ELSE/ENDIF, LOOP/ENDLOOP) require special
35 * care since a condition may be true for some quad components but false
36 * for other components.
37 *
38 * We basically execute all statements (even if they're in the part of
39 * an IF/ELSE clause that's "not taken") and use a special mask to
40 * control writing to destination registers.  This is the ExecMask.
41 * See store_dest().
42 *
 * The ExecMask is computed from four other masks (CondMask, LoopMask,
 * ContMask and FuncMask; see UPDATE_EXEC_MASK).  The first three are
 * controlled by the flow control instructions (namely: IF/ELSE/ENDIF,
 * LOOP/ENDLOOP and CONT).
46 *
47 *
48 * Authors:
49 *   Michal Krol
50 *   Brian Paul
51 */
52
53#include "pipe/p_compiler.h"
54#include "pipe/p_state.h"
55#include "pipe/p_shader_tokens.h"
56#include "tgsi/tgsi_dump.h"
57#include "tgsi/tgsi_parse.h"
58#include "tgsi/tgsi_util.h"
59#include "tgsi_exec.h"
60#include "util/u_memory.h"
61#include "util/u_math.h"
62
/* When non-zero, micro_exp2(), micro_lg2() and micro_pow() use the
 * util_fast_* helpers instead of the C math library calls. */
#define FAST_MATH 1

/* Lane indices inside a channel, named after the lane's position in the
 * quad.  micro_ddx() and micro_ddy() subtract lanes selected by these. */
#define TILE_TOP_LEFT     0
#define TILE_TOP_RIGHT    1
#define TILE_BOTTOM_LEFT  2
#define TILE_BOTTOM_RIGHT 3

/* Channel indices (X, Y, Z, W). */
#define CHAN_X  0
#define CHAN_Y  1
#define CHAN_Z  2
#define CHAN_W  3
74
/*
 * Shorthand locations of various utility registers (_I = Index, _C = Channel).
 *
 * A TEMP_x_I / TEMP_x_C pair is used together as
 *    mach->Temps[TEMP_x_I].xyzw[TEMP_x_C]
 * (tgsi_exec_machine_create() fills in the constant-valued ones this way).
 * TEMP_R0 and TEMP_P0 are plain aliases without a channel part.
 */
#define TEMP_0_I           TGSI_EXEC_TEMP_00000000_I
#define TEMP_0_C           TGSI_EXEC_TEMP_00000000_C
#define TEMP_7F_I          TGSI_EXEC_TEMP_7FFFFFFF_I
#define TEMP_7F_C          TGSI_EXEC_TEMP_7FFFFFFF_C
#define TEMP_80_I          TGSI_EXEC_TEMP_80000000_I
#define TEMP_80_C          TGSI_EXEC_TEMP_80000000_C
#define TEMP_FF_I          TGSI_EXEC_TEMP_FFFFFFFF_I
#define TEMP_FF_C          TGSI_EXEC_TEMP_FFFFFFFF_C
#define TEMP_1_I           TGSI_EXEC_TEMP_ONE_I
#define TEMP_1_C           TGSI_EXEC_TEMP_ONE_C
#define TEMP_2_I           TGSI_EXEC_TEMP_TWO_I
#define TEMP_2_C           TGSI_EXEC_TEMP_TWO_C
#define TEMP_128_I         TGSI_EXEC_TEMP_128_I
#define TEMP_128_C         TGSI_EXEC_TEMP_128_C
#define TEMP_M128_I        TGSI_EXEC_TEMP_MINUS_128_I
#define TEMP_M128_C        TGSI_EXEC_TEMP_MINUS_128_C
#define TEMP_KILMASK_I     TGSI_EXEC_TEMP_KILMASK_I
#define TEMP_KILMASK_C     TGSI_EXEC_TEMP_KILMASK_C
#define TEMP_OUTPUT_I      TGSI_EXEC_TEMP_OUTPUT_I
#define TEMP_OUTPUT_C      TGSI_EXEC_TEMP_OUTPUT_C
#define TEMP_PRIMITIVE_I   TGSI_EXEC_TEMP_PRIMITIVE_I
#define TEMP_PRIMITIVE_C   TGSI_EXEC_TEMP_PRIMITIVE_C
#define TEMP_CC_I          TGSI_EXEC_TEMP_CC_I
#define TEMP_CC_C          TGSI_EXEC_TEMP_CC_C
#define TEMP_3_I           TGSI_EXEC_TEMP_THREE_I
#define TEMP_3_C           TGSI_EXEC_TEMP_THREE_C
#define TEMP_HALF_I        TGSI_EXEC_TEMP_HALF_I
#define TEMP_HALF_C        TGSI_EXEC_TEMP_HALF_C
#define TEMP_R0            TGSI_EXEC_TEMP_R0
#define TEMP_P0            TGSI_EXEC_TEMP_P0
108
/**
 * Non-zero when bit CHAN is set in the write mask of the first destination
 * register (Dst[0]) of INST.  INST is an instruction object, not a pointer.
 */
#define IS_CHANNEL_ENABLED(INST, CHAN)\
   ((INST).Dst[0].Register.WriteMask & (1 << (CHAN)))

/** Same test as IS_CHANNEL_ENABLED, against the second destination (Dst[1]). */
#define IS_CHANNEL_ENABLED2(INST, CHAN)\
   ((INST).Dst[1].Register.WriteMask & (1 << (CHAN)))

/**
 * Loop header: executes the statement that follows once for every CHAN in
 * [0, NUM_CHANNELS) whose bit is set in Dst[0]'s write mask.  CHAN must be
 * an assignable variable.  The expansion ends in an unbraced `if`, so an
 * `else` written after the loop body would bind to that hidden `if`.
 */
#define FOR_EACH_ENABLED_CHANNEL(INST, CHAN)\
   for (CHAN = 0; CHAN < NUM_CHANNELS; CHAN++)\
      if (IS_CHANNEL_ENABLED( INST, CHAN ))

/** Same loop as FOR_EACH_ENABLED_CHANNEL, driven by Dst[1]'s write mask. */
#define FOR_EACH_ENABLED_CHANNEL2(INST, CHAN)\
   for (CHAN = 0; CHAN < NUM_CHANNELS; CHAN++)\
      if (IS_CHANNEL_ENABLED2( INST, CHAN ))
122
123
/**
 * Recompute MACH->ExecMask as the bitwise AND of CondMask, LoopMask,
 * ContMask and FuncMask.
 *
 * MACH is a pointer expression.  It is parenthesized at every use so that
 * arguments such as `p + 1` expand correctly, and the whole expansion is
 * parenthesized so it behaves as a single expression.  MACH is evaluated
 * several times and therefore must not have side effects.
 */
#define UPDATE_EXEC_MASK(MACH) \
      ((MACH)->ExecMask = (MACH)->CondMask & (MACH)->LoopMask & \
                          (MACH)->ContMask & (MACH)->FuncMask)
127
128
129static const union tgsi_exec_channel ZeroVec =
130   { { 0.0, 0.0, 0.0, 0.0 } };
131
132
133#ifdef DEBUG
/**
 * Debug-only sanity check: assert that none of the four float lanes of
 * chan is reported as infinite or NaN by util_is_inf_or_nan().
 */
static void
check_inf_or_nan(const union tgsi_exec_channel *chan)
{
   assert(!util_is_inf_or_nan(chan->f[0]));
   assert(!util_is_inf_or_nan(chan->f[1]));
   assert(!util_is_inf_or_nan(chan->f[2]));
   assert(!util_is_inf_or_nan(chan->f[3]));
}
142#endif
143
144
145#ifdef DEBUG
146static void
147print_chan(const char *msg, const union tgsi_exec_channel *chan)
148{
149   debug_printf("%s = {%f, %f, %f, %f}\n",
150                msg, chan->f[0], chan->f[1], chan->f[2], chan->f[3]);
151}
152#endif
153
154
155#ifdef DEBUG
156static void
157print_temp(const struct tgsi_exec_machine *mach, uint index)
158{
159   const struct tgsi_exec_vector *tmp = &mach->Temps[index];
160   int i;
161   debug_printf("Temp[%u] =\n", index);
162   for (i = 0; i < 4; i++) {
163      debug_printf("  %c: { %f, %f, %f, %f }\n",
164                   "XYZW"[i],
165                   tmp->xyzw[i].f[0],
166                   tmp->xyzw[i].f[1],
167                   tmp->xyzw[i].f[2],
168                   tmp->xyzw[i].f[3]);
169   }
170}
171#endif
172
173
174/**
175 * Check if there's a potential src/dst register data dependency when
176 * using SOA execution.
177 * Example:
178 *   MOV T, T.yxwz;
179 * This would expand into:
180 *   MOV t0, t1;
181 *   MOV t1, t0;
182 *   MOV t2, t3;
183 *   MOV t3, t2;
184 * The second instruction will have the wrong value for t0 if executed as-is.
185 */
186boolean
187tgsi_check_soa_dependencies(const struct tgsi_full_instruction *inst)
188{
189   uint i, chan;
190
191   uint writemask = inst->Dst[0].Register.WriteMask;
192   if (writemask == TGSI_WRITEMASK_X ||
193       writemask == TGSI_WRITEMASK_Y ||
194       writemask == TGSI_WRITEMASK_Z ||
195       writemask == TGSI_WRITEMASK_W ||
196       writemask == TGSI_WRITEMASK_NONE) {
197      /* no chance of data dependency */
198      return FALSE;
199   }
200
201   /* loop over src regs */
202   for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
203      if ((inst->Src[i].Register.File ==
204           inst->Dst[0].Register.File) &&
205          (inst->Src[i].Register.Index ==
206           inst->Dst[0].Register.Index)) {
207         /* loop over dest channels */
208         uint channelsWritten = 0x0;
209         FOR_EACH_ENABLED_CHANNEL(*inst, chan) {
210            /* check if we're reading a channel that's been written */
211            uint swizzle = tgsi_util_get_full_src_register_swizzle(&inst->Src[i], chan);
212            if (channelsWritten & (1 << swizzle)) {
213               return TRUE;
214            }
215
216            channelsWritten |= (1 << chan);
217         }
218      }
219   }
220   return FALSE;
221}
222
223
224/**
225 * Initialize machine state by expanding tokens to full instructions,
226 * allocating temporary storage, setting up constants, etc.
227 * After this, we can call tgsi_exec_machine_run() many times.
228 */
229void
230tgsi_exec_machine_bind_shader(
231   struct tgsi_exec_machine *mach,
232   const struct tgsi_token *tokens,
233   uint numSamplers,
234   struct tgsi_sampler **samplers)
235{
236   uint k;
237   struct tgsi_parse_context parse;
238   struct tgsi_exec_labels *labels = &mach->Labels;
239   struct tgsi_full_instruction *instructions;
240   struct tgsi_full_declaration *declarations;
241   uint maxInstructions = 10, numInstructions = 0;
242   uint maxDeclarations = 10, numDeclarations = 0;
243   uint instno = 0;
244
245#if 0
246   tgsi_dump(tokens, 0);
247#endif
248
249   util_init_math();
250
251   mach->Tokens = tokens;
252   mach->Samplers = samplers;
253
254   k = tgsi_parse_init (&parse, mach->Tokens);
255   if (k != TGSI_PARSE_OK) {
256      debug_printf( "Problem parsing!\n" );
257      return;
258   }
259
260   mach->Processor = parse.FullHeader.Processor.Processor;
261   mach->ImmLimit = 0;
262   labels->count = 0;
263
264   declarations = (struct tgsi_full_declaration *)
265      MALLOC( maxDeclarations * sizeof(struct tgsi_full_declaration) );
266
267   if (!declarations) {
268      return;
269   }
270
271   instructions = (struct tgsi_full_instruction *)
272      MALLOC( maxInstructions * sizeof(struct tgsi_full_instruction) );
273
274   if (!instructions) {
275      FREE( declarations );
276      return;
277   }
278
279   while( !tgsi_parse_end_of_tokens( &parse ) ) {
280      uint pointer = parse.Position;
281      uint i;
282
283      tgsi_parse_token( &parse );
284      switch( parse.FullToken.Token.Type ) {
285      case TGSI_TOKEN_TYPE_DECLARATION:
286         /* save expanded declaration */
287         if (numDeclarations == maxDeclarations) {
288            declarations = REALLOC(declarations,
289                                   maxDeclarations
290                                   * sizeof(struct tgsi_full_declaration),
291                                   (maxDeclarations + 10)
292                                   * sizeof(struct tgsi_full_declaration));
293            maxDeclarations += 10;
294         }
295         memcpy(declarations + numDeclarations,
296                &parse.FullToken.FullDeclaration,
297                sizeof(declarations[0]));
298         numDeclarations++;
299         break;
300
301      case TGSI_TOKEN_TYPE_IMMEDIATE:
302         {
303            uint size = parse.FullToken.FullImmediate.Immediate.NrTokens - 1;
304            assert( size <= 4 );
305            assert( mach->ImmLimit + 1 <= TGSI_EXEC_NUM_IMMEDIATES );
306
307            for( i = 0; i < size; i++ ) {
308               mach->Imms[mach->ImmLimit][i] =
309		  parse.FullToken.FullImmediate.u[i].Float;
310            }
311            mach->ImmLimit += 1;
312         }
313         break;
314
315      case TGSI_TOKEN_TYPE_INSTRUCTION:
316         assert( labels->count < MAX_LABELS );
317
318         labels->labels[labels->count][0] = instno;
319         labels->labels[labels->count][1] = pointer;
320         labels->count++;
321
322         /* save expanded instruction */
323         if (numInstructions == maxInstructions) {
324            instructions = REALLOC(instructions,
325                                   maxInstructions
326                                   * sizeof(struct tgsi_full_instruction),
327                                   (maxInstructions + 10)
328                                   * sizeof(struct tgsi_full_instruction));
329            maxInstructions += 10;
330         }
331
332         memcpy(instructions + numInstructions,
333                &parse.FullToken.FullInstruction,
334                sizeof(instructions[0]));
335
336         numInstructions++;
337         break;
338
339      case TGSI_TOKEN_TYPE_PROPERTY:
340         break;
341
342      default:
343         assert( 0 );
344      }
345   }
346   tgsi_parse_free (&parse);
347
348   if (mach->Declarations) {
349      FREE( mach->Declarations );
350   }
351   mach->Declarations = declarations;
352   mach->NumDeclarations = numDeclarations;
353
354   if (mach->Instructions) {
355      FREE( mach->Instructions );
356   }
357   mach->Instructions = instructions;
358   mach->NumInstructions = numInstructions;
359}
360
361
362struct tgsi_exec_machine *
363tgsi_exec_machine_create( void )
364{
365   struct tgsi_exec_machine *mach;
366   uint i;
367
368   mach = align_malloc( sizeof *mach, 16 );
369   if (!mach)
370      goto fail;
371
372   memset(mach, 0, sizeof(*mach));
373
374   mach->Addrs = &mach->Temps[TGSI_EXEC_TEMP_ADDR];
375   mach->MaxGeometryShaderOutputs = TGSI_MAX_TOTAL_VERTICES;
376   mach->Predicates = &mach->Temps[TGSI_EXEC_TEMP_P0];
377
378   /* Setup constants. */
379   for( i = 0; i < 4; i++ ) {
380      mach->Temps[TEMP_0_I].xyzw[TEMP_0_C].u[i] = 0x00000000;
381      mach->Temps[TEMP_7F_I].xyzw[TEMP_7F_C].u[i] = 0x7FFFFFFF;
382      mach->Temps[TEMP_80_I].xyzw[TEMP_80_C].u[i] = 0x80000000;
383      mach->Temps[TEMP_FF_I].xyzw[TEMP_FF_C].u[i] = 0xFFFFFFFF;
384      mach->Temps[TEMP_1_I].xyzw[TEMP_1_C].f[i] = 1.0f;
385      mach->Temps[TEMP_2_I].xyzw[TEMP_2_C].f[i] = 2.0f;
386      mach->Temps[TEMP_128_I].xyzw[TEMP_128_C].f[i] = 128.0f;
387      mach->Temps[TEMP_M128_I].xyzw[TEMP_M128_C].f[i] = -128.0f;
388      mach->Temps[TEMP_3_I].xyzw[TEMP_3_C].f[i] = 3.0f;
389      mach->Temps[TEMP_HALF_I].xyzw[TEMP_HALF_C].f[i] = 0.5f;
390   }
391
392#ifdef DEBUG
393   /* silence warnings */
394   (void) print_chan;
395   (void) print_temp;
396#endif
397
398   return mach;
399
400fail:
401   align_free(mach);
402   return NULL;
403}
404
405
406void
407tgsi_exec_machine_destroy(struct tgsi_exec_machine *mach)
408{
409   if (mach) {
410      FREE(mach->Instructions);
411      FREE(mach->Declarations);
412   }
413
414   align_free(mach);
415}
416
417
418static void
419micro_abs(
420   union tgsi_exec_channel *dst,
421   const union tgsi_exec_channel *src )
422{
423   dst->f[0] = fabsf( src->f[0] );
424   dst->f[1] = fabsf( src->f[1] );
425   dst->f[2] = fabsf( src->f[2] );
426   dst->f[3] = fabsf( src->f[3] );
427}
428
429static void
430micro_add(
431   union tgsi_exec_channel *dst,
432   const union tgsi_exec_channel *src0,
433   const union tgsi_exec_channel *src1 )
434{
435   dst->f[0] = src0->f[0] + src1->f[0];
436   dst->f[1] = src0->f[1] + src1->f[1];
437   dst->f[2] = src0->f[2] + src1->f[2];
438   dst->f[3] = src0->f[3] + src1->f[3];
439}
440
441#if 0
442static void
443micro_iadd(
444   union tgsi_exec_channel *dst,
445   const union tgsi_exec_channel *src0,
446   const union tgsi_exec_channel *src1 )
447{
448   dst->i[0] = src0->i[0] + src1->i[0];
449   dst->i[1] = src0->i[1] + src1->i[1];
450   dst->i[2] = src0->i[2] + src1->i[2];
451   dst->i[3] = src0->i[3] + src1->i[3];
452}
453#endif
454
455static void
456micro_and(
457   union tgsi_exec_channel *dst,
458   const union tgsi_exec_channel *src0,
459   const union tgsi_exec_channel *src1 )
460{
461   dst->u[0] = src0->u[0] & src1->u[0];
462   dst->u[1] = src0->u[1] & src1->u[1];
463   dst->u[2] = src0->u[2] & src1->u[2];
464   dst->u[3] = src0->u[3] & src1->u[3];
465}
466
467static void
468micro_ceil(
469   union tgsi_exec_channel *dst,
470   const union tgsi_exec_channel *src )
471{
472   dst->f[0] = ceilf( src->f[0] );
473   dst->f[1] = ceilf( src->f[1] );
474   dst->f[2] = ceilf( src->f[2] );
475   dst->f[3] = ceilf( src->f[3] );
476}
477
478static void
479micro_cos(
480   union tgsi_exec_channel *dst,
481   const union tgsi_exec_channel *src )
482{
483   dst->f[0] = cosf( src->f[0] );
484   dst->f[1] = cosf( src->f[1] );
485   dst->f[2] = cosf( src->f[2] );
486   dst->f[3] = cosf( src->f[3] );
487}
488
489static void
490micro_ddx(
491   union tgsi_exec_channel *dst,
492   const union tgsi_exec_channel *src )
493{
494   dst->f[0] =
495   dst->f[1] =
496   dst->f[2] =
497   dst->f[3] = src->f[TILE_BOTTOM_RIGHT] - src->f[TILE_BOTTOM_LEFT];
498}
499
500static void
501micro_ddy(
502   union tgsi_exec_channel *dst,
503   const union tgsi_exec_channel *src )
504{
505   dst->f[0] =
506   dst->f[1] =
507   dst->f[2] =
508   dst->f[3] = src->f[TILE_BOTTOM_LEFT] - src->f[TILE_TOP_LEFT];
509}
510
511static void
512micro_div(
513   union tgsi_exec_channel *dst,
514   const union tgsi_exec_channel *src0,
515   const union tgsi_exec_channel *src1 )
516{
517   if (src1->f[0] != 0) {
518      dst->f[0] = src0->f[0] / src1->f[0];
519   }
520   if (src1->f[1] != 0) {
521      dst->f[1] = src0->f[1] / src1->f[1];
522   }
523   if (src1->f[2] != 0) {
524      dst->f[2] = src0->f[2] / src1->f[2];
525   }
526   if (src1->f[3] != 0) {
527      dst->f[3] = src0->f[3] / src1->f[3];
528   }
529}
530
531#if 0
532static void
533micro_udiv(
534   union tgsi_exec_channel *dst,
535   const union tgsi_exec_channel *src0,
536   const union tgsi_exec_channel *src1 )
537{
538   dst->u[0] = src0->u[0] / src1->u[0];
539   dst->u[1] = src0->u[1] / src1->u[1];
540   dst->u[2] = src0->u[2] / src1->u[2];
541   dst->u[3] = src0->u[3] / src1->u[3];
542}
543#endif
544
545static void
546micro_eq(
547   union tgsi_exec_channel *dst,
548   const union tgsi_exec_channel *src0,
549   const union tgsi_exec_channel *src1,
550   const union tgsi_exec_channel *src2,
551   const union tgsi_exec_channel *src3 )
552{
553   dst->f[0] = src0->f[0] == src1->f[0] ? src2->f[0] : src3->f[0];
554   dst->f[1] = src0->f[1] == src1->f[1] ? src2->f[1] : src3->f[1];
555   dst->f[2] = src0->f[2] == src1->f[2] ? src2->f[2] : src3->f[2];
556   dst->f[3] = src0->f[3] == src1->f[3] ? src2->f[3] : src3->f[3];
557}
558
559#if 0
560static void
561micro_ieq(
562   union tgsi_exec_channel *dst,
563   const union tgsi_exec_channel *src0,
564   const union tgsi_exec_channel *src1,
565   const union tgsi_exec_channel *src2,
566   const union tgsi_exec_channel *src3 )
567{
568   dst->i[0] = src0->i[0] == src1->i[0] ? src2->i[0] : src3->i[0];
569   dst->i[1] = src0->i[1] == src1->i[1] ? src2->i[1] : src3->i[1];
570   dst->i[2] = src0->i[2] == src1->i[2] ? src2->i[2] : src3->i[2];
571   dst->i[3] = src0->i[3] == src1->i[3] ? src2->i[3] : src3->i[3];
572}
573#endif
574
575static void
576micro_exp2(
577   union tgsi_exec_channel *dst,
578   const union tgsi_exec_channel *src)
579{
580#if FAST_MATH
581   dst->f[0] = util_fast_exp2( src->f[0] );
582   dst->f[1] = util_fast_exp2( src->f[1] );
583   dst->f[2] = util_fast_exp2( src->f[2] );
584   dst->f[3] = util_fast_exp2( src->f[3] );
585#else
586
587#if DEBUG
588   /* Inf is okay for this instruction, so clamp it to silence assertions. */
589   uint i;
590   union tgsi_exec_channel clamped;
591
592   for (i = 0; i < 4; i++) {
593      if (src->f[i] > 127.99999f) {
594         clamped.f[i] = 127.99999f;
595      } else if (src->f[i] < -126.99999f) {
596         clamped.f[i] = -126.99999f;
597      } else {
598         clamped.f[i] = src->f[i];
599      }
600   }
601   src = &clamped;
602#endif
603
604   dst->f[0] = powf( 2.0f, src->f[0] );
605   dst->f[1] = powf( 2.0f, src->f[1] );
606   dst->f[2] = powf( 2.0f, src->f[2] );
607   dst->f[3] = powf( 2.0f, src->f[3] );
608#endif
609}
610
611#if 0
612static void
613micro_f2ut(
614   union tgsi_exec_channel *dst,
615   const union tgsi_exec_channel *src )
616{
617   dst->u[0] = (uint) src->f[0];
618   dst->u[1] = (uint) src->f[1];
619   dst->u[2] = (uint) src->f[2];
620   dst->u[3] = (uint) src->f[3];
621}
622#endif
623
624static void
625micro_float_clamp(union tgsi_exec_channel *dst,
626                  const union tgsi_exec_channel *src)
627{
628   uint i;
629
630   for (i = 0; i < 4; i++) {
631      if (src->f[i] > 0.0f) {
632         if (src->f[i] > 1.884467e+019f)
633            dst->f[i] = 1.884467e+019f;
634         else if (src->f[i] < 5.42101e-020f)
635            dst->f[i] = 5.42101e-020f;
636         else
637            dst->f[i] = src->f[i];
638      }
639      else {
640         if (src->f[i] < -1.884467e+019f)
641            dst->f[i] = -1.884467e+019f;
642         else if (src->f[i] > -5.42101e-020f)
643            dst->f[i] = -5.42101e-020f;
644         else
645            dst->f[i] = src->f[i];
646      }
647   }
648}
649
650static void
651micro_flr(
652   union tgsi_exec_channel *dst,
653   const union tgsi_exec_channel *src )
654{
655   dst->f[0] = floorf( src->f[0] );
656   dst->f[1] = floorf( src->f[1] );
657   dst->f[2] = floorf( src->f[2] );
658   dst->f[3] = floorf( src->f[3] );
659}
660
661static void
662micro_frc(
663   union tgsi_exec_channel *dst,
664   const union tgsi_exec_channel *src )
665{
666   dst->f[0] = src->f[0] - floorf( src->f[0] );
667   dst->f[1] = src->f[1] - floorf( src->f[1] );
668   dst->f[2] = src->f[2] - floorf( src->f[2] );
669   dst->f[3] = src->f[3] - floorf( src->f[3] );
670}
671
672static void
673micro_i2f(
674   union tgsi_exec_channel *dst,
675   const union tgsi_exec_channel *src )
676{
677   dst->f[0] = (float) src->i[0];
678   dst->f[1] = (float) src->i[1];
679   dst->f[2] = (float) src->i[2];
680   dst->f[3] = (float) src->i[3];
681}
682
683static void
684micro_lg2(
685   union tgsi_exec_channel *dst,
686   const union tgsi_exec_channel *src )
687{
688#if FAST_MATH
689   dst->f[0] = util_fast_log2( src->f[0] );
690   dst->f[1] = util_fast_log2( src->f[1] );
691   dst->f[2] = util_fast_log2( src->f[2] );
692   dst->f[3] = util_fast_log2( src->f[3] );
693#else
694   dst->f[0] = logf( src->f[0] ) * 1.442695f;
695   dst->f[1] = logf( src->f[1] ) * 1.442695f;
696   dst->f[2] = logf( src->f[2] ) * 1.442695f;
697   dst->f[3] = logf( src->f[3] ) * 1.442695f;
698#endif
699}
700
701static void
702micro_le(
703   union tgsi_exec_channel *dst,
704   const union tgsi_exec_channel *src0,
705   const union tgsi_exec_channel *src1,
706   const union tgsi_exec_channel *src2,
707   const union tgsi_exec_channel *src3 )
708{
709   dst->f[0] = src0->f[0] <= src1->f[0] ? src2->f[0] : src3->f[0];
710   dst->f[1] = src0->f[1] <= src1->f[1] ? src2->f[1] : src3->f[1];
711   dst->f[2] = src0->f[2] <= src1->f[2] ? src2->f[2] : src3->f[2];
712   dst->f[3] = src0->f[3] <= src1->f[3] ? src2->f[3] : src3->f[3];
713}
714
715static void
716micro_lt(
717   union tgsi_exec_channel *dst,
718   const union tgsi_exec_channel *src0,
719   const union tgsi_exec_channel *src1,
720   const union tgsi_exec_channel *src2,
721   const union tgsi_exec_channel *src3 )
722{
723   dst->f[0] = src0->f[0] < src1->f[0] ? src2->f[0] : src3->f[0];
724   dst->f[1] = src0->f[1] < src1->f[1] ? src2->f[1] : src3->f[1];
725   dst->f[2] = src0->f[2] < src1->f[2] ? src2->f[2] : src3->f[2];
726   dst->f[3] = src0->f[3] < src1->f[3] ? src2->f[3] : src3->f[3];
727}
728
729#if 0
730static void
731micro_ilt(
732   union tgsi_exec_channel *dst,
733   const union tgsi_exec_channel *src0,
734   const union tgsi_exec_channel *src1,
735   const union tgsi_exec_channel *src2,
736   const union tgsi_exec_channel *src3 )
737{
738   dst->i[0] = src0->i[0] < src1->i[0] ? src2->i[0] : src3->i[0];
739   dst->i[1] = src0->i[1] < src1->i[1] ? src2->i[1] : src3->i[1];
740   dst->i[2] = src0->i[2] < src1->i[2] ? src2->i[2] : src3->i[2];
741   dst->i[3] = src0->i[3] < src1->i[3] ? src2->i[3] : src3->i[3];
742}
743#endif
744
745#if 0
746static void
747micro_ult(
748   union tgsi_exec_channel *dst,
749   const union tgsi_exec_channel *src0,
750   const union tgsi_exec_channel *src1,
751   const union tgsi_exec_channel *src2,
752   const union tgsi_exec_channel *src3 )
753{
754   dst->u[0] = src0->u[0] < src1->u[0] ? src2->u[0] : src3->u[0];
755   dst->u[1] = src0->u[1] < src1->u[1] ? src2->u[1] : src3->u[1];
756   dst->u[2] = src0->u[2] < src1->u[2] ? src2->u[2] : src3->u[2];
757   dst->u[3] = src0->u[3] < src1->u[3] ? src2->u[3] : src3->u[3];
758}
759#endif
760
761static void
762micro_max(
763   union tgsi_exec_channel *dst,
764   const union tgsi_exec_channel *src0,
765   const union tgsi_exec_channel *src1 )
766{
767   dst->f[0] = src0->f[0] > src1->f[0] ? src0->f[0] : src1->f[0];
768   dst->f[1] = src0->f[1] > src1->f[1] ? src0->f[1] : src1->f[1];
769   dst->f[2] = src0->f[2] > src1->f[2] ? src0->f[2] : src1->f[2];
770   dst->f[3] = src0->f[3] > src1->f[3] ? src0->f[3] : src1->f[3];
771}
772
773#if 0
774static void
775micro_imax(
776   union tgsi_exec_channel *dst,
777   const union tgsi_exec_channel *src0,
778   const union tgsi_exec_channel *src1 )
779{
780   dst->i[0] = src0->i[0] > src1->i[0] ? src0->i[0] : src1->i[0];
781   dst->i[1] = src0->i[1] > src1->i[1] ? src0->i[1] : src1->i[1];
782   dst->i[2] = src0->i[2] > src1->i[2] ? src0->i[2] : src1->i[2];
783   dst->i[3] = src0->i[3] > src1->i[3] ? src0->i[3] : src1->i[3];
784}
785#endif
786
787#if 0
788static void
789micro_umax(
790   union tgsi_exec_channel *dst,
791   const union tgsi_exec_channel *src0,
792   const union tgsi_exec_channel *src1 )
793{
794   dst->u[0] = src0->u[0] > src1->u[0] ? src0->u[0] : src1->u[0];
795   dst->u[1] = src0->u[1] > src1->u[1] ? src0->u[1] : src1->u[1];
796   dst->u[2] = src0->u[2] > src1->u[2] ? src0->u[2] : src1->u[2];
797   dst->u[3] = src0->u[3] > src1->u[3] ? src0->u[3] : src1->u[3];
798}
799#endif
800
801static void
802micro_min(
803   union tgsi_exec_channel *dst,
804   const union tgsi_exec_channel *src0,
805   const union tgsi_exec_channel *src1 )
806{
807   dst->f[0] = src0->f[0] < src1->f[0] ? src0->f[0] : src1->f[0];
808   dst->f[1] = src0->f[1] < src1->f[1] ? src0->f[1] : src1->f[1];
809   dst->f[2] = src0->f[2] < src1->f[2] ? src0->f[2] : src1->f[2];
810   dst->f[3] = src0->f[3] < src1->f[3] ? src0->f[3] : src1->f[3];
811}
812
813#if 0
814static void
815micro_imin(
816   union tgsi_exec_channel *dst,
817   const union tgsi_exec_channel *src0,
818   const union tgsi_exec_channel *src1 )
819{
820   dst->i[0] = src0->i[0] < src1->i[0] ? src0->i[0] : src1->i[0];
821   dst->i[1] = src0->i[1] < src1->i[1] ? src0->i[1] : src1->i[1];
822   dst->i[2] = src0->i[2] < src1->i[2] ? src0->i[2] : src1->i[2];
823   dst->i[3] = src0->i[3] < src1->i[3] ? src0->i[3] : src1->i[3];
824}
825#endif
826
827#if 0
828static void
829micro_umin(
830   union tgsi_exec_channel *dst,
831   const union tgsi_exec_channel *src0,
832   const union tgsi_exec_channel *src1 )
833{
834   dst->u[0] = src0->u[0] < src1->u[0] ? src0->u[0] : src1->u[0];
835   dst->u[1] = src0->u[1] < src1->u[1] ? src0->u[1] : src1->u[1];
836   dst->u[2] = src0->u[2] < src1->u[2] ? src0->u[2] : src1->u[2];
837   dst->u[3] = src0->u[3] < src1->u[3] ? src0->u[3] : src1->u[3];
838}
839#endif
840
841#if 0
842static void
843micro_umod(
844   union tgsi_exec_channel *dst,
845   const union tgsi_exec_channel *src0,
846   const union tgsi_exec_channel *src1 )
847{
848   dst->u[0] = src0->u[0] % src1->u[0];
849   dst->u[1] = src0->u[1] % src1->u[1];
850   dst->u[2] = src0->u[2] % src1->u[2];
851   dst->u[3] = src0->u[3] % src1->u[3];
852}
853#endif
854
855static void
856micro_mul(
857   union tgsi_exec_channel *dst,
858   const union tgsi_exec_channel *src0,
859   const union tgsi_exec_channel *src1 )
860{
861   dst->f[0] = src0->f[0] * src1->f[0];
862   dst->f[1] = src0->f[1] * src1->f[1];
863   dst->f[2] = src0->f[2] * src1->f[2];
864   dst->f[3] = src0->f[3] * src1->f[3];
865}
866
867#if 0
868static void
869micro_imul(
870   union tgsi_exec_channel *dst,
871   const union tgsi_exec_channel *src0,
872   const union tgsi_exec_channel *src1 )
873{
874   dst->i[0] = src0->i[0] * src1->i[0];
875   dst->i[1] = src0->i[1] * src1->i[1];
876   dst->i[2] = src0->i[2] * src1->i[2];
877   dst->i[3] = src0->i[3] * src1->i[3];
878}
879#endif
880
881#if 0
882static void
883micro_imul64(
884   union tgsi_exec_channel *dst0,
885   union tgsi_exec_channel *dst1,
886   const union tgsi_exec_channel *src0,
887   const union tgsi_exec_channel *src1 )
888{
889   dst1->i[0] = src0->i[0] * src1->i[0];
890   dst1->i[1] = src0->i[1] * src1->i[1];
891   dst1->i[2] = src0->i[2] * src1->i[2];
892   dst1->i[3] = src0->i[3] * src1->i[3];
893   dst0->i[0] = 0;
894   dst0->i[1] = 0;
895   dst0->i[2] = 0;
896   dst0->i[3] = 0;
897}
898#endif
899
900#if 0
901static void
902micro_umul64(
903   union tgsi_exec_channel *dst0,
904   union tgsi_exec_channel *dst1,
905   const union tgsi_exec_channel *src0,
906   const union tgsi_exec_channel *src1 )
907{
908   dst1->u[0] = src0->u[0] * src1->u[0];
909   dst1->u[1] = src0->u[1] * src1->u[1];
910   dst1->u[2] = src0->u[2] * src1->u[2];
911   dst1->u[3] = src0->u[3] * src1->u[3];
912   dst0->u[0] = 0;
913   dst0->u[1] = 0;
914   dst0->u[2] = 0;
915   dst0->u[3] = 0;
916}
917#endif
918
919
920#if 0
921static void
922micro_movc(
923   union tgsi_exec_channel *dst,
924   const union tgsi_exec_channel *src0,
925   const union tgsi_exec_channel *src1,
926   const union tgsi_exec_channel *src2 )
927{
928   dst->u[0] = src0->u[0] ? src1->u[0] : src2->u[0];
929   dst->u[1] = src0->u[1] ? src1->u[1] : src2->u[1];
930   dst->u[2] = src0->u[2] ? src1->u[2] : src2->u[2];
931   dst->u[3] = src0->u[3] ? src1->u[3] : src2->u[3];
932}
933#endif
934
935static void
936micro_neg(
937   union tgsi_exec_channel *dst,
938   const union tgsi_exec_channel *src )
939{
940   dst->f[0] = -src->f[0];
941   dst->f[1] = -src->f[1];
942   dst->f[2] = -src->f[2];
943   dst->f[3] = -src->f[3];
944}
945
946#if 0
947static void
948micro_ineg(
949   union tgsi_exec_channel *dst,
950   const union tgsi_exec_channel *src )
951{
952   dst->i[0] = -src->i[0];
953   dst->i[1] = -src->i[1];
954   dst->i[2] = -src->i[2];
955   dst->i[3] = -src->i[3];
956}
957#endif
958
959static void
960micro_not(
961   union tgsi_exec_channel *dst,
962   const union tgsi_exec_channel *src )
963{
964   dst->u[0] = ~src->u[0];
965   dst->u[1] = ~src->u[1];
966   dst->u[2] = ~src->u[2];
967   dst->u[3] = ~src->u[3];
968}
969
970static void
971micro_or(
972   union tgsi_exec_channel *dst,
973   const union tgsi_exec_channel *src0,
974   const union tgsi_exec_channel *src1 )
975{
976   dst->u[0] = src0->u[0] | src1->u[0];
977   dst->u[1] = src0->u[1] | src1->u[1];
978   dst->u[2] = src0->u[2] | src1->u[2];
979   dst->u[3] = src0->u[3] | src1->u[3];
980}
981
982static void
983micro_pow(
984   union tgsi_exec_channel *dst,
985   const union tgsi_exec_channel *src0,
986   const union tgsi_exec_channel *src1 )
987{
988#if FAST_MATH
989   dst->f[0] = util_fast_pow( src0->f[0], src1->f[0] );
990   dst->f[1] = util_fast_pow( src0->f[1], src1->f[1] );
991   dst->f[2] = util_fast_pow( src0->f[2], src1->f[2] );
992   dst->f[3] = util_fast_pow( src0->f[3], src1->f[3] );
993#else
994   dst->f[0] = powf( src0->f[0], src1->f[0] );
995   dst->f[1] = powf( src0->f[1], src1->f[1] );
996   dst->f[2] = powf( src0->f[2], src1->f[2] );
997   dst->f[3] = powf( src0->f[3], src1->f[3] );
998#endif
999}
1000
1001static void
1002micro_rnd(
1003   union tgsi_exec_channel *dst,
1004   const union tgsi_exec_channel *src )
1005{
1006   dst->f[0] = floorf( src->f[0] + 0.5f );
1007   dst->f[1] = floorf( src->f[1] + 0.5f );
1008   dst->f[2] = floorf( src->f[2] + 0.5f );
1009   dst->f[3] = floorf( src->f[3] + 0.5f );
1010}
1011
1012static void
1013micro_sgn(
1014   union tgsi_exec_channel *dst,
1015   const union tgsi_exec_channel *src )
1016{
1017   dst->f[0] = src->f[0] < 0.0f ? -1.0f : src->f[0] > 0.0f ? 1.0f : 0.0f;
1018   dst->f[1] = src->f[1] < 0.0f ? -1.0f : src->f[1] > 0.0f ? 1.0f : 0.0f;
1019   dst->f[2] = src->f[2] < 0.0f ? -1.0f : src->f[2] > 0.0f ? 1.0f : 0.0f;
1020   dst->f[3] = src->f[3] < 0.0f ? -1.0f : src->f[3] > 0.0f ? 1.0f : 0.0f;
1021}
1022
1023static void
1024micro_shl(
1025   union tgsi_exec_channel *dst,
1026   const union tgsi_exec_channel *src0,
1027   const union tgsi_exec_channel *src1 )
1028{
1029   dst->i[0] = src0->i[0] << src1->i[0];
1030   dst->i[1] = src0->i[1] << src1->i[1];
1031   dst->i[2] = src0->i[2] << src1->i[2];
1032   dst->i[3] = src0->i[3] << src1->i[3];
1033}
1034
1035static void
1036micro_ishr(
1037   union tgsi_exec_channel *dst,
1038   const union tgsi_exec_channel *src0,
1039   const union tgsi_exec_channel *src1 )
1040{
1041   dst->i[0] = src0->i[0] >> src1->i[0];
1042   dst->i[1] = src0->i[1] >> src1->i[1];
1043   dst->i[2] = src0->i[2] >> src1->i[2];
1044   dst->i[3] = src0->i[3] >> src1->i[3];
1045}
1046
1047static void
1048micro_trunc(
1049   union tgsi_exec_channel *dst,
1050   const union tgsi_exec_channel *src0 )
1051{
1052   dst->f[0] = (float) (int) src0->f[0];
1053   dst->f[1] = (float) (int) src0->f[1];
1054   dst->f[2] = (float) (int) src0->f[2];
1055   dst->f[3] = (float) (int) src0->f[3];
1056}
1057
#if 0
/**
 * Per-lane unsigned right shift: dst = src0 >> src1 (currently compiled out).
 */
static void
micro_ushr(
   union tgsi_exec_channel *dst,
   const union tgsi_exec_channel *src0,
   const union tgsi_exec_channel *src1 )
{
   unsigned lane;

   for (lane = 0; lane < 4; lane++) {
      dst->u[lane] = src0->u[lane] >> src1->u[lane];
   }
}
#endif
1071
1072static void
1073micro_sin(
1074   union tgsi_exec_channel *dst,
1075   const union tgsi_exec_channel *src )
1076{
1077   dst->f[0] = sinf( src->f[0] );
1078   dst->f[1] = sinf( src->f[1] );
1079   dst->f[2] = sinf( src->f[2] );
1080   dst->f[3] = sinf( src->f[3] );
1081}
1082
1083static void
1084micro_sqrt( union tgsi_exec_channel *dst,
1085            const union tgsi_exec_channel *src )
1086{
1087   dst->f[0] = sqrtf( src->f[0] );
1088   dst->f[1] = sqrtf( src->f[1] );
1089   dst->f[2] = sqrtf( src->f[2] );
1090   dst->f[3] = sqrtf( src->f[3] );
1091}
1092
1093static void
1094micro_sub(
1095   union tgsi_exec_channel *dst,
1096   const union tgsi_exec_channel *src0,
1097   const union tgsi_exec_channel *src1 )
1098{
1099   dst->f[0] = src0->f[0] - src1->f[0];
1100   dst->f[1] = src0->f[1] - src1->f[1];
1101   dst->f[2] = src0->f[2] - src1->f[2];
1102   dst->f[3] = src0->f[3] - src1->f[3];
1103}
1104
#if 0
/**
 * Per-lane unsigned-to-float conversion (currently compiled out).
 */
static void
micro_u2f(
   union tgsi_exec_channel *dst,
   const union tgsi_exec_channel *src )
{
   unsigned lane;

   for (lane = 0; lane < 4; lane++) {
      dst->f[lane] = (float) src->u[lane];
   }
}
#endif
1117
1118static void
1119micro_xor(
1120   union tgsi_exec_channel *dst,
1121   const union tgsi_exec_channel *src0,
1122   const union tgsi_exec_channel *src1 )
1123{
1124   dst->u[0] = src0->u[0] ^ src1->u[0];
1125   dst->u[1] = src0->u[1] ^ src1->u[1];
1126   dst->u[2] = src0->u[2] ^ src1->u[2];
1127   dst->u[3] = src0->u[3] ^ src1->u[3];
1128}
1129
1130static void
1131fetch_src_file_channel(
1132   const struct tgsi_exec_machine *mach,
1133   const uint file,
1134   const uint swizzle,
1135   const union tgsi_exec_channel *index,
1136   union tgsi_exec_channel *chan )
1137{
1138   switch( swizzle ) {
1139   case TGSI_SWIZZLE_X:
1140   case TGSI_SWIZZLE_Y:
1141   case TGSI_SWIZZLE_Z:
1142   case TGSI_SWIZZLE_W:
1143      switch( file ) {
1144      case TGSI_FILE_CONSTANT:
1145         assert(mach->Consts);
1146         if (index->i[0] < 0)
1147            chan->f[0] = 0.0f;
1148         else
1149            chan->f[0] = mach->Consts[index->i[0]][swizzle];
1150         if (index->i[1] < 0)
1151            chan->f[1] = 0.0f;
1152         else
1153            chan->f[1] = mach->Consts[index->i[1]][swizzle];
1154         if (index->i[2] < 0)
1155            chan->f[2] = 0.0f;
1156         else
1157            chan->f[2] = mach->Consts[index->i[2]][swizzle];
1158         if (index->i[3] < 0)
1159            chan->f[3] = 0.0f;
1160         else
1161            chan->f[3] = mach->Consts[index->i[3]][swizzle];
1162         break;
1163
1164      case TGSI_FILE_INPUT:
1165      case TGSI_FILE_SYSTEM_VALUE:
1166         chan->u[0] = mach->Inputs[index->i[0]].xyzw[swizzle].u[0];
1167         chan->u[1] = mach->Inputs[index->i[1]].xyzw[swizzle].u[1];
1168         chan->u[2] = mach->Inputs[index->i[2]].xyzw[swizzle].u[2];
1169         chan->u[3] = mach->Inputs[index->i[3]].xyzw[swizzle].u[3];
1170         break;
1171
1172      case TGSI_FILE_TEMPORARY:
1173         assert(index->i[0] < TGSI_EXEC_NUM_TEMPS);
1174         chan->u[0] = mach->Temps[index->i[0]].xyzw[swizzle].u[0];
1175         chan->u[1] = mach->Temps[index->i[1]].xyzw[swizzle].u[1];
1176         chan->u[2] = mach->Temps[index->i[2]].xyzw[swizzle].u[2];
1177         chan->u[3] = mach->Temps[index->i[3]].xyzw[swizzle].u[3];
1178         break;
1179
1180      case TGSI_FILE_IMMEDIATE:
1181         assert( index->i[0] < (int) mach->ImmLimit );
1182         chan->f[0] = mach->Imms[index->i[0]][swizzle];
1183         assert( index->i[1] < (int) mach->ImmLimit );
1184         chan->f[1] = mach->Imms[index->i[1]][swizzle];
1185         assert( index->i[2] < (int) mach->ImmLimit );
1186         chan->f[2] = mach->Imms[index->i[2]][swizzle];
1187         assert( index->i[3] < (int) mach->ImmLimit );
1188         chan->f[3] = mach->Imms[index->i[3]][swizzle];
1189         break;
1190
1191      case TGSI_FILE_ADDRESS:
1192         chan->u[0] = mach->Addrs[index->i[0]].xyzw[swizzle].u[0];
1193         chan->u[1] = mach->Addrs[index->i[1]].xyzw[swizzle].u[1];
1194         chan->u[2] = mach->Addrs[index->i[2]].xyzw[swizzle].u[2];
1195         chan->u[3] = mach->Addrs[index->i[3]].xyzw[swizzle].u[3];
1196         break;
1197
1198      case TGSI_FILE_PREDICATE:
1199         assert(index->i[0] < TGSI_EXEC_NUM_PREDS);
1200         assert(index->i[1] < TGSI_EXEC_NUM_PREDS);
1201         assert(index->i[2] < TGSI_EXEC_NUM_PREDS);
1202         assert(index->i[3] < TGSI_EXEC_NUM_PREDS);
1203         chan->u[0] = mach->Predicates[0].xyzw[swizzle].u[0];
1204         chan->u[1] = mach->Predicates[0].xyzw[swizzle].u[1];
1205         chan->u[2] = mach->Predicates[0].xyzw[swizzle].u[2];
1206         chan->u[3] = mach->Predicates[0].xyzw[swizzle].u[3];
1207         break;
1208
1209      case TGSI_FILE_OUTPUT:
1210         /* vertex/fragment output vars can be read too */
1211         chan->u[0] = mach->Outputs[index->i[0]].xyzw[swizzle].u[0];
1212         chan->u[1] = mach->Outputs[index->i[1]].xyzw[swizzle].u[1];
1213         chan->u[2] = mach->Outputs[index->i[2]].xyzw[swizzle].u[2];
1214         chan->u[3] = mach->Outputs[index->i[3]].xyzw[swizzle].u[3];
1215         break;
1216
1217      default:
1218         assert( 0 );
1219      }
1220      break;
1221
1222   default:
1223      assert( 0 );
1224   }
1225}
1226
1227static void
1228fetch_source(
1229   const struct tgsi_exec_machine *mach,
1230   union tgsi_exec_channel *chan,
1231   const struct tgsi_full_src_register *reg,
1232   const uint chan_index )
1233{
1234   union tgsi_exec_channel index;
1235   uint swizzle;
1236
1237   /* We start with a direct index into a register file.
1238    *
1239    *    file[1],
1240    *    where:
1241    *       file = Register.File
1242    *       [1] = Register.Index
1243    */
1244   index.i[0] =
1245   index.i[1] =
1246   index.i[2] =
1247   index.i[3] = reg->Register.Index;
1248
1249   /* There is an extra source register that indirectly subscripts
1250    * a register file. The direct index now becomes an offset
1251    * that is being added to the indirect register.
1252    *
1253    *    file[ind[2].x+1],
1254    *    where:
1255    *       ind = Indirect.File
1256    *       [2] = Indirect.Index
1257    *       .x = Indirect.SwizzleX
1258    */
1259   if (reg->Register.Indirect) {
1260      union tgsi_exec_channel index2;
1261      union tgsi_exec_channel indir_index;
1262      const uint execmask = mach->ExecMask;
1263      uint i;
1264
1265      /* which address register (always zero now) */
1266      index2.i[0] =
1267      index2.i[1] =
1268      index2.i[2] =
1269      index2.i[3] = reg->Indirect.Index;
1270
1271      /* get current value of address register[swizzle] */
1272      swizzle = tgsi_util_get_src_register_swizzle( &reg->Indirect, CHAN_X );
1273      fetch_src_file_channel(
1274         mach,
1275         reg->Indirect.File,
1276         swizzle,
1277         &index2,
1278         &indir_index );
1279
1280      /* add value of address register to the offset */
1281      index.i[0] += (int) indir_index.f[0];
1282      index.i[1] += (int) indir_index.f[1];
1283      index.i[2] += (int) indir_index.f[2];
1284      index.i[3] += (int) indir_index.f[3];
1285
1286      /* for disabled execution channels, zero-out the index to
1287       * avoid using a potential garbage value.
1288       */
1289      for (i = 0; i < QUAD_SIZE; i++) {
1290         if ((execmask & (1 << i)) == 0)
1291            index.i[i] = 0;
1292      }
1293   }
1294
1295   /* There is an extra source register that is a second
1296    * subscript to a register file. Effectively it means that
1297    * the register file is actually a 2D array of registers.
1298    *
1299    *    file[1][3] == file[1*sizeof(file[1])+3],
1300    *    where:
1301    *       [3] = Dimension.Index
1302    */
1303   if (reg->Register.Dimension) {
1304      /* The size of the first-order array depends on the register file type.
1305       * We need to multiply the index to the first array to get an effective,
1306       * "flat" index that points to the beginning of the second-order array.
1307       */
1308      switch (reg->Register.File) {
1309      case TGSI_FILE_INPUT:
1310      case TGSI_FILE_SYSTEM_VALUE:
1311         index.i[0] *= TGSI_EXEC_MAX_INPUT_ATTRIBS;
1312         index.i[1] *= TGSI_EXEC_MAX_INPUT_ATTRIBS;
1313         index.i[2] *= TGSI_EXEC_MAX_INPUT_ATTRIBS;
1314         index.i[3] *= TGSI_EXEC_MAX_INPUT_ATTRIBS;
1315         break;
1316      case TGSI_FILE_CONSTANT:
1317         index.i[0] *= TGSI_EXEC_MAX_CONST_BUFFER;
1318         index.i[1] *= TGSI_EXEC_MAX_CONST_BUFFER;
1319         index.i[2] *= TGSI_EXEC_MAX_CONST_BUFFER;
1320         index.i[3] *= TGSI_EXEC_MAX_CONST_BUFFER;
1321         break;
1322      default:
1323         assert( 0 );
1324      }
1325
1326      index.i[0] += reg->Dimension.Index;
1327      index.i[1] += reg->Dimension.Index;
1328      index.i[2] += reg->Dimension.Index;
1329      index.i[3] += reg->Dimension.Index;
1330
1331      /* Again, the second subscript index can be addressed indirectly
1332       * identically to the first one.
1333       * Nothing stops us from indirectly addressing the indirect register,
1334       * but there is no need for that, so we won't exercise it.
1335       *
1336       *    file[1][ind[4].y+3],
1337       *    where:
1338       *       ind = DimIndirect.File
1339       *       [4] = DimIndirect.Index
1340       *       .y = DimIndirect.SwizzleX
1341       */
1342      if (reg->Dimension.Indirect) {
1343         union tgsi_exec_channel index2;
1344         union tgsi_exec_channel indir_index;
1345         const uint execmask = mach->ExecMask;
1346         uint i;
1347
1348         index2.i[0] =
1349         index2.i[1] =
1350         index2.i[2] =
1351         index2.i[3] = reg->DimIndirect.Index;
1352
1353         swizzle = tgsi_util_get_src_register_swizzle( &reg->DimIndirect, CHAN_X );
1354         fetch_src_file_channel(
1355            mach,
1356            reg->DimIndirect.File,
1357            swizzle,
1358            &index2,
1359            &indir_index );
1360
1361         index.i[0] += (int) indir_index.f[0];
1362         index.i[1] += (int) indir_index.f[1];
1363         index.i[2] += (int) indir_index.f[2];
1364         index.i[3] += (int) indir_index.f[3];
1365
1366         /* for disabled execution channels, zero-out the index to
1367          * avoid using a potential garbage value.
1368          */
1369         for (i = 0; i < QUAD_SIZE; i++) {
1370            if ((execmask & (1 << i)) == 0)
1371               index.i[i] = 0;
1372         }
1373      }
1374
1375      /* If by any chance there was a need for a 3D array of register
1376       * files, we would have to check whether Dimension is followed
1377       * by a dimension register and continue the saga.
1378       */
1379   }
1380
1381   swizzle = tgsi_util_get_full_src_register_swizzle( reg, chan_index );
1382   fetch_src_file_channel(
1383      mach,
1384      reg->Register.File,
1385      swizzle,
1386      &index,
1387      chan );
1388
1389   switch (tgsi_util_get_full_src_register_sign_mode( reg, chan_index )) {
1390   case TGSI_UTIL_SIGN_CLEAR:
1391      micro_abs( chan, chan );
1392      break;
1393
1394   case TGSI_UTIL_SIGN_SET:
1395      micro_abs( chan, chan );
1396      micro_neg( chan, chan );
1397      break;
1398
1399   case TGSI_UTIL_SIGN_TOGGLE:
1400      micro_neg( chan, chan );
1401      break;
1402
1403   case TGSI_UTIL_SIGN_KEEP:
1404      break;
1405   }
1406}
1407
1408static void
1409store_dest(
1410   struct tgsi_exec_machine *mach,
1411   const union tgsi_exec_channel *chan,
1412   const struct tgsi_full_dst_register *reg,
1413   const struct tgsi_full_instruction *inst,
1414   uint chan_index )
1415{
1416   uint i;
1417   union tgsi_exec_channel null;
1418   union tgsi_exec_channel *dst;
1419   uint execmask = mach->ExecMask;
1420   int offset = 0;  /* indirection offset */
1421   int index;
1422
1423#ifdef DEBUG
1424   check_inf_or_nan(chan);
1425#endif
1426
1427   /* There is an extra source register that indirectly subscripts
1428    * a register file. The direct index now becomes an offset
1429    * that is being added to the indirect register.
1430    *
1431    *    file[ind[2].x+1],
1432    *    where:
1433    *       ind = Indirect.File
1434    *       [2] = Indirect.Index
1435    *       .x = Indirect.SwizzleX
1436    */
1437   if (reg->Register.Indirect) {
1438      union tgsi_exec_channel index;
1439      union tgsi_exec_channel indir_index;
1440      uint swizzle;
1441
1442      /* which address register (always zero for now) */
1443      index.i[0] =
1444      index.i[1] =
1445      index.i[2] =
1446      index.i[3] = reg->Indirect.Index;
1447
1448      /* get current value of address register[swizzle] */
1449      swizzle = tgsi_util_get_src_register_swizzle( &reg->Indirect, CHAN_X );
1450
1451      /* fetch values from the address/indirection register */
1452      fetch_src_file_channel(
1453         mach,
1454         reg->Indirect.File,
1455         swizzle,
1456         &index,
1457         &indir_index );
1458
1459      /* save indirection offset */
1460      offset = (int) indir_index.f[0];
1461   }
1462
1463   switch (reg->Register.File) {
1464   case TGSI_FILE_NULL:
1465      dst = &null;
1466      break;
1467
1468   case TGSI_FILE_OUTPUT:
1469      index = mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0]
1470         + reg->Register.Index;
1471      dst = &mach->Outputs[offset + index].xyzw[chan_index];
1472#if 0
1473      if (TGSI_PROCESSOR_GEOMETRY == mach->Processor) {
1474         fprintf(stderr, "STORING OUT[%d] mask(%d), = (", index, execmask);
1475         for (i = 0; i < QUAD_SIZE; i++)
1476            if (execmask & (1 << i))
1477               fprintf(stderr, "%f, ", chan->f[i]);
1478         fprintf(stderr, ")\n");
1479      }
1480#endif
1481      break;
1482
1483   case TGSI_FILE_TEMPORARY:
1484      index = reg->Register.Index;
1485      assert( index < TGSI_EXEC_NUM_TEMPS );
1486      dst = &mach->Temps[offset + index].xyzw[chan_index];
1487      break;
1488
1489   case TGSI_FILE_ADDRESS:
1490      index = reg->Register.Index;
1491      dst = &mach->Addrs[index].xyzw[chan_index];
1492      break;
1493
1494   case TGSI_FILE_LOOP:
1495      assert(reg->Register.Index == 0);
1496      assert(mach->LoopCounterStackTop > 0);
1497      assert(chan_index == CHAN_X);
1498      dst = &mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[chan_index];
1499      break;
1500
1501   case TGSI_FILE_PREDICATE:
1502      index = reg->Register.Index;
1503      assert(index < TGSI_EXEC_NUM_PREDS);
1504      dst = &mach->Predicates[index].xyzw[chan_index];
1505      break;
1506
1507   default:
1508      assert( 0 );
1509      return;
1510   }
1511
1512   if (inst->Instruction.Predicate) {
1513      uint swizzle;
1514      union tgsi_exec_channel *pred;
1515
1516      switch (chan_index) {
1517      case CHAN_X:
1518         swizzle = inst->Predicate.SwizzleX;
1519         break;
1520      case CHAN_Y:
1521         swizzle = inst->Predicate.SwizzleY;
1522         break;
1523      case CHAN_Z:
1524         swizzle = inst->Predicate.SwizzleZ;
1525         break;
1526      case CHAN_W:
1527         swizzle = inst->Predicate.SwizzleW;
1528         break;
1529      default:
1530         assert(0);
1531         return;
1532      }
1533
1534      assert(inst->Predicate.Index == 0);
1535
1536      pred = &mach->Predicates[inst->Predicate.Index].xyzw[swizzle];
1537
1538      if (inst->Predicate.Negate) {
1539         for (i = 0; i < QUAD_SIZE; i++) {
1540            if (pred->u[i]) {
1541               execmask &= ~(1 << i);
1542            }
1543         }
1544      } else {
1545         for (i = 0; i < QUAD_SIZE; i++) {
1546            if (!pred->u[i]) {
1547               execmask &= ~(1 << i);
1548            }
1549         }
1550      }
1551   }
1552
1553   switch (inst->Instruction.Saturate) {
1554   case TGSI_SAT_NONE:
1555      for (i = 0; i < QUAD_SIZE; i++)
1556         if (execmask & (1 << i))
1557            dst->i[i] = chan->i[i];
1558      break;
1559
1560   case TGSI_SAT_ZERO_ONE:
1561      for (i = 0; i < QUAD_SIZE; i++)
1562         if (execmask & (1 << i)) {
1563            if (chan->f[i] < 0.0f)
1564               dst->f[i] = 0.0f;
1565            else if (chan->f[i] > 1.0f)
1566               dst->f[i] = 1.0f;
1567            else
1568               dst->i[i] = chan->i[i];
1569         }
1570      break;
1571
1572   case TGSI_SAT_MINUS_PLUS_ONE:
1573      for (i = 0; i < QUAD_SIZE; i++)
1574         if (execmask & (1 << i)) {
1575            if (chan->f[i] < -1.0f)
1576               dst->f[i] = -1.0f;
1577            else if (chan->f[i] > 1.0f)
1578               dst->f[i] = 1.0f;
1579            else
1580               dst->i[i] = chan->i[i];
1581         }
1582      break;
1583
1584   default:
1585      assert( 0 );
1586   }
1587}
1588
/* Fetch channel CHAN of source operand INDEX of the current instruction
 * into VAL.  Expects 'mach' and 'inst' to be in scope at the point of use.
 * Arguments are parenthesized so that arbitrary expressions can be passed.
 */
#define FETCH(VAL,INDEX,CHAN)\
    fetch_source (mach, (VAL), &inst->Src[(INDEX)], (CHAN))

/* Store VAL to channel CHAN of destination operand INDEX of the current
 * instruction.  Expects 'mach' and 'inst' to be in scope at the point of use.
 */
#define STORE(VAL,INDEX,CHAN)\
    store_dest (mach, (VAL), &inst->Dst[(INDEX)], inst, (CHAN) )
1594
1595
1596/**
1597 * Execute ARB-style KIL which is predicated by a src register.
1598 * Kill fragment if any of the four values is less than zero.
1599 */
1600static void
1601exec_kil(struct tgsi_exec_machine *mach,
1602         const struct tgsi_full_instruction *inst)
1603{
1604   uint uniquemask;
1605   uint chan_index;
1606   uint kilmask = 0; /* bit 0 = pixel 0, bit 1 = pixel 1, etc */
1607   union tgsi_exec_channel r[1];
1608
1609   /* This mask stores component bits that were already tested. */
1610   uniquemask = 0;
1611
1612   for (chan_index = 0; chan_index < 4; chan_index++)
1613   {
1614      uint swizzle;
1615      uint i;
1616
1617      /* unswizzle channel */
1618      swizzle = tgsi_util_get_full_src_register_swizzle (
1619                        &inst->Src[0],
1620                        chan_index);
1621
1622      /* check if the component has not been already tested */
1623      if (uniquemask & (1 << swizzle))
1624         continue;
1625      uniquemask |= 1 << swizzle;
1626
1627      FETCH(&r[0], 0, chan_index);
1628      for (i = 0; i < 4; i++)
1629         if (r[0].f[i] < 0.0f)
1630            kilmask |= 1 << i;
1631   }
1632
1633   mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] |= kilmask;
1634}
1635
1636/**
1637 * Execute NVIDIA-style KIL which is predicated by a condition code.
1638 * Kill fragment if the condition code is TRUE.
1639 */
1640static void
1641exec_kilp(struct tgsi_exec_machine *mach,
1642          const struct tgsi_full_instruction *inst)
1643{
1644   uint kilmask; /* bit 0 = pixel 0, bit 1 = pixel 1, etc */
1645
1646   /* "unconditional" kil */
1647   kilmask = mach->ExecMask;
1648   mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] |= kilmask;
1649}
1650
1651static void
1652emit_vertex(struct tgsi_exec_machine *mach)
1653{
1654   /* FIXME: check for exec mask correctly
1655   unsigned i;
1656   for (i = 0; i < QUAD_SIZE; ++i) {
1657         if ((mach->ExecMask & (1 << i)))
1658   */
1659   if (mach->ExecMask) {
1660      mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] += mach->NumOutputs;
1661      mach->Primitives[mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]]++;
1662   }
1663}
1664
1665static void
1666emit_primitive(struct tgsi_exec_machine *mach)
1667{
1668   unsigned *prim_count = &mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0];
1669   /* FIXME: check for exec mask correctly
1670   unsigned i;
1671   for (i = 0; i < QUAD_SIZE; ++i) {
1672         if ((mach->ExecMask & (1 << i)))
1673   */
1674   if (mach->ExecMask) {
1675      ++(*prim_count);
1676      debug_assert((*prim_count * mach->NumOutputs) < mach->MaxGeometryShaderOutputs);
1677      mach->Primitives[*prim_count] = 0;
1678   }
1679}
1680
1681/*
1682 * Fetch a four texture samples using STR texture coordinates.
1683 */
1684static void
1685fetch_texel( struct tgsi_sampler *sampler,
1686             const union tgsi_exec_channel *s,
1687             const union tgsi_exec_channel *t,
1688             const union tgsi_exec_channel *p,
1689             float lodbias,  /* XXX should be float[4] */
1690             union tgsi_exec_channel *r,
1691             union tgsi_exec_channel *g,
1692             union tgsi_exec_channel *b,
1693             union tgsi_exec_channel *a )
1694{
1695   uint j;
1696   float rgba[NUM_CHANNELS][QUAD_SIZE];
1697
1698   sampler->get_samples(sampler, s->f, t->f, p->f, lodbias, rgba);
1699
1700   for (j = 0; j < 4; j++) {
1701      r->f[j] = rgba[0][j];
1702      g->f[j] = rgba[1][j];
1703      b->f[j] = rgba[2][j];
1704      a->f[j] = rgba[3][j];
1705   }
1706}
1707
1708
1709static void
1710exec_tex(struct tgsi_exec_machine *mach,
1711         const struct tgsi_full_instruction *inst,
1712         boolean biasLod,
1713         boolean projected)
1714{
1715   const uint unit = inst->Src[1].Register.Index;
1716   union tgsi_exec_channel r[4];
1717   uint chan_index;
1718   float lodBias;
1719
1720   /*   debug_printf("Sampler %u unit %u\n", sampler, unit); */
1721
1722   switch (inst->Texture.Texture) {
1723   case TGSI_TEXTURE_1D:
1724   case TGSI_TEXTURE_SHADOW1D:
1725
1726      FETCH(&r[0], 0, CHAN_X);
1727
1728      if (projected) {
1729         FETCH(&r[1], 0, CHAN_W);
1730         micro_div( &r[0], &r[0], &r[1] );
1731      }
1732
1733      if (biasLod) {
1734         FETCH(&r[1], 0, CHAN_W);
1735         lodBias = r[2].f[0];
1736      }
1737      else
1738         lodBias = 0.0;
1739
1740      fetch_texel(mach->Samplers[unit],
1741                  &r[0], &ZeroVec, &ZeroVec, lodBias,  /* S, T, P, BIAS */
1742                  &r[0], &r[1], &r[2], &r[3]); /* R, G, B, A */
1743      break;
1744
1745   case TGSI_TEXTURE_2D:
1746   case TGSI_TEXTURE_RECT:
1747   case TGSI_TEXTURE_SHADOW2D:
1748   case TGSI_TEXTURE_SHADOWRECT:
1749
1750      FETCH(&r[0], 0, CHAN_X);
1751      FETCH(&r[1], 0, CHAN_Y);
1752      FETCH(&r[2], 0, CHAN_Z);
1753
1754      if (projected) {
1755         FETCH(&r[3], 0, CHAN_W);
1756         micro_div( &r[0], &r[0], &r[3] );
1757         micro_div( &r[1], &r[1], &r[3] );
1758         micro_div( &r[2], &r[2], &r[3] );
1759      }
1760
1761      if (biasLod) {
1762         FETCH(&r[3], 0, CHAN_W);
1763         lodBias = r[3].f[0];
1764      }
1765      else
1766         lodBias = 0.0;
1767
1768      fetch_texel(mach->Samplers[unit],
1769                  &r[0], &r[1], &r[2], lodBias,  /* inputs */
1770                  &r[0], &r[1], &r[2], &r[3]);  /* outputs */
1771      break;
1772
1773   case TGSI_TEXTURE_3D:
1774   case TGSI_TEXTURE_CUBE:
1775
1776      FETCH(&r[0], 0, CHAN_X);
1777      FETCH(&r[1], 0, CHAN_Y);
1778      FETCH(&r[2], 0, CHAN_Z);
1779
1780      if (projected) {
1781         FETCH(&r[3], 0, CHAN_W);
1782         micro_div( &r[0], &r[0], &r[3] );
1783         micro_div( &r[1], &r[1], &r[3] );
1784         micro_div( &r[2], &r[2], &r[3] );
1785      }
1786
1787      if (biasLod) {
1788         FETCH(&r[3], 0, CHAN_W);
1789         lodBias = r[3].f[0];
1790      }
1791      else
1792         lodBias = 0.0;
1793
1794      fetch_texel(mach->Samplers[unit],
1795                  &r[0], &r[1], &r[2], lodBias,
1796                  &r[0], &r[1], &r[2], &r[3]);
1797      break;
1798
1799   default:
1800      assert (0);
1801   }
1802
1803   FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
1804      STORE( &r[chan_index], 0, chan_index );
1805   }
1806}
1807
1808static void
1809exec_txd(struct tgsi_exec_machine *mach,
1810         const struct tgsi_full_instruction *inst)
1811{
1812   const uint unit = inst->Src[3].Register.Index;
1813   union tgsi_exec_channel r[4];
1814   uint chan_index;
1815
1816   /*
1817    * XXX: This is fake TXD -- the derivatives are not taken into account, yet.
1818    */
1819
1820   switch (inst->Texture.Texture) {
1821   case TGSI_TEXTURE_1D:
1822   case TGSI_TEXTURE_SHADOW1D:
1823
1824      FETCH(&r[0], 0, CHAN_X);
1825
1826      fetch_texel(mach->Samplers[unit],
1827                  &r[0], &ZeroVec, &ZeroVec, 0.0f,  /* S, T, P, BIAS */
1828                  &r[0], &r[1], &r[2], &r[3]);      /* R, G, B, A */
1829      break;
1830
1831   case TGSI_TEXTURE_2D:
1832   case TGSI_TEXTURE_RECT:
1833   case TGSI_TEXTURE_SHADOW2D:
1834   case TGSI_TEXTURE_SHADOWRECT:
1835
1836      FETCH(&r[0], 0, CHAN_X);
1837      FETCH(&r[1], 0, CHAN_Y);
1838      FETCH(&r[2], 0, CHAN_Z);
1839
1840      fetch_texel(mach->Samplers[unit],
1841                  &r[0], &r[1], &r[2], 0.0f,    /* inputs */
1842                  &r[0], &r[1], &r[2], &r[3]);  /* outputs */
1843      break;
1844
1845   case TGSI_TEXTURE_3D:
1846   case TGSI_TEXTURE_CUBE:
1847
1848      FETCH(&r[0], 0, CHAN_X);
1849      FETCH(&r[1], 0, CHAN_Y);
1850      FETCH(&r[2], 0, CHAN_Z);
1851
1852      fetch_texel(mach->Samplers[unit],
1853                  &r[0], &r[1], &r[2], 0.0f,
1854                  &r[0], &r[1], &r[2], &r[3]);
1855      break;
1856
1857   default:
1858      assert(0);
1859   }
1860
1861   FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
1862      STORE(&r[chan_index], 0, chan_index);
1863   }
1864}
1865
1866
1867/**
1868 * Evaluate a constant-valued coefficient at the position of the
1869 * current quad.
1870 */
1871static void
1872eval_constant_coef(
1873   struct tgsi_exec_machine *mach,
1874   unsigned attrib,
1875   unsigned chan )
1876{
1877   unsigned i;
1878
1879   for( i = 0; i < QUAD_SIZE; i++ ) {
1880      mach->Inputs[attrib].xyzw[chan].f[i] = mach->InterpCoefs[attrib].a0[chan];
1881   }
1882}
1883
1884/**
1885 * Evaluate a linear-valued coefficient at the position of the
1886 * current quad.
1887 */
1888static void
1889eval_linear_coef(
1890   struct tgsi_exec_machine *mach,
1891   unsigned attrib,
1892   unsigned chan )
1893{
1894   const float x = mach->QuadPos.xyzw[0].f[0];
1895   const float y = mach->QuadPos.xyzw[1].f[0];
1896   const float dadx = mach->InterpCoefs[attrib].dadx[chan];
1897   const float dady = mach->InterpCoefs[attrib].dady[chan];
1898   const float a0 = mach->InterpCoefs[attrib].a0[chan] + dadx * x + dady * y;
1899   mach->Inputs[attrib].xyzw[chan].f[0] = a0;
1900   mach->Inputs[attrib].xyzw[chan].f[1] = a0 + dadx;
1901   mach->Inputs[attrib].xyzw[chan].f[2] = a0 + dady;
1902   mach->Inputs[attrib].xyzw[chan].f[3] = a0 + dadx + dady;
1903}
1904
1905/**
1906 * Evaluate a perspective-valued coefficient at the position of the
1907 * current quad.
1908 */
1909static void
1910eval_perspective_coef(
1911   struct tgsi_exec_machine *mach,
1912   unsigned attrib,
1913   unsigned chan )
1914{
1915   const float x = mach->QuadPos.xyzw[0].f[0];
1916   const float y = mach->QuadPos.xyzw[1].f[0];
1917   const float dadx = mach->InterpCoefs[attrib].dadx[chan];
1918   const float dady = mach->InterpCoefs[attrib].dady[chan];
1919   const float a0 = mach->InterpCoefs[attrib].a0[chan] + dadx * x + dady * y;
1920   const float *w = mach->QuadPos.xyzw[3].f;
1921   /* divide by W here */
1922   mach->Inputs[attrib].xyzw[chan].f[0] = a0 / w[0];
1923   mach->Inputs[attrib].xyzw[chan].f[1] = (a0 + dadx) / w[1];
1924   mach->Inputs[attrib].xyzw[chan].f[2] = (a0 + dady) / w[2];
1925   mach->Inputs[attrib].xyzw[chan].f[3] = (a0 + dadx + dady) / w[3];
1926}
1927
1928
/** Signature shared by the eval_*_coef() attribute interpolation routines. */
typedef void (*eval_coef_func)(struct tgsi_exec_machine *mach,
                               unsigned attrib,
                               unsigned chan);
1933
1934static void
1935exec_declaration(struct tgsi_exec_machine *mach,
1936                 const struct tgsi_full_declaration *decl)
1937{
1938   if (mach->Processor == TGSI_PROCESSOR_FRAGMENT) {
1939      if (decl->Declaration.File == TGSI_FILE_INPUT ||
1940          decl->Declaration.File == TGSI_FILE_SYSTEM_VALUE) {
1941         uint first, last, mask;
1942
1943         first = decl->Range.First;
1944         last = decl->Range.Last;
1945         mask = decl->Declaration.UsageMask;
1946
1947         if (decl->Semantic.Name == TGSI_SEMANTIC_POSITION) {
1948            assert(decl->Semantic.Index == 0);
1949            assert(first == last);
1950            assert(mask = TGSI_WRITEMASK_XYZW);
1951
1952            mach->Inputs[first] = mach->QuadPos;
1953         } else if (decl->Semantic.Name == TGSI_SEMANTIC_FACE) {
1954            uint i;
1955
1956            assert(decl->Semantic.Index == 0);
1957            assert(first == last);
1958
1959            for (i = 0; i < QUAD_SIZE; i++) {
1960               mach->Inputs[first].xyzw[0].f[i] = mach->Face;
1961            }
1962         } else {
1963            eval_coef_func eval;
1964            uint i, j;
1965
1966            switch (decl->Declaration.Interpolate) {
1967            case TGSI_INTERPOLATE_CONSTANT:
1968               eval = eval_constant_coef;
1969               break;
1970
1971            case TGSI_INTERPOLATE_LINEAR:
1972               eval = eval_linear_coef;
1973               break;
1974
1975            case TGSI_INTERPOLATE_PERSPECTIVE:
1976               eval = eval_perspective_coef;
1977               break;
1978
1979            default:
1980               assert(0);
1981               return;
1982            }
1983
1984            for (j = 0; j < NUM_CHANNELS; j++) {
1985               if (mask & (1 << j)) {
1986                  for (i = first; i <= last; i++) {
1987                     eval(mach, i, j);
1988                  }
1989               }
1990            }
1991         }
1992      }
1993   }
1994}
1995
1996static void
1997exec_instruction(
1998   struct tgsi_exec_machine *mach,
1999   const struct tgsi_full_instruction *inst,
2000   int *pc )
2001{
2002   uint chan_index;
2003   union tgsi_exec_channel r[10];
2004   union tgsi_exec_channel d[8];
2005
2006   (*pc)++;
2007
2008   switch (inst->Instruction.Opcode) {
2009   case TGSI_OPCODE_ARL:
2010   case TGSI_OPCODE_FLR:
2011      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2012         FETCH( &r[0], 0, chan_index );
2013         micro_flr(&d[chan_index], &r[0]);
2014      }
2015      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2016         STORE(&d[chan_index], 0, chan_index);
2017      }
2018      break;
2019
2020   case TGSI_OPCODE_MOV:
2021      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2022         FETCH(&d[chan_index], 0, chan_index);
2023      }
2024      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2025         STORE(&d[chan_index], 0, chan_index);
2026      }
2027      break;
2028
2029   case TGSI_OPCODE_LIT:
2030      if (IS_CHANNEL_ENABLED( *inst, CHAN_Y ) || IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
2031         FETCH( &r[0], 0, CHAN_X );
2032         if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
2033            micro_max(&d[CHAN_Y], &r[0], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C]);
2034         }
2035
2036         if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
2037            FETCH( &r[1], 0, CHAN_Y );
2038            micro_max( &r[1], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] );
2039
2040            FETCH( &r[2], 0, CHAN_W );
2041            micro_min( &r[2], &r[2], &mach->Temps[TEMP_128_I].xyzw[TEMP_128_C] );
2042            micro_max( &r[2], &r[2], &mach->Temps[TEMP_M128_I].xyzw[TEMP_M128_C] );
2043            micro_pow( &r[1], &r[1], &r[2] );
2044            micro_lt(&d[CHAN_Z], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &r[0], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C]);
2045         }
2046
2047         if (IS_CHANNEL_ENABLED(*inst, CHAN_Y)) {
2048            STORE(&d[CHAN_Y], 0, CHAN_Y);
2049         }
2050         if (IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2051            STORE(&d[CHAN_Z], 0, CHAN_Z);
2052         }
2053      }
2054      if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) {
2055         STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_X );
2056      }
2057      if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
2058         STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
2059      }
2060      break;
2061
2062   case TGSI_OPCODE_RCP:
2063   /* TGSI_OPCODE_RECIP */
2064      FETCH( &r[0], 0, CHAN_X );
2065      micro_div( &r[0], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &r[0] );
2066      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2067         STORE( &r[0], 0, chan_index );
2068      }
2069      break;
2070
2071   case TGSI_OPCODE_RSQ:
2072   /* TGSI_OPCODE_RECIPSQRT */
2073      FETCH( &r[0], 0, CHAN_X );
2074      micro_abs( &r[0], &r[0] );
2075      micro_sqrt( &r[0], &r[0] );
2076      micro_div( &r[0], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &r[0] );
2077      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2078         STORE( &r[0], 0, chan_index );
2079      }
2080      break;
2081
2082   case TGSI_OPCODE_EXP:
2083      FETCH( &r[0], 0, CHAN_X );
2084      micro_flr( &r[1], &r[0] );  /* r1 = floor(r0) */
2085      if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) {
2086         micro_exp2( &r[2], &r[1] );       /* r2 = 2 ^ r1 */
2087         STORE( &r[2], 0, CHAN_X );        /* store r2 */
2088      }
2089      if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
2090         micro_sub( &r[2], &r[0], &r[1] ); /* r2 = r0 - r1 */
2091         STORE( &r[2], 0, CHAN_Y );        /* store r2 */
2092      }
2093      if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
2094         micro_exp2( &r[2], &r[0] );       /* r2 = 2 ^ r0 */
2095         STORE( &r[2], 0, CHAN_Z );        /* store r2 */
2096      }
2097      if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
2098         STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
2099      }
2100      break;
2101
2102   case TGSI_OPCODE_LOG:
2103      FETCH( &r[0], 0, CHAN_X );
2104      micro_abs( &r[2], &r[0] );  /* r2 = abs(r0) */
2105      micro_lg2( &r[1], &r[2] );  /* r1 = lg2(r2) */
2106      micro_flr( &r[0], &r[1] );  /* r0 = floor(r1) */
2107      if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) {
2108         STORE( &r[0], 0, CHAN_X );
2109      }
2110      if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
2111         micro_exp2( &r[0], &r[0] );       /* r0 = 2 ^ r0 */
2112         micro_div( &r[0], &r[2], &r[0] ); /* r0 = r2 / r0 */
2113         STORE( &r[0], 0, CHAN_Y );
2114      }
2115      if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
2116         STORE( &r[1], 0, CHAN_Z );
2117      }
2118      if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
2119         STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
2120      }
2121      break;
2122
2123   case TGSI_OPCODE_MUL:
2124      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2125         FETCH(&r[0], 0, chan_index);
2126         FETCH(&r[1], 1, chan_index);
2127         micro_mul(&d[chan_index], &r[0], &r[1]);
2128      }
2129      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2130         STORE(&d[chan_index], 0, chan_index);
2131      }
2132      break;
2133
2134   case TGSI_OPCODE_ADD:
2135      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2136         FETCH( &r[0], 0, chan_index );
2137         FETCH( &r[1], 1, chan_index );
2138         micro_add(&d[chan_index], &r[0], &r[1]);
2139      }
2140      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2141         STORE(&d[chan_index], 0, chan_index);
2142      }
2143      break;
2144
2145   case TGSI_OPCODE_DP3:
2146   /* TGSI_OPCODE_DOT3 */
2147      FETCH( &r[0], 0, CHAN_X );
2148      FETCH( &r[1], 1, CHAN_X );
2149      micro_mul( &r[0], &r[0], &r[1] );
2150
2151      FETCH( &r[1], 0, CHAN_Y );
2152      FETCH( &r[2], 1, CHAN_Y );
2153      micro_mul( &r[1], &r[1], &r[2] );
2154      micro_add( &r[0], &r[0], &r[1] );
2155
2156      FETCH( &r[1], 0, CHAN_Z );
2157      FETCH( &r[2], 1, CHAN_Z );
2158      micro_mul( &r[1], &r[1], &r[2] );
2159      micro_add( &r[0], &r[0], &r[1] );
2160
2161      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2162         STORE( &r[0], 0, chan_index );
2163      }
2164      break;
2165
2166    case TGSI_OPCODE_DP4:
2167    /* TGSI_OPCODE_DOT4 */
2168       FETCH(&r[0], 0, CHAN_X);
2169       FETCH(&r[1], 1, CHAN_X);
2170
2171       micro_mul( &r[0], &r[0], &r[1] );
2172
2173       FETCH(&r[1], 0, CHAN_Y);
2174       FETCH(&r[2], 1, CHAN_Y);
2175
2176       micro_mul( &r[1], &r[1], &r[2] );
2177       micro_add( &r[0], &r[0], &r[1] );
2178
2179       FETCH(&r[1], 0, CHAN_Z);
2180       FETCH(&r[2], 1, CHAN_Z);
2181
2182       micro_mul( &r[1], &r[1], &r[2] );
2183       micro_add( &r[0], &r[0], &r[1] );
2184
2185       FETCH(&r[1], 0, CHAN_W);
2186       FETCH(&r[2], 1, CHAN_W);
2187
2188       micro_mul( &r[1], &r[1], &r[2] );
2189       micro_add( &r[0], &r[0], &r[1] );
2190
2191      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2192         STORE( &r[0], 0, chan_index );
2193      }
2194      break;
2195
2196   case TGSI_OPCODE_DST:
2197      if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
2198         FETCH( &r[0], 0, CHAN_Y );
2199         FETCH( &r[1], 1, CHAN_Y);
2200         micro_mul(&d[CHAN_Y], &r[0], &r[1]);
2201      }
2202      if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
2203         FETCH(&d[CHAN_Z], 0, CHAN_Z);
2204      }
2205      if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
2206         FETCH(&d[CHAN_W], 1, CHAN_W);
2207      }
2208
2209      if (IS_CHANNEL_ENABLED(*inst, CHAN_X)) {
2210         STORE(&mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_X);
2211      }
2212      if (IS_CHANNEL_ENABLED(*inst, CHAN_Y)) {
2213         STORE(&d[CHAN_Y], 0, CHAN_Y);
2214      }
2215      if (IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2216         STORE(&d[CHAN_Z], 0, CHAN_Z);
2217      }
2218      if (IS_CHANNEL_ENABLED(*inst, CHAN_W)) {
2219         STORE(&d[CHAN_W], 0, CHAN_W);
2220      }
2221      break;
2222
2223   case TGSI_OPCODE_MIN:
2224      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2225         FETCH(&r[0], 0, chan_index);
2226         FETCH(&r[1], 1, chan_index);
2227
2228         /* XXX use micro_min()?? */
2229         micro_lt(&d[chan_index], &r[0], &r[1], &r[0], &r[1]);
2230      }
2231      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2232         STORE(&d[chan_index], 0, chan_index);
2233      }
2234      break;
2235
2236   case TGSI_OPCODE_MAX:
2237      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2238         FETCH(&r[0], 0, chan_index);
2239         FETCH(&r[1], 1, chan_index);
2240
2241         /* XXX use micro_max()?? */
2242         micro_lt(&d[chan_index], &r[0], &r[1], &r[1], &r[0] );
2243      }
2244      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2245         STORE(&d[chan_index], 0, chan_index);
2246      }
2247      break;
2248
2249   case TGSI_OPCODE_SLT:
2250   /* TGSI_OPCODE_SETLT */
2251      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2252         FETCH( &r[0], 0, chan_index );
2253         FETCH( &r[1], 1, chan_index );
2254         micro_lt(&d[chan_index], &r[0], &r[1], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C]);
2255      }
2256      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2257         STORE(&d[chan_index], 0, chan_index);
2258      }
2259      break;
2260
2261   case TGSI_OPCODE_SGE:
2262   /* TGSI_OPCODE_SETGE */
2263      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2264         FETCH( &r[0], 0, chan_index );
2265         FETCH( &r[1], 1, chan_index );
2266         micro_le(&d[chan_index], &r[1], &r[0], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C]);
2267      }
2268      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2269         STORE(&d[chan_index], 0, chan_index);
2270      }
2271      break;
2272
2273   case TGSI_OPCODE_MAD:
2274   /* TGSI_OPCODE_MADD */
2275      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2276         FETCH( &r[0], 0, chan_index );
2277         FETCH( &r[1], 1, chan_index );
2278         micro_mul( &r[0], &r[0], &r[1] );
2279         FETCH( &r[1], 2, chan_index );
2280         micro_add(&d[chan_index], &r[0], &r[1]);
2281      }
2282      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2283         STORE(&d[chan_index], 0, chan_index);
2284      }
2285      break;
2286
2287   case TGSI_OPCODE_SUB:
2288      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2289         FETCH(&r[0], 0, chan_index);
2290         FETCH(&r[1], 1, chan_index);
2291         micro_sub(&d[chan_index], &r[0], &r[1]);
2292      }
2293      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2294         STORE(&d[chan_index], 0, chan_index);
2295      }
2296      break;
2297
2298   case TGSI_OPCODE_LRP:
2299      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2300         FETCH(&r[0], 0, chan_index);
2301         FETCH(&r[1], 1, chan_index);
2302         FETCH(&r[2], 2, chan_index);
2303         micro_sub( &r[1], &r[1], &r[2] );
2304         micro_mul( &r[0], &r[0], &r[1] );
2305         micro_add(&d[chan_index], &r[0], &r[2]);
2306      }
2307      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2308         STORE(&d[chan_index], 0, chan_index);
2309      }
2310      break;
2311
2312   case TGSI_OPCODE_CND:
2313      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2314         FETCH(&r[0], 0, chan_index);
2315         FETCH(&r[1], 1, chan_index);
2316         FETCH(&r[2], 2, chan_index);
2317         micro_lt(&d[chan_index], &mach->Temps[TEMP_HALF_I].xyzw[TEMP_HALF_C], &r[2], &r[0], &r[1]);
2318      }
2319      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2320         STORE(&d[chan_index], 0, chan_index);
2321      }
2322      break;
2323
2324   case TGSI_OPCODE_DP2A:
2325      FETCH( &r[0], 0, CHAN_X );
2326      FETCH( &r[1], 1, CHAN_X );
2327      micro_mul( &r[0], &r[0], &r[1] );
2328
2329      FETCH( &r[1], 0, CHAN_Y );
2330      FETCH( &r[2], 1, CHAN_Y );
2331      micro_mul( &r[1], &r[1], &r[2] );
2332      micro_add( &r[0], &r[0], &r[1] );
2333
2334      FETCH( &r[2], 2, CHAN_X );
2335      micro_add( &r[0], &r[0], &r[2] );
2336
2337      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2338         STORE( &r[0], 0, chan_index );
2339      }
2340      break;
2341
2342   case TGSI_OPCODE_FRC:
2343      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2344         FETCH( &r[0], 0, chan_index );
2345         micro_frc(&d[chan_index], &r[0]);
2346      }
2347      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2348         STORE(&d[chan_index], 0, chan_index);
2349      }
2350      break;
2351
2352   case TGSI_OPCODE_CLAMP:
2353      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2354         FETCH(&r[0], 0, chan_index);
2355         FETCH(&r[1], 1, chan_index);
2356         micro_max(&r[0], &r[0], &r[1]);
2357         FETCH(&r[1], 2, chan_index);
2358         micro_min(&d[chan_index], &r[0], &r[1]);
2359      }
2360      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2361         STORE(&d[chan_index], 0, chan_index);
2362      }
2363      break;
2364
2365   case TGSI_OPCODE_ROUND:
2366   case TGSI_OPCODE_ARR:
2367      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2368         FETCH( &r[0], 0, chan_index );
2369         micro_rnd(&d[chan_index], &r[0]);
2370      }
2371      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2372         STORE(&d[chan_index], 0, chan_index);
2373      }
2374      break;
2375
2376   case TGSI_OPCODE_EX2:
2377      FETCH(&r[0], 0, CHAN_X);
2378
2379      micro_exp2( &r[0], &r[0] );
2380
2381      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2382         STORE( &r[0], 0, chan_index );
2383      }
2384      break;
2385
2386   case TGSI_OPCODE_LG2:
2387      FETCH( &r[0], 0, CHAN_X );
2388      micro_lg2( &r[0], &r[0] );
2389      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2390         STORE( &r[0], 0, chan_index );
2391      }
2392      break;
2393
2394   case TGSI_OPCODE_POW:
2395      FETCH(&r[0], 0, CHAN_X);
2396      FETCH(&r[1], 1, CHAN_X);
2397
2398      micro_pow( &r[0], &r[0], &r[1] );
2399
2400      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2401         STORE( &r[0], 0, chan_index );
2402      }
2403      break;
2404
2405   case TGSI_OPCODE_XPD:
2406      FETCH(&r[0], 0, CHAN_Y);
2407      FETCH(&r[1], 1, CHAN_Z);
2408
2409      micro_mul( &r[2], &r[0], &r[1] );
2410
2411      FETCH(&r[3], 0, CHAN_Z);
2412      FETCH(&r[4], 1, CHAN_Y);
2413
2414      micro_mul( &r[5], &r[3], &r[4] );
2415      micro_sub(&d[CHAN_X], &r[2], &r[5]);
2416
2417      FETCH(&r[2], 1, CHAN_X);
2418
2419      micro_mul( &r[3], &r[3], &r[2] );
2420
2421      FETCH(&r[5], 0, CHAN_X);
2422
2423      micro_mul( &r[1], &r[1], &r[5] );
2424      micro_sub(&d[CHAN_Y], &r[3], &r[1]);
2425
2426      micro_mul( &r[5], &r[5], &r[4] );
2427      micro_mul( &r[0], &r[0], &r[2] );
2428      micro_sub(&d[CHAN_Z], &r[5], &r[0]);
2429
2430      if (IS_CHANNEL_ENABLED(*inst, CHAN_X)) {
2431         STORE(&d[CHAN_X], 0, CHAN_X);
2432      }
2433      if (IS_CHANNEL_ENABLED(*inst, CHAN_Y)) {
2434         STORE(&d[CHAN_Y], 0, CHAN_Y);
2435      }
2436      if (IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2437         STORE(&d[CHAN_Z], 0, CHAN_Z);
2438      }
2439      if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
2440         STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
2441      }
2442      break;
2443
2444    case TGSI_OPCODE_ABS:
2445       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2446          FETCH(&r[0], 0, chan_index);
2447          micro_abs(&d[chan_index], &r[0]);
2448       }
2449       FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2450         STORE(&d[chan_index], 0, chan_index);
2451      }
2452       break;
2453
2454   case TGSI_OPCODE_RCC:
2455      FETCH(&r[0], 0, CHAN_X);
2456      micro_div(&r[0], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &r[0]);
2457      micro_float_clamp(&r[0], &r[0]);
2458      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2459         STORE(&r[0], 0, chan_index);
2460      }
2461      break;
2462
2463   case TGSI_OPCODE_DPH:
2464      FETCH(&r[0], 0, CHAN_X);
2465      FETCH(&r[1], 1, CHAN_X);
2466
2467      micro_mul( &r[0], &r[0], &r[1] );
2468
2469      FETCH(&r[1], 0, CHAN_Y);
2470      FETCH(&r[2], 1, CHAN_Y);
2471
2472      micro_mul( &r[1], &r[1], &r[2] );
2473      micro_add( &r[0], &r[0], &r[1] );
2474
2475      FETCH(&r[1], 0, CHAN_Z);
2476      FETCH(&r[2], 1, CHAN_Z);
2477
2478      micro_mul( &r[1], &r[1], &r[2] );
2479      micro_add( &r[0], &r[0], &r[1] );
2480
2481      FETCH(&r[1], 1, CHAN_W);
2482
2483      micro_add( &r[0], &r[0], &r[1] );
2484
2485      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2486         STORE( &r[0], 0, chan_index );
2487      }
2488      break;
2489
2490   case TGSI_OPCODE_COS:
2491      FETCH(&r[0], 0, CHAN_X);
2492
2493      micro_cos( &r[0], &r[0] );
2494
2495      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2496         STORE( &r[0], 0, chan_index );
2497      }
2498      break;
2499
2500   case TGSI_OPCODE_DDX:
2501      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2502         FETCH( &r[0], 0, chan_index );
2503         micro_ddx(&d[chan_index], &r[0]);
2504      }
2505      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2506         STORE(&d[chan_index], 0, chan_index);
2507      }
2508      break;
2509
2510   case TGSI_OPCODE_DDY:
2511      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2512         FETCH( &r[0], 0, chan_index );
2513         micro_ddy(&d[chan_index], &r[0]);
2514      }
2515      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2516         STORE(&d[chan_index], 0, chan_index);
2517      }
2518      break;
2519
2520   case TGSI_OPCODE_KILP:
2521      exec_kilp (mach, inst);
2522      break;
2523
2524   case TGSI_OPCODE_KIL:
2525      exec_kil (mach, inst);
2526      break;
2527
2528   case TGSI_OPCODE_PK2H:
2529      assert (0);
2530      break;
2531
2532   case TGSI_OPCODE_PK2US:
2533      assert (0);
2534      break;
2535
2536   case TGSI_OPCODE_PK4B:
2537      assert (0);
2538      break;
2539
2540   case TGSI_OPCODE_PK4UB:
2541      assert (0);
2542      break;
2543
2544   case TGSI_OPCODE_RFL:
2545      if (IS_CHANNEL_ENABLED(*inst, CHAN_X) ||
2546          IS_CHANNEL_ENABLED(*inst, CHAN_Y) ||
2547          IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2548         /* r0 = dp3(src0, src0) */
2549         FETCH(&r[2], 0, CHAN_X);
2550         micro_mul(&r[0], &r[2], &r[2]);
2551         FETCH(&r[4], 0, CHAN_Y);
2552         micro_mul(&r[8], &r[4], &r[4]);
2553         micro_add(&r[0], &r[0], &r[8]);
2554         FETCH(&r[6], 0, CHAN_Z);
2555         micro_mul(&r[8], &r[6], &r[6]);
2556         micro_add(&r[0], &r[0], &r[8]);
2557
2558         /* r1 = dp3(src0, src1) */
2559         FETCH(&r[3], 1, CHAN_X);
2560         micro_mul(&r[1], &r[2], &r[3]);
2561         FETCH(&r[5], 1, CHAN_Y);
2562         micro_mul(&r[8], &r[4], &r[5]);
2563         micro_add(&r[1], &r[1], &r[8]);
2564         FETCH(&r[7], 1, CHAN_Z);
2565         micro_mul(&r[8], &r[6], &r[7]);
2566         micro_add(&r[1], &r[1], &r[8]);
2567
2568         /* r1 = 2 * r1 / r0 */
2569         micro_add(&r[1], &r[1], &r[1]);
2570         micro_div(&r[1], &r[1], &r[0]);
2571
2572         if (IS_CHANNEL_ENABLED(*inst, CHAN_X)) {
2573            micro_mul(&r[2], &r[2], &r[1]);
2574            micro_sub(&r[2], &r[2], &r[3]);
2575            STORE(&r[2], 0, CHAN_X);
2576         }
2577         if (IS_CHANNEL_ENABLED(*inst, CHAN_Y)) {
2578            micro_mul(&r[4], &r[4], &r[1]);
2579            micro_sub(&r[4], &r[4], &r[5]);
2580            STORE(&r[4], 0, CHAN_Y);
2581         }
2582         if (IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2583            micro_mul(&r[6], &r[6], &r[1]);
2584            micro_sub(&r[6], &r[6], &r[7]);
2585            STORE(&r[6], 0, CHAN_Z);
2586         }
2587      }
2588      if (IS_CHANNEL_ENABLED(*inst, CHAN_W)) {
2589         STORE(&mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W);
2590      }
2591      break;
2592
2593   case TGSI_OPCODE_SEQ:
2594      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2595         FETCH( &r[0], 0, chan_index );
2596         FETCH( &r[1], 1, chan_index );
2597         micro_eq(&d[chan_index], &r[0], &r[1], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C]);
2598      }
2599      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2600         STORE(&d[chan_index], 0, chan_index);
2601      }
2602      break;
2603
2604   case TGSI_OPCODE_SFL:
2605      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2606         STORE(&mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], 0, chan_index);
2607      }
2608      break;
2609
2610   case TGSI_OPCODE_SGT:
2611      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2612         FETCH( &r[0], 0, chan_index );
2613         FETCH( &r[1], 1, chan_index );
2614         micro_le(&d[chan_index], &r[0], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C]);
2615      }
2616      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2617         STORE(&d[chan_index], 0, chan_index);
2618      }
2619      break;
2620
2621   case TGSI_OPCODE_SIN:
2622      FETCH( &r[0], 0, CHAN_X );
2623      micro_sin( &r[0], &r[0] );
2624      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2625         STORE( &r[0], 0, chan_index );
2626      }
2627      break;
2628
2629   case TGSI_OPCODE_SLE:
2630      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2631         FETCH( &r[0], 0, chan_index );
2632         FETCH( &r[1], 1, chan_index );
2633         micro_le(&d[chan_index], &r[0], &r[1], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C]);
2634      }
2635      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2636         STORE(&d[chan_index], 0, chan_index);
2637      }
2638      break;
2639
2640   case TGSI_OPCODE_SNE:
2641      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2642         FETCH( &r[0], 0, chan_index );
2643         FETCH( &r[1], 1, chan_index );
2644         micro_eq(&d[chan_index], &r[0], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C]);
2645      }
2646      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2647         STORE(&d[chan_index], 0, chan_index);
2648      }
2649      break;
2650
2651   case TGSI_OPCODE_STR:
2652      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2653         STORE(&mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, chan_index);
2654      }
2655      break;
2656
2657   case TGSI_OPCODE_TEX:
2658      /* simple texture lookup */
2659      /* src[0] = texcoord */
2660      /* src[1] = sampler unit */
2661      exec_tex(mach, inst, FALSE, FALSE);
2662      break;
2663
2664   case TGSI_OPCODE_TXB:
2665      /* Texture lookup with lod bias */
2666      /* src[0] = texcoord (src[0].w = LOD bias) */
2667      /* src[1] = sampler unit */
2668      exec_tex(mach, inst, TRUE, FALSE);
2669      break;
2670
2671   case TGSI_OPCODE_TXD:
2672      /* Texture lookup with explicit partial derivatives */
2673      /* src[0] = texcoord */
2674      /* src[1] = d[strq]/dx */
2675      /* src[2] = d[strq]/dy */
2676      /* src[3] = sampler unit */
2677      exec_txd(mach, inst);
2678      break;
2679
2680   case TGSI_OPCODE_TXL:
2681      /* Texture lookup with explicit LOD */
2682      /* src[0] = texcoord (src[0].w = LOD) */
2683      /* src[1] = sampler unit */
2684      exec_tex(mach, inst, TRUE, FALSE);
2685      break;
2686
2687   case TGSI_OPCODE_TXP:
2688      /* Texture lookup with projection */
2689      /* src[0] = texcoord (src[0].w = projection) */
2690      /* src[1] = sampler unit */
2691      exec_tex(mach, inst, FALSE, TRUE);
2692      break;
2693
2694   case TGSI_OPCODE_UP2H:
2695      assert (0);
2696      break;
2697
2698   case TGSI_OPCODE_UP2US:
2699      assert (0);
2700      break;
2701
2702   case TGSI_OPCODE_UP4B:
2703      assert (0);
2704      break;
2705
2706   case TGSI_OPCODE_UP4UB:
2707      assert (0);
2708      break;
2709
2710   case TGSI_OPCODE_X2D:
2711      FETCH(&r[0], 1, CHAN_X);
2712      FETCH(&r[1], 1, CHAN_Y);
2713      if (IS_CHANNEL_ENABLED(*inst, CHAN_X) ||
2714          IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2715         FETCH(&r[2], 2, CHAN_X);
2716         micro_mul(&r[2], &r[2], &r[0]);
2717         FETCH(&r[3], 2, CHAN_Y);
2718         micro_mul(&r[3], &r[3], &r[1]);
2719         micro_add(&r[2], &r[2], &r[3]);
2720         FETCH(&r[3], 0, CHAN_X);
2721         micro_add(&d[CHAN_X], &r[2], &r[3]);
2722
2723      }
2724      if (IS_CHANNEL_ENABLED(*inst, CHAN_Y) ||
2725          IS_CHANNEL_ENABLED(*inst, CHAN_W)) {
2726         FETCH(&r[2], 2, CHAN_Z);
2727         micro_mul(&r[2], &r[2], &r[0]);
2728         FETCH(&r[3], 2, CHAN_W);
2729         micro_mul(&r[3], &r[3], &r[1]);
2730         micro_add(&r[2], &r[2], &r[3]);
2731         FETCH(&r[3], 0, CHAN_Y);
2732         micro_add(&d[CHAN_Y], &r[2], &r[3]);
2733
2734      }
2735      if (IS_CHANNEL_ENABLED(*inst, CHAN_X)) {
2736         STORE(&d[CHAN_X], 0, CHAN_X);
2737      }
2738      if (IS_CHANNEL_ENABLED(*inst, CHAN_Y)) {
2739         STORE(&d[CHAN_Y], 0, CHAN_Y);
2740      }
2741      if (IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2742         STORE(&d[CHAN_X], 0, CHAN_Z);
2743      }
2744      if (IS_CHANNEL_ENABLED(*inst, CHAN_W)) {
2745         STORE(&d[CHAN_Y], 0, CHAN_W);
2746      }
2747      break;
2748
2749   case TGSI_OPCODE_ARA:
2750      assert (0);
2751      break;
2752
2753   case TGSI_OPCODE_BRA:
2754      assert (0);
2755      break;
2756
2757   case TGSI_OPCODE_CAL:
2758      /* skip the call if no execution channels are enabled */
2759      if (mach->ExecMask) {
2760         /* do the call */
2761
2762         /* First, record the depths of the execution stacks.
2763          * This is important for deeply nested/looped return statements.
2764          * We have to unwind the stacks by the correct amount.  For a
2765          * real code generator, we could determine the number of entries
2766          * to pop off each stack with simple static analysis and avoid
2767          * implementing this data structure at run time.
2768          */
2769         mach->CallStack[mach->CallStackTop].CondStackTop = mach->CondStackTop;
2770         mach->CallStack[mach->CallStackTop].LoopStackTop = mach->LoopStackTop;
2771         mach->CallStack[mach->CallStackTop].ContStackTop = mach->ContStackTop;
2772         /* note that PC was already incremented above */
2773         mach->CallStack[mach->CallStackTop].ReturnAddr = *pc;
2774
2775         mach->CallStackTop++;
2776
2777         /* Second, push the Cond, Loop, Cont, Func stacks */
2778         assert(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING);
2779         mach->CondStack[mach->CondStackTop++] = mach->CondMask;
2780         assert(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
2781         mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask;
2782         assert(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
2783         mach->ContStack[mach->ContStackTop++] = mach->ContMask;
2784         assert(mach->FuncStackTop < TGSI_EXEC_MAX_CALL_NESTING);
2785         mach->FuncStack[mach->FuncStackTop++] = mach->FuncMask;
2786
2787         /* Finally, jump to the subroutine */
2788         *pc = inst->Label.Label;
2789      }
2790      break;
2791
2792   case TGSI_OPCODE_RET:
2793      mach->FuncMask &= ~mach->ExecMask;
2794      UPDATE_EXEC_MASK(mach);
2795
2796      if (mach->FuncMask == 0x0) {
2797         /* really return now (otherwise, keep executing) */
2798
2799         if (mach->CallStackTop == 0) {
2800            /* returning from main() */
2801            *pc = -1;
2802            return;
2803         }
2804
2805         assert(mach->CallStackTop > 0);
2806         mach->CallStackTop--;
2807
2808         mach->CondStackTop = mach->CallStack[mach->CallStackTop].CondStackTop;
2809         mach->CondMask = mach->CondStack[mach->CondStackTop];
2810
2811         mach->LoopStackTop = mach->CallStack[mach->CallStackTop].LoopStackTop;
2812         mach->LoopMask = mach->LoopStack[mach->LoopStackTop];
2813
2814         mach->ContStackTop = mach->CallStack[mach->CallStackTop].ContStackTop;
2815         mach->ContMask = mach->ContStack[mach->ContStackTop];
2816
2817         assert(mach->FuncStackTop > 0);
2818         mach->FuncMask = mach->FuncStack[--mach->FuncStackTop];
2819
2820         *pc = mach->CallStack[mach->CallStackTop].ReturnAddr;
2821
2822         UPDATE_EXEC_MASK(mach);
2823      }
2824      break;
2825
2826   case TGSI_OPCODE_SSG:
2827   /* TGSI_OPCODE_SGN */
2828      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2829         FETCH( &r[0], 0, chan_index );
2830         micro_sgn(&d[chan_index], &r[0]);
2831      }
2832      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2833         STORE(&d[chan_index], 0, chan_index);
2834      }
2835      break;
2836
2837   case TGSI_OPCODE_CMP:
2838      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2839         FETCH(&r[0], 0, chan_index);
2840         FETCH(&r[1], 1, chan_index);
2841         FETCH(&r[2], 2, chan_index);
2842         micro_lt(&d[chan_index], &r[0], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &r[1], &r[2]);
2843      }
2844      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2845         STORE(&d[chan_index], 0, chan_index);
2846      }
2847      break;
2848
2849   case TGSI_OPCODE_SCS:
2850      if( IS_CHANNEL_ENABLED( *inst, CHAN_X ) || IS_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
2851         FETCH( &r[0], 0, CHAN_X );
2852         if (IS_CHANNEL_ENABLED(*inst, CHAN_X)) {
2853            micro_cos(&r[1], &r[0]);
2854            STORE(&r[1], 0, CHAN_X);
2855         }
2856         if (IS_CHANNEL_ENABLED(*inst, CHAN_Y)) {
2857            micro_sin(&r[1], &r[0]);
2858            STORE(&r[1], 0, CHAN_Y);
2859         }
2860      }
2861      if( IS_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
2862         STORE( &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], 0, CHAN_Z );
2863      }
2864      if( IS_CHANNEL_ENABLED( *inst, CHAN_W ) ) {
2865         STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
2866      }
2867      break;
2868
2869   case TGSI_OPCODE_NRM:
2870      /* 3-component vector normalize */
2871      if(IS_CHANNEL_ENABLED(*inst, CHAN_X) ||
2872         IS_CHANNEL_ENABLED(*inst, CHAN_Y) ||
2873         IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2874         /* r3 = sqrt(dp3(src0, src0)) */
2875         FETCH(&r[0], 0, CHAN_X);
2876         micro_mul(&r[3], &r[0], &r[0]);
2877         FETCH(&r[1], 0, CHAN_Y);
2878         micro_mul(&r[4], &r[1], &r[1]);
2879         micro_add(&r[3], &r[3], &r[4]);
2880         FETCH(&r[2], 0, CHAN_Z);
2881         micro_mul(&r[4], &r[2], &r[2]);
2882         micro_add(&r[3], &r[3], &r[4]);
2883         micro_sqrt(&r[3], &r[3]);
2884
2885         if (IS_CHANNEL_ENABLED(*inst, CHAN_X)) {
2886            micro_div(&r[0], &r[0], &r[3]);
2887            STORE(&r[0], 0, CHAN_X);
2888         }
2889         if (IS_CHANNEL_ENABLED(*inst, CHAN_Y)) {
2890            micro_div(&r[1], &r[1], &r[3]);
2891            STORE(&r[1], 0, CHAN_Y);
2892         }
2893         if (IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2894            micro_div(&r[2], &r[2], &r[3]);
2895            STORE(&r[2], 0, CHAN_Z);
2896         }
2897      }
2898      if (IS_CHANNEL_ENABLED(*inst, CHAN_W)) {
2899         STORE(&mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W);
2900      }
2901      break;
2902
2903   case TGSI_OPCODE_NRM4:
2904      /* 4-component vector normalize */
2905      {
2906         union tgsi_exec_channel tmp, dot;
2907
2908         /* tmp = dp4(src0, src0): */
2909         FETCH( &r[0], 0, CHAN_X );
2910         micro_mul( &tmp, &r[0], &r[0] );
2911
2912         FETCH( &r[1], 0, CHAN_Y );
2913         micro_mul( &dot, &r[1], &r[1] );
2914         micro_add( &tmp, &tmp, &dot );
2915
2916         FETCH( &r[2], 0, CHAN_Z );
2917         micro_mul( &dot, &r[2], &r[2] );
2918         micro_add( &tmp, &tmp, &dot );
2919
2920         FETCH( &r[3], 0, CHAN_W );
2921         micro_mul( &dot, &r[3], &r[3] );
2922         micro_add( &tmp, &tmp, &dot );
2923
2924         /* tmp = 1 / sqrt(tmp) */
2925         micro_sqrt( &tmp, &tmp );
2926         micro_div( &tmp, &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &tmp );
2927
2928         FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2929            /* chan = chan * tmp */
2930            micro_mul( &r[chan_index], &tmp, &r[chan_index] );
2931            STORE( &r[chan_index], 0, chan_index );
2932         }
2933      }
2934      break;
2935
2936   case TGSI_OPCODE_DIV:
2937      assert( 0 );
2938      break;
2939
2940   case TGSI_OPCODE_DP2:
2941      FETCH( &r[0], 0, CHAN_X );
2942      FETCH( &r[1], 1, CHAN_X );
2943      micro_mul( &r[0], &r[0], &r[1] );
2944
2945      FETCH( &r[1], 0, CHAN_Y );
2946      FETCH( &r[2], 1, CHAN_Y );
2947      micro_mul( &r[1], &r[1], &r[2] );
2948      micro_add( &r[0], &r[0], &r[1] );
2949
2950      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2951         STORE( &r[0], 0, chan_index );
2952      }
2953      break;
2954
2955   case TGSI_OPCODE_IF:
2956      /* push CondMask */
2957      assert(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING);
2958      mach->CondStack[mach->CondStackTop++] = mach->CondMask;
2959      FETCH( &r[0], 0, CHAN_X );
2960      /* update CondMask */
2961      if( ! r[0].u[0] ) {
2962         mach->CondMask &= ~0x1;
2963      }
2964      if( ! r[0].u[1] ) {
2965         mach->CondMask &= ~0x2;
2966      }
2967      if( ! r[0].u[2] ) {
2968         mach->CondMask &= ~0x4;
2969      }
2970      if( ! r[0].u[3] ) {
2971         mach->CondMask &= ~0x8;
2972      }
2973      UPDATE_EXEC_MASK(mach);
2974      /* Todo: If CondMask==0, jump to ELSE */
2975      break;
2976
2977   case TGSI_OPCODE_ELSE:
2978      /* invert CondMask wrt previous mask */
2979      {
2980         uint prevMask;
2981         assert(mach->CondStackTop > 0);
2982         prevMask = mach->CondStack[mach->CondStackTop - 1];
2983         mach->CondMask = ~mach->CondMask & prevMask;
2984         UPDATE_EXEC_MASK(mach);
2985         /* Todo: If CondMask==0, jump to ENDIF */
2986      }
2987      break;
2988
2989   case TGSI_OPCODE_ENDIF:
2990      /* pop CondMask */
2991      assert(mach->CondStackTop > 0);
2992      mach->CondMask = mach->CondStack[--mach->CondStackTop];
2993      UPDATE_EXEC_MASK(mach);
2994      break;
2995
2996   case TGSI_OPCODE_END:
2997      /* halt execution */
2998      *pc = -1;
2999      break;
3000
3001   case TGSI_OPCODE_REP:
3002      assert (0);
3003      break;
3004
3005   case TGSI_OPCODE_ENDREP:
3006       assert (0);
3007       break;
3008
3009   case TGSI_OPCODE_PUSHA:
3010      assert (0);
3011      break;
3012
3013   case TGSI_OPCODE_POPA:
3014      assert (0);
3015      break;
3016
3017   case TGSI_OPCODE_CEIL:
3018      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
3019         FETCH( &r[0], 0, chan_index );
3020         micro_ceil(&d[chan_index], &r[0]);
3021      }
3022      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
3023         STORE(&d[chan_index], 0, chan_index);
3024      }
3025      break;
3026
3027   case TGSI_OPCODE_I2F:
3028      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
3029         FETCH( &r[0], 0, chan_index );
3030         micro_i2f(&d[chan_index], &r[0]);
3031      }
3032      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
3033         STORE(&d[chan_index], 0, chan_index);
3034      }
3035      break;
3036
3037   case TGSI_OPCODE_NOT:
3038      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
3039         FETCH( &r[0], 0, chan_index );
3040         micro_not(&d[chan_index], &r[0]);
3041      }
3042      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
3043         STORE(&d[chan_index], 0, chan_index);
3044      }
3045      break;
3046
3047   case TGSI_OPCODE_TRUNC:
3048      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
3049         FETCH( &r[0], 0, chan_index );
3050         micro_trunc(&d[chan_index], &r[0]);
3051      }
3052      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
3053         STORE(&d[chan_index], 0, chan_index);
3054      }
3055      break;
3056
3057   case TGSI_OPCODE_SHL:
3058      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
3059         FETCH( &r[0], 0, chan_index );
3060         FETCH( &r[1], 1, chan_index );
3061         micro_shl(&d[chan_index], &r[0], &r[1]);
3062      }
3063      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
3064         STORE(&d[chan_index], 0, chan_index);
3065      }
3066      break;
3067
3068   case TGSI_OPCODE_SHR:
3069      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
3070         FETCH( &r[0], 0, chan_index );
3071         FETCH( &r[1], 1, chan_index );
3072         micro_ishr(&d[chan_index], &r[0], &r[1]);
3073      }
3074      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
3075         STORE(&d[chan_index], 0, chan_index);
3076      }
3077      break;
3078
3079   case TGSI_OPCODE_AND:
3080      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
3081         FETCH( &r[0], 0, chan_index );
3082         FETCH( &r[1], 1, chan_index );
3083         micro_and(&d[chan_index], &r[0], &r[1]);
3084      }
3085      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
3086         STORE(&d[chan_index], 0, chan_index);
3087      }
3088      break;
3089
3090   case TGSI_OPCODE_OR:
3091      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
3092         FETCH( &r[0], 0, chan_index );
3093         FETCH( &r[1], 1, chan_index );
3094         micro_or(&d[chan_index], &r[0], &r[1]);
3095      }
3096      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
3097         STORE(&d[chan_index], 0, chan_index);
3098      }
3099      break;
3100
3101   case TGSI_OPCODE_MOD:
3102      assert (0);
3103      break;
3104
3105   case TGSI_OPCODE_XOR:
3106      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
3107         FETCH( &r[0], 0, chan_index );
3108         FETCH( &r[1], 1, chan_index );
3109         micro_xor(&d[chan_index], &r[0], &r[1]);
3110      }
3111      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
3112         STORE(&d[chan_index], 0, chan_index);
3113      }
3114      break;
3115
3116   case TGSI_OPCODE_SAD:
3117      assert (0);
3118      break;
3119
3120   case TGSI_OPCODE_TXF:
3121      assert (0);
3122      break;
3123
3124   case TGSI_OPCODE_TXQ:
3125      assert (0);
3126      break;
3127
3128   case TGSI_OPCODE_EMIT:
3129      emit_vertex(mach);
3130      break;
3131
3132   case TGSI_OPCODE_ENDPRIM:
3133      emit_primitive(mach);
3134      break;
3135
3136   case TGSI_OPCODE_BGNFOR:
3137      assert(mach->LoopCounterStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
3138      for (chan_index = 0; chan_index < 3; chan_index++) {
3139         FETCH( &mach->LoopCounterStack[mach->LoopCounterStackTop].xyzw[chan_index], 0, chan_index );
3140      }
3141      ++mach->LoopCounterStackTop;
3142      STORE(&mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_X], 0, CHAN_X);
3143      /* update LoopMask */
3144      if (mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y].f[0] <= 0.0f) {
3145         mach->LoopMask &= ~0x1;
3146      }
3147      if (mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y].f[1] <= 0.0f) {
3148         mach->LoopMask &= ~0x2;
3149      }
3150      if (mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y].f[2] <= 0.0f) {
3151         mach->LoopMask &= ~0x4;
3152      }
3153      if (mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y].f[3] <= 0.0f) {
3154         mach->LoopMask &= ~0x8;
3155      }
3156      /* TODO: if mach->LoopMask == 0, jump to end of loop */
3157      UPDATE_EXEC_MASK(mach);
3158      /* fall-through (for now) */
3159   case TGSI_OPCODE_BGNLOOP:
3160      /* push LoopMask and ContMasks */
3161      assert(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
3162      mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask;
3163      assert(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
3164      mach->ContStack[mach->ContStackTop++] = mach->ContMask;
3165      assert(mach->LoopLabelStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
3166      mach->LoopLabelStack[mach->LoopLabelStackTop++] = *pc - 1;
3167      break;
3168
3169   case TGSI_OPCODE_ENDFOR:
3170      assert(mach->LoopCounterStackTop > 0);
3171      micro_sub(&mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y],
3172                &mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y],
3173                &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C]);
3174      /* update LoopMask */
3175      if (mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y].f[0] <= 0.0f) {
3176         mach->LoopMask &= ~0x1;
3177      }
3178      if (mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y].f[1] <= 0.0f) {
3179         mach->LoopMask &= ~0x2;
3180      }
3181      if (mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y].f[2] <= 0.0f) {
3182         mach->LoopMask &= ~0x4;
3183      }
3184      if (mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y].f[3] <= 0.0f) {
3185         mach->LoopMask &= ~0x8;
3186      }
3187      micro_add(&mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_X],
3188                &mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_X],
3189                &mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Z]);
3190      assert(mach->LoopLabelStackTop > 0);
3191      inst = mach->Instructions + mach->LoopLabelStack[mach->LoopLabelStackTop - 1];
3192      STORE(&mach->LoopCounterStack[mach->LoopCounterStackTop].xyzw[CHAN_X], 0, CHAN_X);
3193      /* Restore ContMask, but don't pop */
3194      assert(mach->ContStackTop > 0);
3195      mach->ContMask = mach->ContStack[mach->ContStackTop - 1];
3196      UPDATE_EXEC_MASK(mach);
3197      if (mach->ExecMask) {
3198         /* repeat loop: jump to instruction just past BGNLOOP */
3199         assert(mach->LoopLabelStackTop > 0);
3200         *pc = mach->LoopLabelStack[mach->LoopLabelStackTop - 1] + 1;
3201      }
3202      else {
3203         /* exit loop: pop LoopMask */
3204         assert(mach->LoopStackTop > 0);
3205         mach->LoopMask = mach->LoopStack[--mach->LoopStackTop];
3206         /* pop ContMask */
3207         assert(mach->ContStackTop > 0);
3208         mach->ContMask = mach->ContStack[--mach->ContStackTop];
3209         assert(mach->LoopLabelStackTop > 0);
3210         --mach->LoopLabelStackTop;
3211         assert(mach->LoopCounterStackTop > 0);
3212         --mach->LoopCounterStackTop;
3213      }
3214      UPDATE_EXEC_MASK(mach);
3215      break;
3216
3217   case TGSI_OPCODE_ENDLOOP:
3218      /* Restore ContMask, but don't pop */
3219      assert(mach->ContStackTop > 0);
3220      mach->ContMask = mach->ContStack[mach->ContStackTop - 1];
3221      UPDATE_EXEC_MASK(mach);
3222      if (mach->ExecMask) {
3223         /* repeat loop: jump to instruction just past BGNLOOP */
3224         assert(mach->LoopLabelStackTop > 0);
3225         *pc = mach->LoopLabelStack[mach->LoopLabelStackTop - 1] + 1;
3226      }
3227      else {
3228         /* exit loop: pop LoopMask */
3229         assert(mach->LoopStackTop > 0);
3230         mach->LoopMask = mach->LoopStack[--mach->LoopStackTop];
3231         /* pop ContMask */
3232         assert(mach->ContStackTop > 0);
3233         mach->ContMask = mach->ContStack[--mach->ContStackTop];
3234         assert(mach->LoopLabelStackTop > 0);
3235         --mach->LoopLabelStackTop;
3236      }
3237      UPDATE_EXEC_MASK(mach);
3238      break;
3239
3240   case TGSI_OPCODE_BRK:
3241      /* turn off loop channels for each enabled exec channel */
3242      mach->LoopMask &= ~mach->ExecMask;
3243      /* Todo: if mach->LoopMask == 0, jump to end of loop */
3244      UPDATE_EXEC_MASK(mach);
3245      break;
3246
3247   case TGSI_OPCODE_CONT:
3248      /* turn off cont channels for each enabled exec channel */
3249      mach->ContMask &= ~mach->ExecMask;
3250      /* Todo: if mach->LoopMask == 0, jump to end of loop */
3251      UPDATE_EXEC_MASK(mach);
3252      break;
3253
3254   case TGSI_OPCODE_BGNSUB:
3255      /* no-op */
3256      break;
3257
3258   case TGSI_OPCODE_ENDSUB:
3259      /*
3260       * XXX: This really should be a no-op. We should never reach this opcode.
3261       */
3262
3263      assert(mach->CallStackTop > 0);
3264      mach->CallStackTop--;
3265
3266      mach->CondStackTop = mach->CallStack[mach->CallStackTop].CondStackTop;
3267      mach->CondMask = mach->CondStack[mach->CondStackTop];
3268
3269      mach->LoopStackTop = mach->CallStack[mach->CallStackTop].LoopStackTop;
3270      mach->LoopMask = mach->LoopStack[mach->LoopStackTop];
3271
3272      mach->ContStackTop = mach->CallStack[mach->CallStackTop].ContStackTop;
3273      mach->ContMask = mach->ContStack[mach->ContStackTop];
3274
3275      assert(mach->FuncStackTop > 0);
3276      mach->FuncMask = mach->FuncStack[--mach->FuncStackTop];
3277
3278      *pc = mach->CallStack[mach->CallStackTop].ReturnAddr;
3279
3280      UPDATE_EXEC_MASK(mach);
3281      break;
3282
3283   case TGSI_OPCODE_NOP:
3284      break;
3285
3286   case TGSI_OPCODE_BREAKC:
3287      FETCH(&r[0], 0, CHAN_X);
      /* update LoopMask */
3289      if (r[0].u[0] && (mach->ExecMask & 0x1)) {
3290         mach->LoopMask &= ~0x1;
3291      }
3292      if (r[0].u[1] && (mach->ExecMask & 0x2)) {
3293         mach->LoopMask &= ~0x2;
3294      }
3295      if (r[0].u[2] && (mach->ExecMask & 0x4)) {
3296         mach->LoopMask &= ~0x4;
3297      }
3298      if (r[0].u[3] && (mach->ExecMask & 0x8)) {
3299         mach->LoopMask &= ~0x8;
3300      }
3301      /* Todo: if mach->LoopMask == 0, jump to end of loop */
3302      UPDATE_EXEC_MASK(mach);
3303      break;
3304
3305   default:
3306      assert( 0 );
3307   }
3308}
3309
3310#define DEBUG_EXECUTION 0
3311
3312
3313/**
3314 * Run TGSI interpreter.
3315 * \return bitmask of "alive" quad components
3316 */
3317uint
3318tgsi_exec_machine_run( struct tgsi_exec_machine *mach )
3319{
3320   uint i;
3321   int pc = 0;
3322
3323   mach->CondMask = 0xf;
3324   mach->LoopMask = 0xf;
3325   mach->ContMask = 0xf;
3326   mach->FuncMask = 0xf;
3327   mach->ExecMask = 0xf;
3328
3329   assert(mach->CondStackTop == 0);
3330   assert(mach->LoopStackTop == 0);
3331   assert(mach->ContStackTop == 0);
3332   assert(mach->CallStackTop == 0);
3333
3334   mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] = 0;
3335   mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] = 0;
3336
3337   if( mach->Processor == TGSI_PROCESSOR_GEOMETRY ) {
3338      mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0] = 0;
3339      mach->Primitives[0] = 0;
3340   }
3341
3342   for (i = 0; i < QUAD_SIZE; i++) {
3343      mach->Temps[TEMP_CC_I].xyzw[TEMP_CC_C].u[i] =
3344         (TGSI_EXEC_CC_EQ << TGSI_EXEC_CC_X_SHIFT) |
3345         (TGSI_EXEC_CC_EQ << TGSI_EXEC_CC_Y_SHIFT) |
3346         (TGSI_EXEC_CC_EQ << TGSI_EXEC_CC_Z_SHIFT) |
3347         (TGSI_EXEC_CC_EQ << TGSI_EXEC_CC_W_SHIFT);
3348   }
3349
3350   /* execute declarations (interpolants) */
3351   for (i = 0; i < mach->NumDeclarations; i++) {
3352      exec_declaration( mach, mach->Declarations+i );
3353   }
3354
3355   {
3356#if DEBUG_EXECUTION
3357      struct tgsi_exec_vector temps[TGSI_EXEC_NUM_TEMPS + TGSI_EXEC_NUM_TEMP_EXTRAS];
3358      struct tgsi_exec_vector outputs[PIPE_MAX_ATTRIBS];
3359      uint inst = 1;
3360
3361      memcpy(temps, mach->Temps, sizeof(temps));
3362      memcpy(outputs, mach->Outputs, sizeof(outputs));
3363#endif
3364
3365      /* execute instructions, until pc is set to -1 */
3366      while (pc != -1) {
3367
3368#if DEBUG_EXECUTION
3369         uint i;
3370
3371         tgsi_dump_instruction(&mach->Instructions[pc], inst++);
3372#endif
3373
3374         assert(pc < (int) mach->NumInstructions);
3375         exec_instruction(mach, mach->Instructions + pc, &pc);
3376
3377#if DEBUG_EXECUTION
3378         for (i = 0; i < TGSI_EXEC_NUM_TEMPS + TGSI_EXEC_NUM_TEMP_EXTRAS; i++) {
3379            if (memcmp(&temps[i], &mach->Temps[i], sizeof(temps[i]))) {
3380               uint j;
3381
3382               memcpy(&temps[i], &mach->Temps[i], sizeof(temps[i]));
3383               debug_printf("TEMP[%2u] = ", i);
3384               for (j = 0; j < 4; j++) {
3385                  if (j > 0) {
3386                     debug_printf("           ");
3387                  }
3388                  debug_printf("(%6f, %6f, %6f, %6f)\n",
3389                               temps[i].xyzw[0].f[j],
3390                               temps[i].xyzw[1].f[j],
3391                               temps[i].xyzw[2].f[j],
3392                               temps[i].xyzw[3].f[j]);
3393               }
3394            }
3395         }
3396         for (i = 0; i < PIPE_MAX_ATTRIBS; i++) {
3397            if (memcmp(&outputs[i], &mach->Outputs[i], sizeof(outputs[i]))) {
3398               uint j;
3399
3400               memcpy(&outputs[i], &mach->Outputs[i], sizeof(outputs[i]));
3401               debug_printf("OUT[%2u] =  ", i);
3402               for (j = 0; j < 4; j++) {
3403                  if (j > 0) {
3404                     debug_printf("           ");
3405                  }
3406                  debug_printf("{%6f, %6f, %6f, %6f}\n",
3407                               outputs[i].xyzw[0].f[j],
3408                               outputs[i].xyzw[1].f[j],
3409                               outputs[i].xyzw[2].f[j],
3410                               outputs[i].xyzw[3].f[j]);
3411               }
3412            }
3413         }
3414#endif
3415      }
3416   }
3417
3418#if 0
3419   /* we scale from floats in [0,1] to Zbuffer ints in sp_quad_depth_test.c */
3420   if (mach->Processor == TGSI_PROCESSOR_FRAGMENT) {
3421      /*
3422       * Scale back depth component.
3423       */
3424      for (i = 0; i < 4; i++)
3425         mach->Outputs[0].xyzw[2].f[i] *= ctx->DrawBuffer->_DepthMaxF;
3426   }
3427#endif
3428
3429   assert(mach->CondStackTop == 0);
3430   assert(mach->LoopStackTop == 0);
3431   assert(mach->ContStackTop == 0);
3432   assert(mach->CallStackTop == 0);
3433
3434   return ~mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
3435}
3436