tgsi_exec.c revision 3ff688ea299581e60caf5d6e1a464f68c717fe83
1/**************************************************************************
2 *
3 * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28/**
29 * TGSI interpreter/executor.
30 *
31 * Flow control information:
32 *
33 * Since we operate on 'quads' (4 pixels or 4 vertices in parallel)
34 * flow control statements (IF/ELSE/ENDIF, LOOP/ENDLOOP) require special
35 * care since a condition may be true for some quad components but false
36 * for other components.
37 *
38 * We basically execute all statements (even if they're in the part of
39 * an IF/ELSE clause that's "not taken") and use a special mask to
40 * control writing to destination registers.  This is the ExecMask.
41 * See store_dest().
42 *
 * The ExecMask is computed from four other masks (CondMask, LoopMask,
 * ContMask and FuncMask; see UPDATE_EXEC_MASK) which are controlled by the
 * flow control instructions (for example IF/ELSE/ENDIF, LOOP/ENDLOOP and CONT).
46 *
47 *
48 * Authors:
49 *   Michal Krol
50 *   Brian Paul
51 */
52
53#include "pipe/p_compiler.h"
54#include "pipe/p_state.h"
55#include "pipe/p_shader_tokens.h"
56#include "tgsi/tgsi_dump.h"
57#include "tgsi/tgsi_parse.h"
58#include "tgsi/tgsi_util.h"
59#include "tgsi_exec.h"
60#include "util/u_memory.h"
61#include "util/u_math.h"
62
/** Non-zero selects the util_fast_* approximations in the math helpers. */
#define FAST_MATH 1

/* Lane index of each pixel within a 2x2 quad. */
#define TILE_TOP_LEFT      0
#define TILE_TOP_RIGHT     1
#define TILE_BOTTOM_LEFT   2
#define TILE_BOTTOM_RIGHT  3

/* Index of each vector component (channel) within a register. */
#define CHAN_X   0
#define CHAN_Y   1
#define CHAN_Z   2
#define CHAN_W   3
74
/*
 * Short aliases for the utility registers declared in tgsi_exec.h.
 * For each register the _I alias is its index into the temporaries array
 * and the _C alias is the channel within that temporary.
 */
#define TEMP_0_I          TGSI_EXEC_TEMP_00000000_I
#define TEMP_0_C          TGSI_EXEC_TEMP_00000000_C
#define TEMP_7F_I         TGSI_EXEC_TEMP_7FFFFFFF_I
#define TEMP_7F_C         TGSI_EXEC_TEMP_7FFFFFFF_C
#define TEMP_80_I         TGSI_EXEC_TEMP_80000000_I
#define TEMP_80_C         TGSI_EXEC_TEMP_80000000_C
#define TEMP_FF_I         TGSI_EXEC_TEMP_FFFFFFFF_I
#define TEMP_FF_C         TGSI_EXEC_TEMP_FFFFFFFF_C
#define TEMP_1_I          TGSI_EXEC_TEMP_ONE_I
#define TEMP_1_C          TGSI_EXEC_TEMP_ONE_C
#define TEMP_2_I          TGSI_EXEC_TEMP_TWO_I
#define TEMP_2_C          TGSI_EXEC_TEMP_TWO_C
#define TEMP_128_I        TGSI_EXEC_TEMP_128_I
#define TEMP_128_C        TGSI_EXEC_TEMP_128_C
#define TEMP_M128_I       TGSI_EXEC_TEMP_MINUS_128_I
#define TEMP_M128_C       TGSI_EXEC_TEMP_MINUS_128_C
#define TEMP_KILMASK_I    TGSI_EXEC_TEMP_KILMASK_I
#define TEMP_KILMASK_C    TGSI_EXEC_TEMP_KILMASK_C
#define TEMP_OUTPUT_I     TGSI_EXEC_TEMP_OUTPUT_I
#define TEMP_OUTPUT_C     TGSI_EXEC_TEMP_OUTPUT_C
#define TEMP_PRIMITIVE_I  TGSI_EXEC_TEMP_PRIMITIVE_I
#define TEMP_PRIMITIVE_C  TGSI_EXEC_TEMP_PRIMITIVE_C
#define TEMP_CC_I         TGSI_EXEC_TEMP_CC_I
#define TEMP_CC_C         TGSI_EXEC_TEMP_CC_C
#define TEMP_3_I          TGSI_EXEC_TEMP_THREE_I
#define TEMP_3_C          TGSI_EXEC_TEMP_THREE_C
#define TEMP_HALF_I       TGSI_EXEC_TEMP_HALF_I
#define TEMP_HALF_C       TGSI_EXEC_TEMP_HALF_C
#define TEMP_R0           TGSI_EXEC_TEMP_R0
#define TEMP_P0           TGSI_EXEC_TEMP_P0
108
/** Non-zero when channel CHAN is set in the write mask of INST's first destination. */
#define IS_CHANNEL_ENABLED(INST, CHAN) \
   ((INST).Dst[0].Register.WriteMask & (1 << (CHAN)))

/** Same as IS_CHANNEL_ENABLED, but for INST's second destination. */
#define IS_CHANNEL_ENABLED2(INST, CHAN) \
   ((INST).Dst[1].Register.WriteMask & (1 << (CHAN)))

/**
 * Loop header: runs the following statement once for every channel enabled
 * in the first destination's write mask. CHAN must be an assignable integer
 * variable. The expansion ends in an "if", so take care with a trailing
 * "else" at the use site.
 */
#define FOR_EACH_ENABLED_CHANNEL(INST, CHAN) \
   for ((CHAN) = 0; (CHAN) < NUM_CHANNELS; (CHAN)++) \
      if (IS_CHANNEL_ENABLED(INST, CHAN))

/** Same as FOR_EACH_ENABLED_CHANNEL, but for the second destination. */
#define FOR_EACH_ENABLED_CHANNEL2(INST, CHAN) \
   for ((CHAN) = 0; (CHAN) < NUM_CHANNELS; (CHAN)++) \
      if (IS_CHANNEL_ENABLED2(INST, CHAN))
122
123
/**
 * Recompute MACH->ExecMask as the intersection of the conditional, loop,
 * continue and function masks.
 *
 * MACH is a pointer expression; it is parenthesised in the expansion so that
 * arguments such as "&machine" bind correctly, and the whole expansion is a
 * single parenthesised expression.
 */
#define UPDATE_EXEC_MASK(MACH) \
   ((MACH)->ExecMask = (MACH)->CondMask & (MACH)->LoopMask & \
                       (MACH)->ContMask & (MACH)->FuncMask)
127
128
129static const union tgsi_exec_channel ZeroVec =
130   { { 0.0, 0.0, 0.0, 0.0 } };
131
132
133#ifdef DEBUG
134static void
135check_inf_or_nan(const union tgsi_exec_channel *chan)
136{
137   assert(!util_is_inf_or_nan(chan->f[0]));
138   assert(!util_is_inf_or_nan(chan->f[1]));
139   assert(!util_is_inf_or_nan(chan->f[2]));
140   assert(!util_is_inf_or_nan(chan->f[3]));
141}
142#endif
143
144
145#ifdef DEBUG
146static void
147print_chan(const char *msg, const union tgsi_exec_channel *chan)
148{
149   debug_printf("%s = {%f, %f, %f, %f}\n",
150                msg, chan->f[0], chan->f[1], chan->f[2], chan->f[3]);
151}
152#endif
153
154
155#ifdef DEBUG
156static void
157print_temp(const struct tgsi_exec_machine *mach, uint index)
158{
159   const struct tgsi_exec_vector *tmp = &mach->Temps[index];
160   int i;
161   debug_printf("Temp[%u] =\n", index);
162   for (i = 0; i < 4; i++) {
163      debug_printf("  %c: { %f, %f, %f, %f }\n",
164                   "XYZW"[i],
165                   tmp->xyzw[i].f[0],
166                   tmp->xyzw[i].f[1],
167                   tmp->xyzw[i].f[2],
168                   tmp->xyzw[i].f[3]);
169   }
170}
171#endif
172
173
174/**
175 * Check if there's a potential src/dst register data dependency when
176 * using SOA execution.
177 * Example:
178 *   MOV T, T.yxwz;
179 * This would expand into:
180 *   MOV t0, t1;
181 *   MOV t1, t0;
182 *   MOV t2, t3;
183 *   MOV t3, t2;
184 * The second instruction will have the wrong value for t0 if executed as-is.
185 */
186boolean
187tgsi_check_soa_dependencies(const struct tgsi_full_instruction *inst)
188{
189   uint i, chan;
190
191   uint writemask = inst->Dst[0].Register.WriteMask;
192   if (writemask == TGSI_WRITEMASK_X ||
193       writemask == TGSI_WRITEMASK_Y ||
194       writemask == TGSI_WRITEMASK_Z ||
195       writemask == TGSI_WRITEMASK_W ||
196       writemask == TGSI_WRITEMASK_NONE) {
197      /* no chance of data dependency */
198      return FALSE;
199   }
200
201   /* loop over src regs */
202   for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
203      if ((inst->Src[i].Register.File ==
204           inst->Dst[0].Register.File) &&
205          (inst->Src[i].Register.Index ==
206           inst->Dst[0].Register.Index)) {
207         /* loop over dest channels */
208         uint channelsWritten = 0x0;
209         FOR_EACH_ENABLED_CHANNEL(*inst, chan) {
210            /* check if we're reading a channel that's been written */
211            uint swizzle = tgsi_util_get_full_src_register_swizzle(&inst->Src[i], chan);
212            if (channelsWritten & (1 << swizzle)) {
213               return TRUE;
214            }
215
216            channelsWritten |= (1 << chan);
217         }
218      }
219   }
220   return FALSE;
221}
222
223
224/**
225 * Initialize machine state by expanding tokens to full instructions,
226 * allocating temporary storage, setting up constants, etc.
227 * After this, we can call tgsi_exec_machine_run() many times.
228 */
229void
230tgsi_exec_machine_bind_shader(
231   struct tgsi_exec_machine *mach,
232   const struct tgsi_token *tokens,
233   uint numSamplers,
234   struct tgsi_sampler **samplers)
235{
236   uint k;
237   struct tgsi_parse_context parse;
238   struct tgsi_exec_labels *labels = &mach->Labels;
239   struct tgsi_full_instruction *instructions;
240   struct tgsi_full_declaration *declarations;
241   uint maxInstructions = 10, numInstructions = 0;
242   uint maxDeclarations = 10, numDeclarations = 0;
243   uint instno = 0;
244
245#if 0
246   tgsi_dump(tokens, 0);
247#endif
248
249   util_init_math();
250
251   mach->Tokens = tokens;
252   mach->Samplers = samplers;
253
254   k = tgsi_parse_init (&parse, mach->Tokens);
255   if (k != TGSI_PARSE_OK) {
256      debug_printf( "Problem parsing!\n" );
257      return;
258   }
259
260   mach->Processor = parse.FullHeader.Processor.Processor;
261   mach->ImmLimit = 0;
262   labels->count = 0;
263
264   declarations = (struct tgsi_full_declaration *)
265      MALLOC( maxDeclarations * sizeof(struct tgsi_full_declaration) );
266
267   if (!declarations) {
268      return;
269   }
270
271   instructions = (struct tgsi_full_instruction *)
272      MALLOC( maxInstructions * sizeof(struct tgsi_full_instruction) );
273
274   if (!instructions) {
275      FREE( declarations );
276      return;
277   }
278
279   while( !tgsi_parse_end_of_tokens( &parse ) ) {
280      uint pointer = parse.Position;
281      uint i;
282
283      tgsi_parse_token( &parse );
284      switch( parse.FullToken.Token.Type ) {
285      case TGSI_TOKEN_TYPE_DECLARATION:
286         /* save expanded declaration */
287         if (numDeclarations == maxDeclarations) {
288            declarations = REALLOC(declarations,
289                                   maxDeclarations
290                                   * sizeof(struct tgsi_full_declaration),
291                                   (maxDeclarations + 10)
292                                   * sizeof(struct tgsi_full_declaration));
293            maxDeclarations += 10;
294         }
295         memcpy(declarations + numDeclarations,
296                &parse.FullToken.FullDeclaration,
297                sizeof(declarations[0]));
298         numDeclarations++;
299         break;
300
301      case TGSI_TOKEN_TYPE_IMMEDIATE:
302         {
303            uint size = parse.FullToken.FullImmediate.Immediate.NrTokens - 1;
304            assert( size <= 4 );
305            assert( mach->ImmLimit + 1 <= TGSI_EXEC_NUM_IMMEDIATES );
306
307            for( i = 0; i < size; i++ ) {
308               mach->Imms[mach->ImmLimit][i] =
309		  parse.FullToken.FullImmediate.u[i].Float;
310            }
311            mach->ImmLimit += 1;
312         }
313         break;
314
315      case TGSI_TOKEN_TYPE_INSTRUCTION:
316         assert( labels->count < MAX_LABELS );
317
318         labels->labels[labels->count][0] = instno;
319         labels->labels[labels->count][1] = pointer;
320         labels->count++;
321
322         /* save expanded instruction */
323         if (numInstructions == maxInstructions) {
324            instructions = REALLOC(instructions,
325                                   maxInstructions
326                                   * sizeof(struct tgsi_full_instruction),
327                                   (maxInstructions + 10)
328                                   * sizeof(struct tgsi_full_instruction));
329            maxInstructions += 10;
330         }
331
332         memcpy(instructions + numInstructions,
333                &parse.FullToken.FullInstruction,
334                sizeof(instructions[0]));
335
336         numInstructions++;
337         break;
338
339      case TGSI_TOKEN_TYPE_PROPERTY:
340         break;
341
342      default:
343         assert( 0 );
344      }
345   }
346   tgsi_parse_free (&parse);
347
348   if (mach->Declarations) {
349      FREE( mach->Declarations );
350   }
351   mach->Declarations = declarations;
352   mach->NumDeclarations = numDeclarations;
353
354   if (mach->Instructions) {
355      FREE( mach->Instructions );
356   }
357   mach->Instructions = instructions;
358   mach->NumInstructions = numInstructions;
359}
360
361
362struct tgsi_exec_machine *
363tgsi_exec_machine_create( void )
364{
365   struct tgsi_exec_machine *mach;
366   uint i;
367
368   mach = align_malloc( sizeof *mach, 16 );
369   if (!mach)
370      goto fail;
371
372   memset(mach, 0, sizeof(*mach));
373
374   mach->Addrs = &mach->Temps[TGSI_EXEC_TEMP_ADDR];
375   mach->Predicates = &mach->Temps[TGSI_EXEC_TEMP_P0];
376
377   /* Setup constants. */
378   for( i = 0; i < 4; i++ ) {
379      mach->Temps[TEMP_0_I].xyzw[TEMP_0_C].u[i] = 0x00000000;
380      mach->Temps[TEMP_7F_I].xyzw[TEMP_7F_C].u[i] = 0x7FFFFFFF;
381      mach->Temps[TEMP_80_I].xyzw[TEMP_80_C].u[i] = 0x80000000;
382      mach->Temps[TEMP_FF_I].xyzw[TEMP_FF_C].u[i] = 0xFFFFFFFF;
383      mach->Temps[TEMP_1_I].xyzw[TEMP_1_C].f[i] = 1.0f;
384      mach->Temps[TEMP_2_I].xyzw[TEMP_2_C].f[i] = 2.0f;
385      mach->Temps[TEMP_128_I].xyzw[TEMP_128_C].f[i] = 128.0f;
386      mach->Temps[TEMP_M128_I].xyzw[TEMP_M128_C].f[i] = -128.0f;
387      mach->Temps[TEMP_3_I].xyzw[TEMP_3_C].f[i] = 3.0f;
388      mach->Temps[TEMP_HALF_I].xyzw[TEMP_HALF_C].f[i] = 0.5f;
389   }
390
391#ifdef DEBUG
392   /* silence warnings */
393   (void) print_chan;
394   (void) print_temp;
395#endif
396
397   return mach;
398
399fail:
400   align_free(mach);
401   return NULL;
402}
403
404
405void
406tgsi_exec_machine_destroy(struct tgsi_exec_machine *mach)
407{
408   if (mach) {
409      FREE(mach->Instructions);
410      FREE(mach->Declarations);
411   }
412
413   align_free(mach);
414}
415
416
417static void
418micro_abs(
419   union tgsi_exec_channel *dst,
420   const union tgsi_exec_channel *src )
421{
422   dst->f[0] = fabsf( src->f[0] );
423   dst->f[1] = fabsf( src->f[1] );
424   dst->f[2] = fabsf( src->f[2] );
425   dst->f[3] = fabsf( src->f[3] );
426}
427
428static void
429micro_add(
430   union tgsi_exec_channel *dst,
431   const union tgsi_exec_channel *src0,
432   const union tgsi_exec_channel *src1 )
433{
434   dst->f[0] = src0->f[0] + src1->f[0];
435   dst->f[1] = src0->f[1] + src1->f[1];
436   dst->f[2] = src0->f[2] + src1->f[2];
437   dst->f[3] = src0->f[3] + src1->f[3];
438}
439
440#if 0
441static void
442micro_iadd(
443   union tgsi_exec_channel *dst,
444   const union tgsi_exec_channel *src0,
445   const union tgsi_exec_channel *src1 )
446{
447   dst->i[0] = src0->i[0] + src1->i[0];
448   dst->i[1] = src0->i[1] + src1->i[1];
449   dst->i[2] = src0->i[2] + src1->i[2];
450   dst->i[3] = src0->i[3] + src1->i[3];
451}
452#endif
453
454static void
455micro_and(
456   union tgsi_exec_channel *dst,
457   const union tgsi_exec_channel *src0,
458   const union tgsi_exec_channel *src1 )
459{
460   dst->u[0] = src0->u[0] & src1->u[0];
461   dst->u[1] = src0->u[1] & src1->u[1];
462   dst->u[2] = src0->u[2] & src1->u[2];
463   dst->u[3] = src0->u[3] & src1->u[3];
464}
465
466static void
467micro_ceil(
468   union tgsi_exec_channel *dst,
469   const union tgsi_exec_channel *src )
470{
471   dst->f[0] = ceilf( src->f[0] );
472   dst->f[1] = ceilf( src->f[1] );
473   dst->f[2] = ceilf( src->f[2] );
474   dst->f[3] = ceilf( src->f[3] );
475}
476
477static void
478micro_cos(
479   union tgsi_exec_channel *dst,
480   const union tgsi_exec_channel *src )
481{
482   dst->f[0] = cosf( src->f[0] );
483   dst->f[1] = cosf( src->f[1] );
484   dst->f[2] = cosf( src->f[2] );
485   dst->f[3] = cosf( src->f[3] );
486}
487
488static void
489micro_ddx(
490   union tgsi_exec_channel *dst,
491   const union tgsi_exec_channel *src )
492{
493   dst->f[0] =
494   dst->f[1] =
495   dst->f[2] =
496   dst->f[3] = src->f[TILE_BOTTOM_RIGHT] - src->f[TILE_BOTTOM_LEFT];
497}
498
499static void
500micro_ddy(
501   union tgsi_exec_channel *dst,
502   const union tgsi_exec_channel *src )
503{
504   dst->f[0] =
505   dst->f[1] =
506   dst->f[2] =
507   dst->f[3] = src->f[TILE_BOTTOM_LEFT] - src->f[TILE_TOP_LEFT];
508}
509
510static void
511micro_div(
512   union tgsi_exec_channel *dst,
513   const union tgsi_exec_channel *src0,
514   const union tgsi_exec_channel *src1 )
515{
516   if (src1->f[0] != 0) {
517      dst->f[0] = src0->f[0] / src1->f[0];
518   }
519   if (src1->f[1] != 0) {
520      dst->f[1] = src0->f[1] / src1->f[1];
521   }
522   if (src1->f[2] != 0) {
523      dst->f[2] = src0->f[2] / src1->f[2];
524   }
525   if (src1->f[3] != 0) {
526      dst->f[3] = src0->f[3] / src1->f[3];
527   }
528}
529
530#if 0
531static void
532micro_udiv(
533   union tgsi_exec_channel *dst,
534   const union tgsi_exec_channel *src0,
535   const union tgsi_exec_channel *src1 )
536{
537   dst->u[0] = src0->u[0] / src1->u[0];
538   dst->u[1] = src0->u[1] / src1->u[1];
539   dst->u[2] = src0->u[2] / src1->u[2];
540   dst->u[3] = src0->u[3] / src1->u[3];
541}
542#endif
543
544static void
545micro_eq(
546   union tgsi_exec_channel *dst,
547   const union tgsi_exec_channel *src0,
548   const union tgsi_exec_channel *src1,
549   const union tgsi_exec_channel *src2,
550   const union tgsi_exec_channel *src3 )
551{
552   dst->f[0] = src0->f[0] == src1->f[0] ? src2->f[0] : src3->f[0];
553   dst->f[1] = src0->f[1] == src1->f[1] ? src2->f[1] : src3->f[1];
554   dst->f[2] = src0->f[2] == src1->f[2] ? src2->f[2] : src3->f[2];
555   dst->f[3] = src0->f[3] == src1->f[3] ? src2->f[3] : src3->f[3];
556}
557
558#if 0
559static void
560micro_ieq(
561   union tgsi_exec_channel *dst,
562   const union tgsi_exec_channel *src0,
563   const union tgsi_exec_channel *src1,
564   const union tgsi_exec_channel *src2,
565   const union tgsi_exec_channel *src3 )
566{
567   dst->i[0] = src0->i[0] == src1->i[0] ? src2->i[0] : src3->i[0];
568   dst->i[1] = src0->i[1] == src1->i[1] ? src2->i[1] : src3->i[1];
569   dst->i[2] = src0->i[2] == src1->i[2] ? src2->i[2] : src3->i[2];
570   dst->i[3] = src0->i[3] == src1->i[3] ? src2->i[3] : src3->i[3];
571}
572#endif
573
574static void
575micro_exp2(
576   union tgsi_exec_channel *dst,
577   const union tgsi_exec_channel *src)
578{
579#if FAST_MATH
580   dst->f[0] = util_fast_exp2( src->f[0] );
581   dst->f[1] = util_fast_exp2( src->f[1] );
582   dst->f[2] = util_fast_exp2( src->f[2] );
583   dst->f[3] = util_fast_exp2( src->f[3] );
584#else
585
586#if DEBUG
587   /* Inf is okay for this instruction, so clamp it to silence assertions. */
588   uint i;
589   union tgsi_exec_channel clamped;
590
591   for (i = 0; i < 4; i++) {
592      if (src->f[i] > 127.99999f) {
593         clamped.f[i] = 127.99999f;
594      } else if (src->f[i] < -126.99999f) {
595         clamped.f[i] = -126.99999f;
596      } else {
597         clamped.f[i] = src->f[i];
598      }
599   }
600   src = &clamped;
601#endif
602
603   dst->f[0] = powf( 2.0f, src->f[0] );
604   dst->f[1] = powf( 2.0f, src->f[1] );
605   dst->f[2] = powf( 2.0f, src->f[2] );
606   dst->f[3] = powf( 2.0f, src->f[3] );
607#endif
608}
609
610#if 0
611static void
612micro_f2ut(
613   union tgsi_exec_channel *dst,
614   const union tgsi_exec_channel *src )
615{
616   dst->u[0] = (uint) src->f[0];
617   dst->u[1] = (uint) src->f[1];
618   dst->u[2] = (uint) src->f[2];
619   dst->u[3] = (uint) src->f[3];
620}
621#endif
622
623static void
624micro_float_clamp(union tgsi_exec_channel *dst,
625                  const union tgsi_exec_channel *src)
626{
627   uint i;
628
629   for (i = 0; i < 4; i++) {
630      if (src->f[i] > 0.0f) {
631         if (src->f[i] > 1.884467e+019f)
632            dst->f[i] = 1.884467e+019f;
633         else if (src->f[i] < 5.42101e-020f)
634            dst->f[i] = 5.42101e-020f;
635         else
636            dst->f[i] = src->f[i];
637      }
638      else {
639         if (src->f[i] < -1.884467e+019f)
640            dst->f[i] = -1.884467e+019f;
641         else if (src->f[i] > -5.42101e-020f)
642            dst->f[i] = -5.42101e-020f;
643         else
644            dst->f[i] = src->f[i];
645      }
646   }
647}
648
649static void
650micro_flr(
651   union tgsi_exec_channel *dst,
652   const union tgsi_exec_channel *src )
653{
654   dst->f[0] = floorf( src->f[0] );
655   dst->f[1] = floorf( src->f[1] );
656   dst->f[2] = floorf( src->f[2] );
657   dst->f[3] = floorf( src->f[3] );
658}
659
660static void
661micro_frc(
662   union tgsi_exec_channel *dst,
663   const union tgsi_exec_channel *src )
664{
665   dst->f[0] = src->f[0] - floorf( src->f[0] );
666   dst->f[1] = src->f[1] - floorf( src->f[1] );
667   dst->f[2] = src->f[2] - floorf( src->f[2] );
668   dst->f[3] = src->f[3] - floorf( src->f[3] );
669}
670
671static void
672micro_i2f(
673   union tgsi_exec_channel *dst,
674   const union tgsi_exec_channel *src )
675{
676   dst->f[0] = (float) src->i[0];
677   dst->f[1] = (float) src->i[1];
678   dst->f[2] = (float) src->i[2];
679   dst->f[3] = (float) src->i[3];
680}
681
682static void
683micro_lg2(
684   union tgsi_exec_channel *dst,
685   const union tgsi_exec_channel *src )
686{
687#if FAST_MATH
688   dst->f[0] = util_fast_log2( src->f[0] );
689   dst->f[1] = util_fast_log2( src->f[1] );
690   dst->f[2] = util_fast_log2( src->f[2] );
691   dst->f[3] = util_fast_log2( src->f[3] );
692#else
693   dst->f[0] = logf( src->f[0] ) * 1.442695f;
694   dst->f[1] = logf( src->f[1] ) * 1.442695f;
695   dst->f[2] = logf( src->f[2] ) * 1.442695f;
696   dst->f[3] = logf( src->f[3] ) * 1.442695f;
697#endif
698}
699
700static void
701micro_le(
702   union tgsi_exec_channel *dst,
703   const union tgsi_exec_channel *src0,
704   const union tgsi_exec_channel *src1,
705   const union tgsi_exec_channel *src2,
706   const union tgsi_exec_channel *src3 )
707{
708   dst->f[0] = src0->f[0] <= src1->f[0] ? src2->f[0] : src3->f[0];
709   dst->f[1] = src0->f[1] <= src1->f[1] ? src2->f[1] : src3->f[1];
710   dst->f[2] = src0->f[2] <= src1->f[2] ? src2->f[2] : src3->f[2];
711   dst->f[3] = src0->f[3] <= src1->f[3] ? src2->f[3] : src3->f[3];
712}
713
714static void
715micro_lt(
716   union tgsi_exec_channel *dst,
717   const union tgsi_exec_channel *src0,
718   const union tgsi_exec_channel *src1,
719   const union tgsi_exec_channel *src2,
720   const union tgsi_exec_channel *src3 )
721{
722   dst->f[0] = src0->f[0] < src1->f[0] ? src2->f[0] : src3->f[0];
723   dst->f[1] = src0->f[1] < src1->f[1] ? src2->f[1] : src3->f[1];
724   dst->f[2] = src0->f[2] < src1->f[2] ? src2->f[2] : src3->f[2];
725   dst->f[3] = src0->f[3] < src1->f[3] ? src2->f[3] : src3->f[3];
726}
727
728#if 0
729static void
730micro_ilt(
731   union tgsi_exec_channel *dst,
732   const union tgsi_exec_channel *src0,
733   const union tgsi_exec_channel *src1,
734   const union tgsi_exec_channel *src2,
735   const union tgsi_exec_channel *src3 )
736{
737   dst->i[0] = src0->i[0] < src1->i[0] ? src2->i[0] : src3->i[0];
738   dst->i[1] = src0->i[1] < src1->i[1] ? src2->i[1] : src3->i[1];
739   dst->i[2] = src0->i[2] < src1->i[2] ? src2->i[2] : src3->i[2];
740   dst->i[3] = src0->i[3] < src1->i[3] ? src2->i[3] : src3->i[3];
741}
742#endif
743
744#if 0
745static void
746micro_ult(
747   union tgsi_exec_channel *dst,
748   const union tgsi_exec_channel *src0,
749   const union tgsi_exec_channel *src1,
750   const union tgsi_exec_channel *src2,
751   const union tgsi_exec_channel *src3 )
752{
753   dst->u[0] = src0->u[0] < src1->u[0] ? src2->u[0] : src3->u[0];
754   dst->u[1] = src0->u[1] < src1->u[1] ? src2->u[1] : src3->u[1];
755   dst->u[2] = src0->u[2] < src1->u[2] ? src2->u[2] : src3->u[2];
756   dst->u[3] = src0->u[3] < src1->u[3] ? src2->u[3] : src3->u[3];
757}
758#endif
759
760static void
761micro_max(
762   union tgsi_exec_channel *dst,
763   const union tgsi_exec_channel *src0,
764   const union tgsi_exec_channel *src1 )
765{
766   dst->f[0] = src0->f[0] > src1->f[0] ? src0->f[0] : src1->f[0];
767   dst->f[1] = src0->f[1] > src1->f[1] ? src0->f[1] : src1->f[1];
768   dst->f[2] = src0->f[2] > src1->f[2] ? src0->f[2] : src1->f[2];
769   dst->f[3] = src0->f[3] > src1->f[3] ? src0->f[3] : src1->f[3];
770}
771
772#if 0
773static void
774micro_imax(
775   union tgsi_exec_channel *dst,
776   const union tgsi_exec_channel *src0,
777   const union tgsi_exec_channel *src1 )
778{
779   dst->i[0] = src0->i[0] > src1->i[0] ? src0->i[0] : src1->i[0];
780   dst->i[1] = src0->i[1] > src1->i[1] ? src0->i[1] : src1->i[1];
781   dst->i[2] = src0->i[2] > src1->i[2] ? src0->i[2] : src1->i[2];
782   dst->i[3] = src0->i[3] > src1->i[3] ? src0->i[3] : src1->i[3];
783}
784#endif
785
786#if 0
787static void
788micro_umax(
789   union tgsi_exec_channel *dst,
790   const union tgsi_exec_channel *src0,
791   const union tgsi_exec_channel *src1 )
792{
793   dst->u[0] = src0->u[0] > src1->u[0] ? src0->u[0] : src1->u[0];
794   dst->u[1] = src0->u[1] > src1->u[1] ? src0->u[1] : src1->u[1];
795   dst->u[2] = src0->u[2] > src1->u[2] ? src0->u[2] : src1->u[2];
796   dst->u[3] = src0->u[3] > src1->u[3] ? src0->u[3] : src1->u[3];
797}
798#endif
799
800static void
801micro_min(
802   union tgsi_exec_channel *dst,
803   const union tgsi_exec_channel *src0,
804   const union tgsi_exec_channel *src1 )
805{
806   dst->f[0] = src0->f[0] < src1->f[0] ? src0->f[0] : src1->f[0];
807   dst->f[1] = src0->f[1] < src1->f[1] ? src0->f[1] : src1->f[1];
808   dst->f[2] = src0->f[2] < src1->f[2] ? src0->f[2] : src1->f[2];
809   dst->f[3] = src0->f[3] < src1->f[3] ? src0->f[3] : src1->f[3];
810}
811
812#if 0
813static void
814micro_imin(
815   union tgsi_exec_channel *dst,
816   const union tgsi_exec_channel *src0,
817   const union tgsi_exec_channel *src1 )
818{
819   dst->i[0] = src0->i[0] < src1->i[0] ? src0->i[0] : src1->i[0];
820   dst->i[1] = src0->i[1] < src1->i[1] ? src0->i[1] : src1->i[1];
821   dst->i[2] = src0->i[2] < src1->i[2] ? src0->i[2] : src1->i[2];
822   dst->i[3] = src0->i[3] < src1->i[3] ? src0->i[3] : src1->i[3];
823}
824#endif
825
826#if 0
827static void
828micro_umin(
829   union tgsi_exec_channel *dst,
830   const union tgsi_exec_channel *src0,
831   const union tgsi_exec_channel *src1 )
832{
833   dst->u[0] = src0->u[0] < src1->u[0] ? src0->u[0] : src1->u[0];
834   dst->u[1] = src0->u[1] < src1->u[1] ? src0->u[1] : src1->u[1];
835   dst->u[2] = src0->u[2] < src1->u[2] ? src0->u[2] : src1->u[2];
836   dst->u[3] = src0->u[3] < src1->u[3] ? src0->u[3] : src1->u[3];
837}
838#endif
839
840#if 0
841static void
842micro_umod(
843   union tgsi_exec_channel *dst,
844   const union tgsi_exec_channel *src0,
845   const union tgsi_exec_channel *src1 )
846{
847   dst->u[0] = src0->u[0] % src1->u[0];
848   dst->u[1] = src0->u[1] % src1->u[1];
849   dst->u[2] = src0->u[2] % src1->u[2];
850   dst->u[3] = src0->u[3] % src1->u[3];
851}
852#endif
853
854static void
855micro_mul(
856   union tgsi_exec_channel *dst,
857   const union tgsi_exec_channel *src0,
858   const union tgsi_exec_channel *src1 )
859{
860   dst->f[0] = src0->f[0] * src1->f[0];
861   dst->f[1] = src0->f[1] * src1->f[1];
862   dst->f[2] = src0->f[2] * src1->f[2];
863   dst->f[3] = src0->f[3] * src1->f[3];
864}
865
866#if 0
867static void
868micro_imul(
869   union tgsi_exec_channel *dst,
870   const union tgsi_exec_channel *src0,
871   const union tgsi_exec_channel *src1 )
872{
873   dst->i[0] = src0->i[0] * src1->i[0];
874   dst->i[1] = src0->i[1] * src1->i[1];
875   dst->i[2] = src0->i[2] * src1->i[2];
876   dst->i[3] = src0->i[3] * src1->i[3];
877}
878#endif
879
880#if 0
881static void
882micro_imul64(
883   union tgsi_exec_channel *dst0,
884   union tgsi_exec_channel *dst1,
885   const union tgsi_exec_channel *src0,
886   const union tgsi_exec_channel *src1 )
887{
888   dst1->i[0] = src0->i[0] * src1->i[0];
889   dst1->i[1] = src0->i[1] * src1->i[1];
890   dst1->i[2] = src0->i[2] * src1->i[2];
891   dst1->i[3] = src0->i[3] * src1->i[3];
892   dst0->i[0] = 0;
893   dst0->i[1] = 0;
894   dst0->i[2] = 0;
895   dst0->i[3] = 0;
896}
897#endif
898
899#if 0
900static void
901micro_umul64(
902   union tgsi_exec_channel *dst0,
903   union tgsi_exec_channel *dst1,
904   const union tgsi_exec_channel *src0,
905   const union tgsi_exec_channel *src1 )
906{
907   dst1->u[0] = src0->u[0] * src1->u[0];
908   dst1->u[1] = src0->u[1] * src1->u[1];
909   dst1->u[2] = src0->u[2] * src1->u[2];
910   dst1->u[3] = src0->u[3] * src1->u[3];
911   dst0->u[0] = 0;
912   dst0->u[1] = 0;
913   dst0->u[2] = 0;
914   dst0->u[3] = 0;
915}
916#endif
917
918
919#if 0
920static void
921micro_movc(
922   union tgsi_exec_channel *dst,
923   const union tgsi_exec_channel *src0,
924   const union tgsi_exec_channel *src1,
925   const union tgsi_exec_channel *src2 )
926{
927   dst->u[0] = src0->u[0] ? src1->u[0] : src2->u[0];
928   dst->u[1] = src0->u[1] ? src1->u[1] : src2->u[1];
929   dst->u[2] = src0->u[2] ? src1->u[2] : src2->u[2];
930   dst->u[3] = src0->u[3] ? src1->u[3] : src2->u[3];
931}
932#endif
933
934static void
935micro_neg(
936   union tgsi_exec_channel *dst,
937   const union tgsi_exec_channel *src )
938{
939   dst->f[0] = -src->f[0];
940   dst->f[1] = -src->f[1];
941   dst->f[2] = -src->f[2];
942   dst->f[3] = -src->f[3];
943}
944
945#if 0
946static void
947micro_ineg(
948   union tgsi_exec_channel *dst,
949   const union tgsi_exec_channel *src )
950{
951   dst->i[0] = -src->i[0];
952   dst->i[1] = -src->i[1];
953   dst->i[2] = -src->i[2];
954   dst->i[3] = -src->i[3];
955}
956#endif
957
958static void
959micro_not(
960   union tgsi_exec_channel *dst,
961   const union tgsi_exec_channel *src )
962{
963   dst->u[0] = ~src->u[0];
964   dst->u[1] = ~src->u[1];
965   dst->u[2] = ~src->u[2];
966   dst->u[3] = ~src->u[3];
967}
968
969static void
970micro_or(
971   union tgsi_exec_channel *dst,
972   const union tgsi_exec_channel *src0,
973   const union tgsi_exec_channel *src1 )
974{
975   dst->u[0] = src0->u[0] | src1->u[0];
976   dst->u[1] = src0->u[1] | src1->u[1];
977   dst->u[2] = src0->u[2] | src1->u[2];
978   dst->u[3] = src0->u[3] | src1->u[3];
979}
980
981static void
982micro_pow(
983   union tgsi_exec_channel *dst,
984   const union tgsi_exec_channel *src0,
985   const union tgsi_exec_channel *src1 )
986{
987#if FAST_MATH
988   dst->f[0] = util_fast_pow( src0->f[0], src1->f[0] );
989   dst->f[1] = util_fast_pow( src0->f[1], src1->f[1] );
990   dst->f[2] = util_fast_pow( src0->f[2], src1->f[2] );
991   dst->f[3] = util_fast_pow( src0->f[3], src1->f[3] );
992#else
993   dst->f[0] = powf( src0->f[0], src1->f[0] );
994   dst->f[1] = powf( src0->f[1], src1->f[1] );
995   dst->f[2] = powf( src0->f[2], src1->f[2] );
996   dst->f[3] = powf( src0->f[3], src1->f[3] );
997#endif
998}
999
1000static void
1001micro_rnd(
1002   union tgsi_exec_channel *dst,
1003   const union tgsi_exec_channel *src )
1004{
1005   dst->f[0] = floorf( src->f[0] + 0.5f );
1006   dst->f[1] = floorf( src->f[1] + 0.5f );
1007   dst->f[2] = floorf( src->f[2] + 0.5f );
1008   dst->f[3] = floorf( src->f[3] + 0.5f );
1009}
1010
1011static void
1012micro_sgn(
1013   union tgsi_exec_channel *dst,
1014   const union tgsi_exec_channel *src )
1015{
1016   dst->f[0] = src->f[0] < 0.0f ? -1.0f : src->f[0] > 0.0f ? 1.0f : 0.0f;
1017   dst->f[1] = src->f[1] < 0.0f ? -1.0f : src->f[1] > 0.0f ? 1.0f : 0.0f;
1018   dst->f[2] = src->f[2] < 0.0f ? -1.0f : src->f[2] > 0.0f ? 1.0f : 0.0f;
1019   dst->f[3] = src->f[3] < 0.0f ? -1.0f : src->f[3] > 0.0f ? 1.0f : 0.0f;
1020}
1021
1022static void
1023micro_shl(
1024   union tgsi_exec_channel *dst,
1025   const union tgsi_exec_channel *src0,
1026   const union tgsi_exec_channel *src1 )
1027{
1028   dst->i[0] = src0->i[0] << src1->i[0];
1029   dst->i[1] = src0->i[1] << src1->i[1];
1030   dst->i[2] = src0->i[2] << src1->i[2];
1031   dst->i[3] = src0->i[3] << src1->i[3];
1032}
1033
1034static void
1035micro_ishr(
1036   union tgsi_exec_channel *dst,
1037   const union tgsi_exec_channel *src0,
1038   const union tgsi_exec_channel *src1 )
1039{
1040   dst->i[0] = src0->i[0] >> src1->i[0];
1041   dst->i[1] = src0->i[1] >> src1->i[1];
1042   dst->i[2] = src0->i[2] >> src1->i[2];
1043   dst->i[3] = src0->i[3] >> src1->i[3];
1044}
1045
1046static void
1047micro_trunc(
1048   union tgsi_exec_channel *dst,
1049   const union tgsi_exec_channel *src0 )
1050{
1051   dst->f[0] = (float) (int) src0->f[0];
1052   dst->f[1] = (float) (int) src0->f[1];
1053   dst->f[2] = (float) (int) src0->f[2];
1054   dst->f[3] = (float) (int) src0->f[3];
1055}
1056
#if 0
/**
 * Per-lane unsigned right shift: dst = src0 >> src1 for each of the four
 * lanes.  Currently compiled out.
 *
 * The shift count is reduced to its low five bits because shifting by a
 * count not smaller than the operand width is undefined behaviour in C.
 *
 * NOTE: the 0x1f mask assumes 32-bit lanes.
 */
static void
micro_ushr(
   union tgsi_exec_channel *dst,
   const union tgsi_exec_channel *src0,
   const union tgsi_exec_channel *src1 )
{
   dst->u[0] = src0->u[0] >> (src1->u[0] & 0x1f);
   dst->u[1] = src0->u[1] >> (src1->u[1] & 0x1f);
   dst->u[2] = src0->u[2] >> (src1->u[2] & 0x1f);
   dst->u[3] = src0->u[3] >> (src1->u[3] & 0x1f);
}
#endif
1070
1071static void
1072micro_sin(
1073   union tgsi_exec_channel *dst,
1074   const union tgsi_exec_channel *src )
1075{
1076   dst->f[0] = sinf( src->f[0] );
1077   dst->f[1] = sinf( src->f[1] );
1078   dst->f[2] = sinf( src->f[2] );
1079   dst->f[3] = sinf( src->f[3] );
1080}
1081
1082static void
1083micro_sqrt( union tgsi_exec_channel *dst,
1084            const union tgsi_exec_channel *src )
1085{
1086   dst->f[0] = sqrtf( src->f[0] );
1087   dst->f[1] = sqrtf( src->f[1] );
1088   dst->f[2] = sqrtf( src->f[2] );
1089   dst->f[3] = sqrtf( src->f[3] );
1090}
1091
1092static void
1093micro_sub(
1094   union tgsi_exec_channel *dst,
1095   const union tgsi_exec_channel *src0,
1096   const union tgsi_exec_channel *src1 )
1097{
1098   dst->f[0] = src0->f[0] - src1->f[0];
1099   dst->f[1] = src0->f[1] - src1->f[1];
1100   dst->f[2] = src0->f[2] - src1->f[2];
1101   dst->f[3] = src0->f[3] - src1->f[3];
1102}
1103
#if 0
/**
 * Per-lane conversion of the unsigned integer view of src to float, for
 * each of the four lanes.  Currently compiled out.
 */
static void
micro_u2f(
   union tgsi_exec_channel *dst,
   const union tgsi_exec_channel *src )
{
   unsigned lane;

   for (lane = 0; lane < 4; lane++) {
      dst->f[lane] = (float) src->u[lane];
   }
}
#endif
1116
1117static void
1118micro_xor(
1119   union tgsi_exec_channel *dst,
1120   const union tgsi_exec_channel *src0,
1121   const union tgsi_exec_channel *src1 )
1122{
1123   dst->u[0] = src0->u[0] ^ src1->u[0];
1124   dst->u[1] = src0->u[1] ^ src1->u[1];
1125   dst->u[2] = src0->u[2] ^ src1->u[2];
1126   dst->u[3] = src0->u[3] ^ src1->u[3];
1127}
1128
1129static void
1130fetch_src_file_channel(
1131   const struct tgsi_exec_machine *mach,
1132   const uint file,
1133   const uint swizzle,
1134   const union tgsi_exec_channel *index,
1135   union tgsi_exec_channel *chan )
1136{
1137   switch( swizzle ) {
1138   case TGSI_SWIZZLE_X:
1139   case TGSI_SWIZZLE_Y:
1140   case TGSI_SWIZZLE_Z:
1141   case TGSI_SWIZZLE_W:
1142      switch( file ) {
1143      case TGSI_FILE_CONSTANT:
1144         assert(mach->Consts);
1145         if (index->i[0] < 0)
1146            chan->f[0] = 0.0f;
1147         else
1148            chan->f[0] = mach->Consts[index->i[0]][swizzle];
1149         if (index->i[1] < 0)
1150            chan->f[1] = 0.0f;
1151         else
1152            chan->f[1] = mach->Consts[index->i[1]][swizzle];
1153         if (index->i[2] < 0)
1154            chan->f[2] = 0.0f;
1155         else
1156            chan->f[2] = mach->Consts[index->i[2]][swizzle];
1157         if (index->i[3] < 0)
1158            chan->f[3] = 0.0f;
1159         else
1160            chan->f[3] = mach->Consts[index->i[3]][swizzle];
1161         break;
1162
1163      case TGSI_FILE_INPUT:
1164      case TGSI_FILE_SYSTEM_VALUE:
1165         chan->u[0] = mach->Inputs[index->i[0]].xyzw[swizzle].u[0];
1166         chan->u[1] = mach->Inputs[index->i[1]].xyzw[swizzle].u[1];
1167         chan->u[2] = mach->Inputs[index->i[2]].xyzw[swizzle].u[2];
1168         chan->u[3] = mach->Inputs[index->i[3]].xyzw[swizzle].u[3];
1169         break;
1170
1171      case TGSI_FILE_TEMPORARY:
1172         assert(index->i[0] < TGSI_EXEC_NUM_TEMPS);
1173         chan->u[0] = mach->Temps[index->i[0]].xyzw[swizzle].u[0];
1174         chan->u[1] = mach->Temps[index->i[1]].xyzw[swizzle].u[1];
1175         chan->u[2] = mach->Temps[index->i[2]].xyzw[swizzle].u[2];
1176         chan->u[3] = mach->Temps[index->i[3]].xyzw[swizzle].u[3];
1177         break;
1178
1179      case TGSI_FILE_IMMEDIATE:
1180         assert( index->i[0] < (int) mach->ImmLimit );
1181         chan->f[0] = mach->Imms[index->i[0]][swizzle];
1182         assert( index->i[1] < (int) mach->ImmLimit );
1183         chan->f[1] = mach->Imms[index->i[1]][swizzle];
1184         assert( index->i[2] < (int) mach->ImmLimit );
1185         chan->f[2] = mach->Imms[index->i[2]][swizzle];
1186         assert( index->i[3] < (int) mach->ImmLimit );
1187         chan->f[3] = mach->Imms[index->i[3]][swizzle];
1188         break;
1189
1190      case TGSI_FILE_ADDRESS:
1191         chan->u[0] = mach->Addrs[index->i[0]].xyzw[swizzle].u[0];
1192         chan->u[1] = mach->Addrs[index->i[1]].xyzw[swizzle].u[1];
1193         chan->u[2] = mach->Addrs[index->i[2]].xyzw[swizzle].u[2];
1194         chan->u[3] = mach->Addrs[index->i[3]].xyzw[swizzle].u[3];
1195         break;
1196
1197      case TGSI_FILE_PREDICATE:
1198         assert(index->i[0] < TGSI_EXEC_NUM_PREDS);
1199         assert(index->i[1] < TGSI_EXEC_NUM_PREDS);
1200         assert(index->i[2] < TGSI_EXEC_NUM_PREDS);
1201         assert(index->i[3] < TGSI_EXEC_NUM_PREDS);
1202         chan->u[0] = mach->Predicates[0].xyzw[swizzle].u[0];
1203         chan->u[1] = mach->Predicates[0].xyzw[swizzle].u[1];
1204         chan->u[2] = mach->Predicates[0].xyzw[swizzle].u[2];
1205         chan->u[3] = mach->Predicates[0].xyzw[swizzle].u[3];
1206         break;
1207
1208      case TGSI_FILE_OUTPUT:
1209         /* vertex/fragment output vars can be read too */
1210         chan->u[0] = mach->Outputs[index->i[0]].xyzw[swizzle].u[0];
1211         chan->u[1] = mach->Outputs[index->i[1]].xyzw[swizzle].u[1];
1212         chan->u[2] = mach->Outputs[index->i[2]].xyzw[swizzle].u[2];
1213         chan->u[3] = mach->Outputs[index->i[3]].xyzw[swizzle].u[3];
1214         break;
1215
1216      default:
1217         assert( 0 );
1218      }
1219      break;
1220
1221   default:
1222      assert( 0 );
1223   }
1224}
1225
1226static void
1227fetch_source(
1228   const struct tgsi_exec_machine *mach,
1229   union tgsi_exec_channel *chan,
1230   const struct tgsi_full_src_register *reg,
1231   const uint chan_index )
1232{
1233   union tgsi_exec_channel index;
1234   uint swizzle;
1235
1236   /* We start with a direct index into a register file.
1237    *
1238    *    file[1],
1239    *    where:
1240    *       file = Register.File
1241    *       [1] = Register.Index
1242    */
1243   index.i[0] =
1244   index.i[1] =
1245   index.i[2] =
1246   index.i[3] = reg->Register.Index;
1247
1248   /* There is an extra source register that indirectly subscripts
1249    * a register file. The direct index now becomes an offset
1250    * that is being added to the indirect register.
1251    *
1252    *    file[ind[2].x+1],
1253    *    where:
1254    *       ind = Indirect.File
1255    *       [2] = Indirect.Index
1256    *       .x = Indirect.SwizzleX
1257    */
1258   if (reg->Register.Indirect) {
1259      union tgsi_exec_channel index2;
1260      union tgsi_exec_channel indir_index;
1261      const uint execmask = mach->ExecMask;
1262      uint i;
1263
1264      /* which address register (always zero now) */
1265      index2.i[0] =
1266      index2.i[1] =
1267      index2.i[2] =
1268      index2.i[3] = reg->Indirect.Index;
1269
1270      /* get current value of address register[swizzle] */
1271      swizzle = tgsi_util_get_src_register_swizzle( &reg->Indirect, CHAN_X );
1272      fetch_src_file_channel(
1273         mach,
1274         reg->Indirect.File,
1275         swizzle,
1276         &index2,
1277         &indir_index );
1278
1279      /* add value of address register to the offset */
1280      index.i[0] += (int) indir_index.f[0];
1281      index.i[1] += (int) indir_index.f[1];
1282      index.i[2] += (int) indir_index.f[2];
1283      index.i[3] += (int) indir_index.f[3];
1284
1285      /* for disabled execution channels, zero-out the index to
1286       * avoid using a potential garbage value.
1287       */
1288      for (i = 0; i < QUAD_SIZE; i++) {
1289         if ((execmask & (1 << i)) == 0)
1290            index.i[i] = 0;
1291      }
1292   }
1293
1294   /* There is an extra source register that is a second
1295    * subscript to a register file. Effectively it means that
1296    * the register file is actually a 2D array of registers.
1297    *
1298    *    file[1][3] == file[1*sizeof(file[1])+3],
1299    *    where:
1300    *       [3] = Dimension.Index
1301    */
1302   if (reg->Register.Dimension) {
1303      /* The size of the first-order array depends on the register file type.
1304       * We need to multiply the index to the first array to get an effective,
1305       * "flat" index that points to the beginning of the second-order array.
1306       */
1307      switch (reg->Register.File) {
1308      case TGSI_FILE_INPUT:
1309      case TGSI_FILE_SYSTEM_VALUE:
1310         index.i[0] *= TGSI_EXEC_MAX_INPUT_ATTRIBS;
1311         index.i[1] *= TGSI_EXEC_MAX_INPUT_ATTRIBS;
1312         index.i[2] *= TGSI_EXEC_MAX_INPUT_ATTRIBS;
1313         index.i[3] *= TGSI_EXEC_MAX_INPUT_ATTRIBS;
1314         break;
1315      case TGSI_FILE_CONSTANT:
1316         index.i[0] *= TGSI_EXEC_MAX_CONST_BUFFER;
1317         index.i[1] *= TGSI_EXEC_MAX_CONST_BUFFER;
1318         index.i[2] *= TGSI_EXEC_MAX_CONST_BUFFER;
1319         index.i[3] *= TGSI_EXEC_MAX_CONST_BUFFER;
1320         break;
1321      default:
1322         assert( 0 );
1323      }
1324
1325      index.i[0] += reg->Dimension.Index;
1326      index.i[1] += reg->Dimension.Index;
1327      index.i[2] += reg->Dimension.Index;
1328      index.i[3] += reg->Dimension.Index;
1329
1330      /* Again, the second subscript index can be addressed indirectly
1331       * identically to the first one.
1332       * Nothing stops us from indirectly addressing the indirect register,
1333       * but there is no need for that, so we won't exercise it.
1334       *
1335       *    file[1][ind[4].y+3],
1336       *    where:
1337       *       ind = DimIndirect.File
1338       *       [4] = DimIndirect.Index
1339       *       .y = DimIndirect.SwizzleX
1340       */
1341      if (reg->Dimension.Indirect) {
1342         union tgsi_exec_channel index2;
1343         union tgsi_exec_channel indir_index;
1344         const uint execmask = mach->ExecMask;
1345         uint i;
1346
1347         index2.i[0] =
1348         index2.i[1] =
1349         index2.i[2] =
1350         index2.i[3] = reg->DimIndirect.Index;
1351
1352         swizzle = tgsi_util_get_src_register_swizzle( &reg->DimIndirect, CHAN_X );
1353         fetch_src_file_channel(
1354            mach,
1355            reg->DimIndirect.File,
1356            swizzle,
1357            &index2,
1358            &indir_index );
1359
1360         index.i[0] += (int) indir_index.f[0];
1361         index.i[1] += (int) indir_index.f[1];
1362         index.i[2] += (int) indir_index.f[2];
1363         index.i[3] += (int) indir_index.f[3];
1364
1365         /* for disabled execution channels, zero-out the index to
1366          * avoid using a potential garbage value.
1367          */
1368         for (i = 0; i < QUAD_SIZE; i++) {
1369            if ((execmask & (1 << i)) == 0)
1370               index.i[i] = 0;
1371         }
1372      }
1373
1374      /* If by any chance there was a need for a 3D array of register
1375       * files, we would have to check whether Dimension is followed
1376       * by a dimension register and continue the saga.
1377       */
1378   }
1379
1380   swizzle = tgsi_util_get_full_src_register_swizzle( reg, chan_index );
1381   fetch_src_file_channel(
1382      mach,
1383      reg->Register.File,
1384      swizzle,
1385      &index,
1386      chan );
1387
1388   switch (tgsi_util_get_full_src_register_sign_mode( reg, chan_index )) {
1389   case TGSI_UTIL_SIGN_CLEAR:
1390      micro_abs( chan, chan );
1391      break;
1392
1393   case TGSI_UTIL_SIGN_SET:
1394      micro_abs( chan, chan );
1395      micro_neg( chan, chan );
1396      break;
1397
1398   case TGSI_UTIL_SIGN_TOGGLE:
1399      micro_neg( chan, chan );
1400      break;
1401
1402   case TGSI_UTIL_SIGN_KEEP:
1403      break;
1404   }
1405}
1406
1407static void
1408store_dest(
1409   struct tgsi_exec_machine *mach,
1410   const union tgsi_exec_channel *chan,
1411   const struct tgsi_full_dst_register *reg,
1412   const struct tgsi_full_instruction *inst,
1413   uint chan_index )
1414{
1415   uint i;
1416   union tgsi_exec_channel null;
1417   union tgsi_exec_channel *dst;
1418   uint execmask = mach->ExecMask;
1419   int offset = 0;  /* indirection offset */
1420   int index;
1421
1422#ifdef DEBUG
1423   check_inf_or_nan(chan);
1424#endif
1425
1426   /* There is an extra source register that indirectly subscripts
1427    * a register file. The direct index now becomes an offset
1428    * that is being added to the indirect register.
1429    *
1430    *    file[ind[2].x+1],
1431    *    where:
1432    *       ind = Indirect.File
1433    *       [2] = Indirect.Index
1434    *       .x = Indirect.SwizzleX
1435    */
1436   if (reg->Register.Indirect) {
1437      union tgsi_exec_channel index;
1438      union tgsi_exec_channel indir_index;
1439      uint swizzle;
1440
1441      /* which address register (always zero for now) */
1442      index.i[0] =
1443      index.i[1] =
1444      index.i[2] =
1445      index.i[3] = reg->Indirect.Index;
1446
1447      /* get current value of address register[swizzle] */
1448      swizzle = tgsi_util_get_src_register_swizzle( &reg->Indirect, CHAN_X );
1449
1450      /* fetch values from the address/indirection register */
1451      fetch_src_file_channel(
1452         mach,
1453         reg->Indirect.File,
1454         swizzle,
1455         &index,
1456         &indir_index );
1457
1458      /* save indirection offset */
1459      offset = (int) indir_index.f[0];
1460   }
1461
1462   switch (reg->Register.File) {
1463   case TGSI_FILE_NULL:
1464      dst = &null;
1465      break;
1466
1467   case TGSI_FILE_OUTPUT:
1468      index = mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0]
1469         + reg->Register.Index;
1470      dst = &mach->Outputs[offset + index].xyzw[chan_index];
1471      break;
1472
1473   case TGSI_FILE_TEMPORARY:
1474      index = reg->Register.Index;
1475      assert( index < TGSI_EXEC_NUM_TEMPS );
1476      dst = &mach->Temps[offset + index].xyzw[chan_index];
1477      break;
1478
1479   case TGSI_FILE_ADDRESS:
1480      index = reg->Register.Index;
1481      dst = &mach->Addrs[index].xyzw[chan_index];
1482      break;
1483
1484   case TGSI_FILE_LOOP:
1485      assert(reg->Register.Index == 0);
1486      assert(mach->LoopCounterStackTop > 0);
1487      assert(chan_index == CHAN_X);
1488      dst = &mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[chan_index];
1489      break;
1490
1491   case TGSI_FILE_PREDICATE:
1492      index = reg->Register.Index;
1493      assert(index < TGSI_EXEC_NUM_PREDS);
1494      dst = &mach->Predicates[index].xyzw[chan_index];
1495      break;
1496
1497   default:
1498      assert( 0 );
1499      return;
1500   }
1501
1502   if (inst->Instruction.Predicate) {
1503      uint swizzle;
1504      union tgsi_exec_channel *pred;
1505
1506      switch (chan_index) {
1507      case CHAN_X:
1508         swizzle = inst->Predicate.SwizzleX;
1509         break;
1510      case CHAN_Y:
1511         swizzle = inst->Predicate.SwizzleY;
1512         break;
1513      case CHAN_Z:
1514         swizzle = inst->Predicate.SwizzleZ;
1515         break;
1516      case CHAN_W:
1517         swizzle = inst->Predicate.SwizzleW;
1518         break;
1519      default:
1520         assert(0);
1521         return;
1522      }
1523
1524      assert(inst->Predicate.Index == 0);
1525
1526      pred = &mach->Predicates[inst->Predicate.Index].xyzw[swizzle];
1527
1528      if (inst->Predicate.Negate) {
1529         for (i = 0; i < QUAD_SIZE; i++) {
1530            if (pred->u[i]) {
1531               execmask &= ~(1 << i);
1532            }
1533         }
1534      } else {
1535         for (i = 0; i < QUAD_SIZE; i++) {
1536            if (!pred->u[i]) {
1537               execmask &= ~(1 << i);
1538            }
1539         }
1540      }
1541   }
1542
1543   switch (inst->Instruction.Saturate) {
1544   case TGSI_SAT_NONE:
1545      for (i = 0; i < QUAD_SIZE; i++)
1546         if (execmask & (1 << i))
1547            dst->i[i] = chan->i[i];
1548      break;
1549
1550   case TGSI_SAT_ZERO_ONE:
1551      for (i = 0; i < QUAD_SIZE; i++)
1552         if (execmask & (1 << i)) {
1553            if (chan->f[i] < 0.0f)
1554               dst->f[i] = 0.0f;
1555            else if (chan->f[i] > 1.0f)
1556               dst->f[i] = 1.0f;
1557            else
1558               dst->i[i] = chan->i[i];
1559         }
1560      break;
1561
1562   case TGSI_SAT_MINUS_PLUS_ONE:
1563      for (i = 0; i < QUAD_SIZE; i++)
1564         if (execmask & (1 << i)) {
1565            if (chan->f[i] < -1.0f)
1566               dst->f[i] = -1.0f;
1567            else if (chan->f[i] > 1.0f)
1568               dst->f[i] = 1.0f;
1569            else
1570               dst->i[i] = chan->i[i];
1571         }
1572      break;
1573
1574   default:
1575      assert( 0 );
1576   }
1577}
1578
/* Fetch channel CHAN of source operand INDEX of the current instruction
 * into VAL.  Relies on 'mach' and 'inst' being in scope at the point of
 * use.
 */
#define FETCH(VAL,INDEX,CHAN)\
    fetch_source (mach, (VAL), &inst->Src[(INDEX)], (CHAN))

/* Store VAL to channel CHAN of destination operand INDEX of the current
 * instruction.  Relies on 'mach' and 'inst' being in scope at the point
 * of use.
 */
#define STORE(VAL,INDEX,CHAN)\
    store_dest (mach, (VAL), &inst->Dst[(INDEX)], inst, (CHAN))
1584
1585
1586/**
1587 * Execute ARB-style KIL which is predicated by a src register.
1588 * Kill fragment if any of the four values is less than zero.
1589 */
1590static void
1591exec_kil(struct tgsi_exec_machine *mach,
1592         const struct tgsi_full_instruction *inst)
1593{
1594   uint uniquemask;
1595   uint chan_index;
1596   uint kilmask = 0; /* bit 0 = pixel 0, bit 1 = pixel 1, etc */
1597   union tgsi_exec_channel r[1];
1598
1599   /* This mask stores component bits that were already tested. */
1600   uniquemask = 0;
1601
1602   for (chan_index = 0; chan_index < 4; chan_index++)
1603   {
1604      uint swizzle;
1605      uint i;
1606
1607      /* unswizzle channel */
1608      swizzle = tgsi_util_get_full_src_register_swizzle (
1609                        &inst->Src[0],
1610                        chan_index);
1611
1612      /* check if the component has not been already tested */
1613      if (uniquemask & (1 << swizzle))
1614         continue;
1615      uniquemask |= 1 << swizzle;
1616
1617      FETCH(&r[0], 0, chan_index);
1618      for (i = 0; i < 4; i++)
1619         if (r[0].f[i] < 0.0f)
1620            kilmask |= 1 << i;
1621   }
1622
1623   mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] |= kilmask;
1624}
1625
1626/**
1627 * Execute NVIDIA-style KIL which is predicated by a condition code.
1628 * Kill fragment if the condition code is TRUE.
1629 */
1630static void
1631exec_kilp(struct tgsi_exec_machine *mach,
1632          const struct tgsi_full_instruction *inst)
1633{
1634   uint kilmask; /* bit 0 = pixel 0, bit 1 = pixel 1, etc */
1635
1636   /* "unconditional" kil */
1637   kilmask = mach->ExecMask;
1638   mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] |= kilmask;
1639}
1640
1641
1642/*
1643 * Fetch a four texture samples using STR texture coordinates.
1644 */
1645static void
1646fetch_texel( struct tgsi_sampler *sampler,
1647             const union tgsi_exec_channel *s,
1648             const union tgsi_exec_channel *t,
1649             const union tgsi_exec_channel *p,
1650             float lodbias,  /* XXX should be float[4] */
1651             union tgsi_exec_channel *r,
1652             union tgsi_exec_channel *g,
1653             union tgsi_exec_channel *b,
1654             union tgsi_exec_channel *a )
1655{
1656   uint j;
1657   float rgba[NUM_CHANNELS][QUAD_SIZE];
1658
1659   sampler->get_samples(sampler, s->f, t->f, p->f, lodbias, rgba);
1660
1661   for (j = 0; j < 4; j++) {
1662      r->f[j] = rgba[0][j];
1663      g->f[j] = rgba[1][j];
1664      b->f[j] = rgba[2][j];
1665      a->f[j] = rgba[3][j];
1666   }
1667}
1668
1669
1670static void
1671exec_tex(struct tgsi_exec_machine *mach,
1672         const struct tgsi_full_instruction *inst,
1673         boolean biasLod,
1674         boolean projected)
1675{
1676   const uint unit = inst->Src[1].Register.Index;
1677   union tgsi_exec_channel r[4];
1678   uint chan_index;
1679   float lodBias;
1680
1681   /*   debug_printf("Sampler %u unit %u\n", sampler, unit); */
1682
1683   switch (inst->Texture.Texture) {
1684   case TGSI_TEXTURE_1D:
1685   case TGSI_TEXTURE_SHADOW1D:
1686
1687      FETCH(&r[0], 0, CHAN_X);
1688
1689      if (projected) {
1690         FETCH(&r[1], 0, CHAN_W);
1691         micro_div( &r[0], &r[0], &r[1] );
1692      }
1693
1694      if (biasLod) {
1695         FETCH(&r[1], 0, CHAN_W);
1696         lodBias = r[2].f[0];
1697      }
1698      else
1699         lodBias = 0.0;
1700
1701      fetch_texel(mach->Samplers[unit],
1702                  &r[0], &ZeroVec, &ZeroVec, lodBias,  /* S, T, P, BIAS */
1703                  &r[0], &r[1], &r[2], &r[3]); /* R, G, B, A */
1704      break;
1705
1706   case TGSI_TEXTURE_2D:
1707   case TGSI_TEXTURE_RECT:
1708   case TGSI_TEXTURE_SHADOW2D:
1709   case TGSI_TEXTURE_SHADOWRECT:
1710
1711      FETCH(&r[0], 0, CHAN_X);
1712      FETCH(&r[1], 0, CHAN_Y);
1713      FETCH(&r[2], 0, CHAN_Z);
1714
1715      if (projected) {
1716         FETCH(&r[3], 0, CHAN_W);
1717         micro_div( &r[0], &r[0], &r[3] );
1718         micro_div( &r[1], &r[1], &r[3] );
1719         micro_div( &r[2], &r[2], &r[3] );
1720      }
1721
1722      if (biasLod) {
1723         FETCH(&r[3], 0, CHAN_W);
1724         lodBias = r[3].f[0];
1725      }
1726      else
1727         lodBias = 0.0;
1728
1729      fetch_texel(mach->Samplers[unit],
1730                  &r[0], &r[1], &r[2], lodBias,  /* inputs */
1731                  &r[0], &r[1], &r[2], &r[3]);  /* outputs */
1732      break;
1733
1734   case TGSI_TEXTURE_3D:
1735   case TGSI_TEXTURE_CUBE:
1736
1737      FETCH(&r[0], 0, CHAN_X);
1738      FETCH(&r[1], 0, CHAN_Y);
1739      FETCH(&r[2], 0, CHAN_Z);
1740
1741      if (projected) {
1742         FETCH(&r[3], 0, CHAN_W);
1743         micro_div( &r[0], &r[0], &r[3] );
1744         micro_div( &r[1], &r[1], &r[3] );
1745         micro_div( &r[2], &r[2], &r[3] );
1746      }
1747
1748      if (biasLod) {
1749         FETCH(&r[3], 0, CHAN_W);
1750         lodBias = r[3].f[0];
1751      }
1752      else
1753         lodBias = 0.0;
1754
1755      fetch_texel(mach->Samplers[unit],
1756                  &r[0], &r[1], &r[2], lodBias,
1757                  &r[0], &r[1], &r[2], &r[3]);
1758      break;
1759
1760   default:
1761      assert (0);
1762   }
1763
1764   FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
1765      STORE( &r[chan_index], 0, chan_index );
1766   }
1767}
1768
1769static void
1770exec_txd(struct tgsi_exec_machine *mach,
1771         const struct tgsi_full_instruction *inst)
1772{
1773   const uint unit = inst->Src[3].Register.Index;
1774   union tgsi_exec_channel r[4];
1775   uint chan_index;
1776
1777   /*
1778    * XXX: This is fake TXD -- the derivatives are not taken into account, yet.
1779    */
1780
1781   switch (inst->Texture.Texture) {
1782   case TGSI_TEXTURE_1D:
1783   case TGSI_TEXTURE_SHADOW1D:
1784
1785      FETCH(&r[0], 0, CHAN_X);
1786
1787      fetch_texel(mach->Samplers[unit],
1788                  &r[0], &ZeroVec, &ZeroVec, 0.0f,  /* S, T, P, BIAS */
1789                  &r[0], &r[1], &r[2], &r[3]);      /* R, G, B, A */
1790      break;
1791
1792   case TGSI_TEXTURE_2D:
1793   case TGSI_TEXTURE_RECT:
1794   case TGSI_TEXTURE_SHADOW2D:
1795   case TGSI_TEXTURE_SHADOWRECT:
1796
1797      FETCH(&r[0], 0, CHAN_X);
1798      FETCH(&r[1], 0, CHAN_Y);
1799      FETCH(&r[2], 0, CHAN_Z);
1800
1801      fetch_texel(mach->Samplers[unit],
1802                  &r[0], &r[1], &r[2], 0.0f,    /* inputs */
1803                  &r[0], &r[1], &r[2], &r[3]);  /* outputs */
1804      break;
1805
1806   case TGSI_TEXTURE_3D:
1807   case TGSI_TEXTURE_CUBE:
1808
1809      FETCH(&r[0], 0, CHAN_X);
1810      FETCH(&r[1], 0, CHAN_Y);
1811      FETCH(&r[2], 0, CHAN_Z);
1812
1813      fetch_texel(mach->Samplers[unit],
1814                  &r[0], &r[1], &r[2], 0.0f,
1815                  &r[0], &r[1], &r[2], &r[3]);
1816      break;
1817
1818   default:
1819      assert(0);
1820   }
1821
1822   FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
1823      STORE(&r[chan_index], 0, chan_index);
1824   }
1825}
1826
1827
1828/**
1829 * Evaluate a constant-valued coefficient at the position of the
1830 * current quad.
1831 */
1832static void
1833eval_constant_coef(
1834   struct tgsi_exec_machine *mach,
1835   unsigned attrib,
1836   unsigned chan )
1837{
1838   unsigned i;
1839
1840   for( i = 0; i < QUAD_SIZE; i++ ) {
1841      mach->Inputs[attrib].xyzw[chan].f[i] = mach->InterpCoefs[attrib].a0[chan];
1842   }
1843}
1844
1845/**
1846 * Evaluate a linear-valued coefficient at the position of the
1847 * current quad.
1848 */
1849static void
1850eval_linear_coef(
1851   struct tgsi_exec_machine *mach,
1852   unsigned attrib,
1853   unsigned chan )
1854{
1855   const float x = mach->QuadPos.xyzw[0].f[0];
1856   const float y = mach->QuadPos.xyzw[1].f[0];
1857   const float dadx = mach->InterpCoefs[attrib].dadx[chan];
1858   const float dady = mach->InterpCoefs[attrib].dady[chan];
1859   const float a0 = mach->InterpCoefs[attrib].a0[chan] + dadx * x + dady * y;
1860   mach->Inputs[attrib].xyzw[chan].f[0] = a0;
1861   mach->Inputs[attrib].xyzw[chan].f[1] = a0 + dadx;
1862   mach->Inputs[attrib].xyzw[chan].f[2] = a0 + dady;
1863   mach->Inputs[attrib].xyzw[chan].f[3] = a0 + dadx + dady;
1864}
1865
1866/**
1867 * Evaluate a perspective-valued coefficient at the position of the
1868 * current quad.
1869 */
1870static void
1871eval_perspective_coef(
1872   struct tgsi_exec_machine *mach,
1873   unsigned attrib,
1874   unsigned chan )
1875{
1876   const float x = mach->QuadPos.xyzw[0].f[0];
1877   const float y = mach->QuadPos.xyzw[1].f[0];
1878   const float dadx = mach->InterpCoefs[attrib].dadx[chan];
1879   const float dady = mach->InterpCoefs[attrib].dady[chan];
1880   const float a0 = mach->InterpCoefs[attrib].a0[chan] + dadx * x + dady * y;
1881   const float *w = mach->QuadPos.xyzw[3].f;
1882   /* divide by W here */
1883   mach->Inputs[attrib].xyzw[chan].f[0] = a0 / w[0];
1884   mach->Inputs[attrib].xyzw[chan].f[1] = (a0 + dadx) / w[1];
1885   mach->Inputs[attrib].xyzw[chan].f[2] = (a0 + dady) / w[2];
1886   mach->Inputs[attrib].xyzw[chan].f[3] = (a0 + dadx + dady) / w[3];
1887}
1888
1889
/**
 * Signature shared by the eval_*_coef() functions: evaluate channel
 * 'chan' of input attribute 'attrib' for the current quad.
 */
typedef void (* eval_coef_func)( struct tgsi_exec_machine *mach,
                                 unsigned attrib,
                                 unsigned chan );
1894
1895static void
1896exec_declaration(struct tgsi_exec_machine *mach,
1897                 const struct tgsi_full_declaration *decl)
1898{
1899   if (mach->Processor == TGSI_PROCESSOR_FRAGMENT) {
1900      if (decl->Declaration.File == TGSI_FILE_INPUT ||
1901          decl->Declaration.File == TGSI_FILE_SYSTEM_VALUE) {
1902         uint first, last, mask;
1903
1904         first = decl->Range.First;
1905         last = decl->Range.Last;
1906         mask = decl->Declaration.UsageMask;
1907
1908         if (decl->Semantic.Name == TGSI_SEMANTIC_POSITION) {
1909            assert(decl->Semantic.Index == 0);
1910            assert(first == last);
1911            assert(mask = TGSI_WRITEMASK_XYZW);
1912
1913            mach->Inputs[first] = mach->QuadPos;
1914         } else if (decl->Semantic.Name == TGSI_SEMANTIC_FACE) {
1915            uint i;
1916
1917            assert(decl->Semantic.Index == 0);
1918            assert(first == last);
1919
1920            for (i = 0; i < QUAD_SIZE; i++) {
1921               mach->Inputs[first].xyzw[0].f[i] = mach->Face;
1922            }
1923         } else {
1924            eval_coef_func eval;
1925            uint i, j;
1926
1927            switch (decl->Declaration.Interpolate) {
1928            case TGSI_INTERPOLATE_CONSTANT:
1929               eval = eval_constant_coef;
1930               break;
1931
1932            case TGSI_INTERPOLATE_LINEAR:
1933               eval = eval_linear_coef;
1934               break;
1935
1936            case TGSI_INTERPOLATE_PERSPECTIVE:
1937               eval = eval_perspective_coef;
1938               break;
1939
1940            default:
1941               assert(0);
1942               return;
1943            }
1944
1945            for (j = 0; j < NUM_CHANNELS; j++) {
1946               if (mask & (1 << j)) {
1947                  for (i = first; i <= last; i++) {
1948                     eval(mach, i, j);
1949                  }
1950               }
1951            }
1952         }
1953      }
1954   }
1955}
1956
1957static void
1958exec_instruction(
1959   struct tgsi_exec_machine *mach,
1960   const struct tgsi_full_instruction *inst,
1961   int *pc )
1962{
1963   uint chan_index;
1964   union tgsi_exec_channel r[10];
1965   union tgsi_exec_channel d[8];
1966
1967   (*pc)++;
1968
1969   switch (inst->Instruction.Opcode) {
1970   case TGSI_OPCODE_ARL:
1971   case TGSI_OPCODE_FLR:
1972      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
1973         FETCH( &r[0], 0, chan_index );
1974         micro_flr(&d[chan_index], &r[0]);
1975      }
1976      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
1977         STORE(&d[chan_index], 0, chan_index);
1978      }
1979      break;
1980
1981   case TGSI_OPCODE_MOV:
1982      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
1983         FETCH(&d[chan_index], 0, chan_index);
1984      }
1985      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
1986         STORE(&d[chan_index], 0, chan_index);
1987      }
1988      break;
1989
1990   case TGSI_OPCODE_LIT:
1991      if (IS_CHANNEL_ENABLED( *inst, CHAN_Y ) || IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1992         FETCH( &r[0], 0, CHAN_X );
1993         if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1994            micro_max(&d[CHAN_Y], &r[0], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C]);
1995         }
1996
1997         if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1998            FETCH( &r[1], 0, CHAN_Y );
1999            micro_max( &r[1], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] );
2000
2001            FETCH( &r[2], 0, CHAN_W );
2002            micro_min( &r[2], &r[2], &mach->Temps[TEMP_128_I].xyzw[TEMP_128_C] );
2003            micro_max( &r[2], &r[2], &mach->Temps[TEMP_M128_I].xyzw[TEMP_M128_C] );
2004            micro_pow( &r[1], &r[1], &r[2] );
2005            micro_lt(&d[CHAN_Z], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &r[0], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C]);
2006         }
2007
2008         if (IS_CHANNEL_ENABLED(*inst, CHAN_Y)) {
2009            STORE(&d[CHAN_Y], 0, CHAN_Y);
2010         }
2011         if (IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2012            STORE(&d[CHAN_Z], 0, CHAN_Z);
2013         }
2014      }
2015      if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) {
2016         STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_X );
2017      }
2018      if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
2019         STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
2020      }
2021      break;
2022
2023   case TGSI_OPCODE_RCP:
2024   /* TGSI_OPCODE_RECIP */
2025      FETCH( &r[0], 0, CHAN_X );
2026      micro_div( &r[0], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &r[0] );
2027      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2028         STORE( &r[0], 0, chan_index );
2029      }
2030      break;
2031
2032   case TGSI_OPCODE_RSQ:
2033   /* TGSI_OPCODE_RECIPSQRT */
2034      FETCH( &r[0], 0, CHAN_X );
2035      micro_abs( &r[0], &r[0] );
2036      micro_sqrt( &r[0], &r[0] );
2037      micro_div( &r[0], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &r[0] );
2038      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2039         STORE( &r[0], 0, chan_index );
2040      }
2041      break;
2042
2043   case TGSI_OPCODE_EXP:
2044      FETCH( &r[0], 0, CHAN_X );
2045      micro_flr( &r[1], &r[0] );  /* r1 = floor(r0) */
2046      if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) {
2047         micro_exp2( &r[2], &r[1] );       /* r2 = 2 ^ r1 */
2048         STORE( &r[2], 0, CHAN_X );        /* store r2 */
2049      }
2050      if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
2051         micro_sub( &r[2], &r[0], &r[1] ); /* r2 = r0 - r1 */
2052         STORE( &r[2], 0, CHAN_Y );        /* store r2 */
2053      }
2054      if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
2055         micro_exp2( &r[2], &r[0] );       /* r2 = 2 ^ r0 */
2056         STORE( &r[2], 0, CHAN_Z );        /* store r2 */
2057      }
2058      if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
2059         STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
2060      }
2061      break;
2062
2063   case TGSI_OPCODE_LOG:
2064      FETCH( &r[0], 0, CHAN_X );
2065      micro_abs( &r[2], &r[0] );  /* r2 = abs(r0) */
2066      micro_lg2( &r[1], &r[2] );  /* r1 = lg2(r2) */
2067      micro_flr( &r[0], &r[1] );  /* r0 = floor(r1) */
2068      if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) {
2069         STORE( &r[0], 0, CHAN_X );
2070      }
2071      if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
2072         micro_exp2( &r[0], &r[0] );       /* r0 = 2 ^ r0 */
2073         micro_div( &r[0], &r[2], &r[0] ); /* r0 = r2 / r0 */
2074         STORE( &r[0], 0, CHAN_Y );
2075      }
2076      if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
2077         STORE( &r[1], 0, CHAN_Z );
2078      }
2079      if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
2080         STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
2081      }
2082      break;
2083
2084   case TGSI_OPCODE_MUL:
2085      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2086         FETCH(&r[0], 0, chan_index);
2087         FETCH(&r[1], 1, chan_index);
2088         micro_mul(&d[chan_index], &r[0], &r[1]);
2089      }
2090      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2091         STORE(&d[chan_index], 0, chan_index);
2092      }
2093      break;
2094
2095   case TGSI_OPCODE_ADD:
2096      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2097         FETCH( &r[0], 0, chan_index );
2098         FETCH( &r[1], 1, chan_index );
2099         micro_add(&d[chan_index], &r[0], &r[1]);
2100      }
2101      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2102         STORE(&d[chan_index], 0, chan_index);
2103      }
2104      break;
2105
2106   case TGSI_OPCODE_DP3:
2107   /* TGSI_OPCODE_DOT3 */
2108      FETCH( &r[0], 0, CHAN_X );
2109      FETCH( &r[1], 1, CHAN_X );
2110      micro_mul( &r[0], &r[0], &r[1] );
2111
2112      FETCH( &r[1], 0, CHAN_Y );
2113      FETCH( &r[2], 1, CHAN_Y );
2114      micro_mul( &r[1], &r[1], &r[2] );
2115      micro_add( &r[0], &r[0], &r[1] );
2116
2117      FETCH( &r[1], 0, CHAN_Z );
2118      FETCH( &r[2], 1, CHAN_Z );
2119      micro_mul( &r[1], &r[1], &r[2] );
2120      micro_add( &r[0], &r[0], &r[1] );
2121
2122      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2123         STORE( &r[0], 0, chan_index );
2124      }
2125      break;
2126
2127    case TGSI_OPCODE_DP4:
2128    /* TGSI_OPCODE_DOT4 */
2129       FETCH(&r[0], 0, CHAN_X);
2130       FETCH(&r[1], 1, CHAN_X);
2131
2132       micro_mul( &r[0], &r[0], &r[1] );
2133
2134       FETCH(&r[1], 0, CHAN_Y);
2135       FETCH(&r[2], 1, CHAN_Y);
2136
2137       micro_mul( &r[1], &r[1], &r[2] );
2138       micro_add( &r[0], &r[0], &r[1] );
2139
2140       FETCH(&r[1], 0, CHAN_Z);
2141       FETCH(&r[2], 1, CHAN_Z);
2142
2143       micro_mul( &r[1], &r[1], &r[2] );
2144       micro_add( &r[0], &r[0], &r[1] );
2145
2146       FETCH(&r[1], 0, CHAN_W);
2147       FETCH(&r[2], 1, CHAN_W);
2148
2149       micro_mul( &r[1], &r[1], &r[2] );
2150       micro_add( &r[0], &r[0], &r[1] );
2151
2152      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2153         STORE( &r[0], 0, chan_index );
2154      }
2155      break;
2156
2157   case TGSI_OPCODE_DST:
2158      if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
2159         FETCH( &r[0], 0, CHAN_Y );
2160         FETCH( &r[1], 1, CHAN_Y);
2161         micro_mul(&d[CHAN_Y], &r[0], &r[1]);
2162      }
2163      if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
2164         FETCH(&d[CHAN_Z], 0, CHAN_Z);
2165      }
2166      if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
2167         FETCH(&d[CHAN_W], 1, CHAN_W);
2168      }
2169
2170      if (IS_CHANNEL_ENABLED(*inst, CHAN_X)) {
2171         STORE(&mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_X);
2172      }
2173      if (IS_CHANNEL_ENABLED(*inst, CHAN_Y)) {
2174         STORE(&d[CHAN_Y], 0, CHAN_Y);
2175      }
2176      if (IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2177         STORE(&d[CHAN_Z], 0, CHAN_Z);
2178      }
2179      if (IS_CHANNEL_ENABLED(*inst, CHAN_W)) {
2180         STORE(&d[CHAN_W], 0, CHAN_W);
2181      }
2182      break;
2183
2184   case TGSI_OPCODE_MIN:
2185      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2186         FETCH(&r[0], 0, chan_index);
2187         FETCH(&r[1], 1, chan_index);
2188
2189         /* XXX use micro_min()?? */
2190         micro_lt(&d[chan_index], &r[0], &r[1], &r[0], &r[1]);
2191      }
2192      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2193         STORE(&d[chan_index], 0, chan_index);
2194      }
2195      break;
2196
2197   case TGSI_OPCODE_MAX:
2198      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2199         FETCH(&r[0], 0, chan_index);
2200         FETCH(&r[1], 1, chan_index);
2201
2202         /* XXX use micro_max()?? */
2203         micro_lt(&d[chan_index], &r[0], &r[1], &r[1], &r[0] );
2204      }
2205      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2206         STORE(&d[chan_index], 0, chan_index);
2207      }
2208      break;
2209
2210   case TGSI_OPCODE_SLT:
2211   /* TGSI_OPCODE_SETLT */
2212      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2213         FETCH( &r[0], 0, chan_index );
2214         FETCH( &r[1], 1, chan_index );
2215         micro_lt(&d[chan_index], &r[0], &r[1], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C]);
2216      }
2217      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2218         STORE(&d[chan_index], 0, chan_index);
2219      }
2220      break;
2221
2222   case TGSI_OPCODE_SGE:
2223   /* TGSI_OPCODE_SETGE */
2224      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2225         FETCH( &r[0], 0, chan_index );
2226         FETCH( &r[1], 1, chan_index );
2227         micro_le(&d[chan_index], &r[1], &r[0], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C]);
2228      }
2229      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2230         STORE(&d[chan_index], 0, chan_index);
2231      }
2232      break;
2233
2234   case TGSI_OPCODE_MAD:
2235   /* TGSI_OPCODE_MADD */
2236      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2237         FETCH( &r[0], 0, chan_index );
2238         FETCH( &r[1], 1, chan_index );
2239         micro_mul( &r[0], &r[0], &r[1] );
2240         FETCH( &r[1], 2, chan_index );
2241         micro_add(&d[chan_index], &r[0], &r[1]);
2242      }
2243      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2244         STORE(&d[chan_index], 0, chan_index);
2245      }
2246      break;
2247
2248   case TGSI_OPCODE_SUB:
2249      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2250         FETCH(&r[0], 0, chan_index);
2251         FETCH(&r[1], 1, chan_index);
2252         micro_sub(&d[chan_index], &r[0], &r[1]);
2253      }
2254      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2255         STORE(&d[chan_index], 0, chan_index);
2256      }
2257      break;
2258
2259   case TGSI_OPCODE_LRP:
2260      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2261         FETCH(&r[0], 0, chan_index);
2262         FETCH(&r[1], 1, chan_index);
2263         FETCH(&r[2], 2, chan_index);
2264         micro_sub( &r[1], &r[1], &r[2] );
2265         micro_mul( &r[0], &r[0], &r[1] );
2266         micro_add(&d[chan_index], &r[0], &r[2]);
2267      }
2268      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2269         STORE(&d[chan_index], 0, chan_index);
2270      }
2271      break;
2272
2273   case TGSI_OPCODE_CND:
2274      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2275         FETCH(&r[0], 0, chan_index);
2276         FETCH(&r[1], 1, chan_index);
2277         FETCH(&r[2], 2, chan_index);
2278         micro_lt(&d[chan_index], &mach->Temps[TEMP_HALF_I].xyzw[TEMP_HALF_C], &r[2], &r[0], &r[1]);
2279      }
2280      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2281         STORE(&d[chan_index], 0, chan_index);
2282      }
2283      break;
2284
2285   case TGSI_OPCODE_DP2A:
2286      FETCH( &r[0], 0, CHAN_X );
2287      FETCH( &r[1], 1, CHAN_X );
2288      micro_mul( &r[0], &r[0], &r[1] );
2289
2290      FETCH( &r[1], 0, CHAN_Y );
2291      FETCH( &r[2], 1, CHAN_Y );
2292      micro_mul( &r[1], &r[1], &r[2] );
2293      micro_add( &r[0], &r[0], &r[1] );
2294
2295      FETCH( &r[2], 2, CHAN_X );
2296      micro_add( &r[0], &r[0], &r[2] );
2297
2298      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2299         STORE( &r[0], 0, chan_index );
2300      }
2301      break;
2302
2303   case TGSI_OPCODE_FRC:
2304      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2305         FETCH( &r[0], 0, chan_index );
2306         micro_frc(&d[chan_index], &r[0]);
2307      }
2308      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2309         STORE(&d[chan_index], 0, chan_index);
2310      }
2311      break;
2312
2313   case TGSI_OPCODE_CLAMP:
2314      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2315         FETCH(&r[0], 0, chan_index);
2316         FETCH(&r[1], 1, chan_index);
2317         micro_max(&r[0], &r[0], &r[1]);
2318         FETCH(&r[1], 2, chan_index);
2319         micro_min(&d[chan_index], &r[0], &r[1]);
2320      }
2321      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2322         STORE(&d[chan_index], 0, chan_index);
2323      }
2324      break;
2325
2326   case TGSI_OPCODE_ROUND:
2327   case TGSI_OPCODE_ARR:
2328      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2329         FETCH( &r[0], 0, chan_index );
2330         micro_rnd(&d[chan_index], &r[0]);
2331      }
2332      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2333         STORE(&d[chan_index], 0, chan_index);
2334      }
2335      break;
2336
2337   case TGSI_OPCODE_EX2:
2338      FETCH(&r[0], 0, CHAN_X);
2339
2340      micro_exp2( &r[0], &r[0] );
2341
2342      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2343         STORE( &r[0], 0, chan_index );
2344      }
2345      break;
2346
2347   case TGSI_OPCODE_LG2:
2348      FETCH( &r[0], 0, CHAN_X );
2349      micro_lg2( &r[0], &r[0] );
2350      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2351         STORE( &r[0], 0, chan_index );
2352      }
2353      break;
2354
2355   case TGSI_OPCODE_POW:
2356      FETCH(&r[0], 0, CHAN_X);
2357      FETCH(&r[1], 1, CHAN_X);
2358
2359      micro_pow( &r[0], &r[0], &r[1] );
2360
2361      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2362         STORE( &r[0], 0, chan_index );
2363      }
2364      break;
2365
2366   case TGSI_OPCODE_XPD:
2367      FETCH(&r[0], 0, CHAN_Y);
2368      FETCH(&r[1], 1, CHAN_Z);
2369
2370      micro_mul( &r[2], &r[0], &r[1] );
2371
2372      FETCH(&r[3], 0, CHAN_Z);
2373      FETCH(&r[4], 1, CHAN_Y);
2374
2375      micro_mul( &r[5], &r[3], &r[4] );
2376      micro_sub(&d[CHAN_X], &r[2], &r[5]);
2377
2378      FETCH(&r[2], 1, CHAN_X);
2379
2380      micro_mul( &r[3], &r[3], &r[2] );
2381
2382      FETCH(&r[5], 0, CHAN_X);
2383
2384      micro_mul( &r[1], &r[1], &r[5] );
2385      micro_sub(&d[CHAN_Y], &r[3], &r[1]);
2386
2387      micro_mul( &r[5], &r[5], &r[4] );
2388      micro_mul( &r[0], &r[0], &r[2] );
2389      micro_sub(&d[CHAN_Z], &r[5], &r[0]);
2390
2391      if (IS_CHANNEL_ENABLED(*inst, CHAN_X)) {
2392         STORE(&d[CHAN_X], 0, CHAN_X);
2393      }
2394      if (IS_CHANNEL_ENABLED(*inst, CHAN_Y)) {
2395         STORE(&d[CHAN_Y], 0, CHAN_Y);
2396      }
2397      if (IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2398         STORE(&d[CHAN_Z], 0, CHAN_Z);
2399      }
2400      if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
2401         STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
2402      }
2403      break;
2404
2405    case TGSI_OPCODE_ABS:
2406       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2407          FETCH(&r[0], 0, chan_index);
2408          micro_abs(&d[chan_index], &r[0]);
2409       }
2410       FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2411         STORE(&d[chan_index], 0, chan_index);
2412      }
2413       break;
2414
2415   case TGSI_OPCODE_RCC:
2416      FETCH(&r[0], 0, CHAN_X);
2417      micro_div(&r[0], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &r[0]);
2418      micro_float_clamp(&r[0], &r[0]);
2419      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2420         STORE(&r[0], 0, chan_index);
2421      }
2422      break;
2423
2424   case TGSI_OPCODE_DPH:
2425      FETCH(&r[0], 0, CHAN_X);
2426      FETCH(&r[1], 1, CHAN_X);
2427
2428      micro_mul( &r[0], &r[0], &r[1] );
2429
2430      FETCH(&r[1], 0, CHAN_Y);
2431      FETCH(&r[2], 1, CHAN_Y);
2432
2433      micro_mul( &r[1], &r[1], &r[2] );
2434      micro_add( &r[0], &r[0], &r[1] );
2435
2436      FETCH(&r[1], 0, CHAN_Z);
2437      FETCH(&r[2], 1, CHAN_Z);
2438
2439      micro_mul( &r[1], &r[1], &r[2] );
2440      micro_add( &r[0], &r[0], &r[1] );
2441
2442      FETCH(&r[1], 1, CHAN_W);
2443
2444      micro_add( &r[0], &r[0], &r[1] );
2445
2446      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2447         STORE( &r[0], 0, chan_index );
2448      }
2449      break;
2450
2451   case TGSI_OPCODE_COS:
2452      FETCH(&r[0], 0, CHAN_X);
2453
2454      micro_cos( &r[0], &r[0] );
2455
2456      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2457         STORE( &r[0], 0, chan_index );
2458      }
2459      break;
2460
2461   case TGSI_OPCODE_DDX:
2462      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2463         FETCH( &r[0], 0, chan_index );
2464         micro_ddx(&d[chan_index], &r[0]);
2465      }
2466      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2467         STORE(&d[chan_index], 0, chan_index);
2468      }
2469      break;
2470
2471   case TGSI_OPCODE_DDY:
2472      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2473         FETCH( &r[0], 0, chan_index );
2474         micro_ddy(&d[chan_index], &r[0]);
2475      }
2476      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2477         STORE(&d[chan_index], 0, chan_index);
2478      }
2479      break;
2480
2481   case TGSI_OPCODE_KILP:
2482      exec_kilp (mach, inst);
2483      break;
2484
2485   case TGSI_OPCODE_KIL:
2486      exec_kil (mach, inst);
2487      break;
2488
2489   case TGSI_OPCODE_PK2H:
2490      assert (0);
2491      break;
2492
2493   case TGSI_OPCODE_PK2US:
2494      assert (0);
2495      break;
2496
2497   case TGSI_OPCODE_PK4B:
2498      assert (0);
2499      break;
2500
2501   case TGSI_OPCODE_PK4UB:
2502      assert (0);
2503      break;
2504
2505   case TGSI_OPCODE_RFL:
2506      if (IS_CHANNEL_ENABLED(*inst, CHAN_X) ||
2507          IS_CHANNEL_ENABLED(*inst, CHAN_Y) ||
2508          IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2509         /* r0 = dp3(src0, src0) */
2510         FETCH(&r[2], 0, CHAN_X);
2511         micro_mul(&r[0], &r[2], &r[2]);
2512         FETCH(&r[4], 0, CHAN_Y);
2513         micro_mul(&r[8], &r[4], &r[4]);
2514         micro_add(&r[0], &r[0], &r[8]);
2515         FETCH(&r[6], 0, CHAN_Z);
2516         micro_mul(&r[8], &r[6], &r[6]);
2517         micro_add(&r[0], &r[0], &r[8]);
2518
2519         /* r1 = dp3(src0, src1) */
2520         FETCH(&r[3], 1, CHAN_X);
2521         micro_mul(&r[1], &r[2], &r[3]);
2522         FETCH(&r[5], 1, CHAN_Y);
2523         micro_mul(&r[8], &r[4], &r[5]);
2524         micro_add(&r[1], &r[1], &r[8]);
2525         FETCH(&r[7], 1, CHAN_Z);
2526         micro_mul(&r[8], &r[6], &r[7]);
2527         micro_add(&r[1], &r[1], &r[8]);
2528
2529         /* r1 = 2 * r1 / r0 */
2530         micro_add(&r[1], &r[1], &r[1]);
2531         micro_div(&r[1], &r[1], &r[0]);
2532
2533         if (IS_CHANNEL_ENABLED(*inst, CHAN_X)) {
2534            micro_mul(&r[2], &r[2], &r[1]);
2535            micro_sub(&r[2], &r[2], &r[3]);
2536            STORE(&r[2], 0, CHAN_X);
2537         }
2538         if (IS_CHANNEL_ENABLED(*inst, CHAN_Y)) {
2539            micro_mul(&r[4], &r[4], &r[1]);
2540            micro_sub(&r[4], &r[4], &r[5]);
2541            STORE(&r[4], 0, CHAN_Y);
2542         }
2543         if (IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2544            micro_mul(&r[6], &r[6], &r[1]);
2545            micro_sub(&r[6], &r[6], &r[7]);
2546            STORE(&r[6], 0, CHAN_Z);
2547         }
2548      }
2549      if (IS_CHANNEL_ENABLED(*inst, CHAN_W)) {
2550         STORE(&mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W);
2551      }
2552      break;
2553
2554   case TGSI_OPCODE_SEQ:
2555      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2556         FETCH( &r[0], 0, chan_index );
2557         FETCH( &r[1], 1, chan_index );
2558         micro_eq(&d[chan_index], &r[0], &r[1], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C]);
2559      }
2560      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2561         STORE(&d[chan_index], 0, chan_index);
2562      }
2563      break;
2564
2565   case TGSI_OPCODE_SFL:
2566      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2567         STORE(&mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], 0, chan_index);
2568      }
2569      break;
2570
2571   case TGSI_OPCODE_SGT:
2572      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2573         FETCH( &r[0], 0, chan_index );
2574         FETCH( &r[1], 1, chan_index );
2575         micro_le(&d[chan_index], &r[0], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C]);
2576      }
2577      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2578         STORE(&d[chan_index], 0, chan_index);
2579      }
2580      break;
2581
2582   case TGSI_OPCODE_SIN:
2583      FETCH( &r[0], 0, CHAN_X );
2584      micro_sin( &r[0], &r[0] );
2585      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2586         STORE( &r[0], 0, chan_index );
2587      }
2588      break;
2589
2590   case TGSI_OPCODE_SLE:
2591      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2592         FETCH( &r[0], 0, chan_index );
2593         FETCH( &r[1], 1, chan_index );
2594         micro_le(&d[chan_index], &r[0], &r[1], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C]);
2595      }
2596      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2597         STORE(&d[chan_index], 0, chan_index);
2598      }
2599      break;
2600
2601   case TGSI_OPCODE_SNE:
2602      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2603         FETCH( &r[0], 0, chan_index );
2604         FETCH( &r[1], 1, chan_index );
2605         micro_eq(&d[chan_index], &r[0], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C]);
2606      }
2607      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2608         STORE(&d[chan_index], 0, chan_index);
2609      }
2610      break;
2611
2612   case TGSI_OPCODE_STR:
2613      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2614         STORE(&mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, chan_index);
2615      }
2616      break;
2617
2618   case TGSI_OPCODE_TEX:
2619      /* simple texture lookup */
2620      /* src[0] = texcoord */
2621      /* src[1] = sampler unit */
2622      exec_tex(mach, inst, FALSE, FALSE);
2623      break;
2624
2625   case TGSI_OPCODE_TXB:
2626      /* Texture lookup with lod bias */
2627      /* src[0] = texcoord (src[0].w = LOD bias) */
2628      /* src[1] = sampler unit */
2629      exec_tex(mach, inst, TRUE, FALSE);
2630      break;
2631
2632   case TGSI_OPCODE_TXD:
2633      /* Texture lookup with explicit partial derivatives */
2634      /* src[0] = texcoord */
2635      /* src[1] = d[strq]/dx */
2636      /* src[2] = d[strq]/dy */
2637      /* src[3] = sampler unit */
2638      exec_txd(mach, inst);
2639      break;
2640
2641   case TGSI_OPCODE_TXL:
2642      /* Texture lookup with explicit LOD */
2643      /* src[0] = texcoord (src[0].w = LOD) */
2644      /* src[1] = sampler unit */
2645      exec_tex(mach, inst, TRUE, FALSE);
2646      break;
2647
2648   case TGSI_OPCODE_TXP:
2649      /* Texture lookup with projection */
2650      /* src[0] = texcoord (src[0].w = projection) */
2651      /* src[1] = sampler unit */
2652      exec_tex(mach, inst, FALSE, TRUE);
2653      break;
2654
2655   case TGSI_OPCODE_UP2H:
2656      assert (0);
2657      break;
2658
2659   case TGSI_OPCODE_UP2US:
2660      assert (0);
2661      break;
2662
2663   case TGSI_OPCODE_UP4B:
2664      assert (0);
2665      break;
2666
2667   case TGSI_OPCODE_UP4UB:
2668      assert (0);
2669      break;
2670
2671   case TGSI_OPCODE_X2D:
2672      FETCH(&r[0], 1, CHAN_X);
2673      FETCH(&r[1], 1, CHAN_Y);
2674      if (IS_CHANNEL_ENABLED(*inst, CHAN_X) ||
2675          IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2676         FETCH(&r[2], 2, CHAN_X);
2677         micro_mul(&r[2], &r[2], &r[0]);
2678         FETCH(&r[3], 2, CHAN_Y);
2679         micro_mul(&r[3], &r[3], &r[1]);
2680         micro_add(&r[2], &r[2], &r[3]);
2681         FETCH(&r[3], 0, CHAN_X);
2682         micro_add(&d[CHAN_X], &r[2], &r[3]);
2683
2684      }
2685      if (IS_CHANNEL_ENABLED(*inst, CHAN_Y) ||
2686          IS_CHANNEL_ENABLED(*inst, CHAN_W)) {
2687         FETCH(&r[2], 2, CHAN_Z);
2688         micro_mul(&r[2], &r[2], &r[0]);
2689         FETCH(&r[3], 2, CHAN_W);
2690         micro_mul(&r[3], &r[3], &r[1]);
2691         micro_add(&r[2], &r[2], &r[3]);
2692         FETCH(&r[3], 0, CHAN_Y);
2693         micro_add(&d[CHAN_Y], &r[2], &r[3]);
2694
2695      }
2696      if (IS_CHANNEL_ENABLED(*inst, CHAN_X)) {
2697         STORE(&d[CHAN_X], 0, CHAN_X);
2698      }
2699      if (IS_CHANNEL_ENABLED(*inst, CHAN_Y)) {
2700         STORE(&d[CHAN_Y], 0, CHAN_Y);
2701      }
2702      if (IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2703         STORE(&d[CHAN_X], 0, CHAN_Z);
2704      }
2705      if (IS_CHANNEL_ENABLED(*inst, CHAN_W)) {
2706         STORE(&d[CHAN_Y], 0, CHAN_W);
2707      }
2708      break;
2709
2710   case TGSI_OPCODE_ARA:
2711      assert (0);
2712      break;
2713
2714   case TGSI_OPCODE_BRA:
2715      assert (0);
2716      break;
2717
2718   case TGSI_OPCODE_CAL:
2719      /* skip the call if no execution channels are enabled */
2720      if (mach->ExecMask) {
2721         /* do the call */
2722
2723         /* First, record the depths of the execution stacks.
2724          * This is important for deeply nested/looped return statements.
2725          * We have to unwind the stacks by the correct amount.  For a
2726          * real code generator, we could determine the number of entries
2727          * to pop off each stack with simple static analysis and avoid
2728          * implementing this data structure at run time.
2729          */
2730         mach->CallStack[mach->CallStackTop].CondStackTop = mach->CondStackTop;
2731         mach->CallStack[mach->CallStackTop].LoopStackTop = mach->LoopStackTop;
2732         mach->CallStack[mach->CallStackTop].ContStackTop = mach->ContStackTop;
2733         /* note that PC was already incremented above */
2734         mach->CallStack[mach->CallStackTop].ReturnAddr = *pc;
2735
2736         mach->CallStackTop++;
2737
2738         /* Second, push the Cond, Loop, Cont, Func stacks */
2739         assert(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING);
2740         mach->CondStack[mach->CondStackTop++] = mach->CondMask;
2741         assert(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
2742         mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask;
2743         assert(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
2744         mach->ContStack[mach->ContStackTop++] = mach->ContMask;
2745         assert(mach->FuncStackTop < TGSI_EXEC_MAX_CALL_NESTING);
2746         mach->FuncStack[mach->FuncStackTop++] = mach->FuncMask;
2747
2748         /* Finally, jump to the subroutine */
2749         *pc = inst->Label.Label;
2750      }
2751      break;
2752
2753   case TGSI_OPCODE_RET:
2754      mach->FuncMask &= ~mach->ExecMask;
2755      UPDATE_EXEC_MASK(mach);
2756
2757      if (mach->FuncMask == 0x0) {
2758         /* really return now (otherwise, keep executing) */
2759
2760         if (mach->CallStackTop == 0) {
2761            /* returning from main() */
2762            *pc = -1;
2763            return;
2764         }
2765
2766         assert(mach->CallStackTop > 0);
2767         mach->CallStackTop--;
2768
2769         mach->CondStackTop = mach->CallStack[mach->CallStackTop].CondStackTop;
2770         mach->CondMask = mach->CondStack[mach->CondStackTop];
2771
2772         mach->LoopStackTop = mach->CallStack[mach->CallStackTop].LoopStackTop;
2773         mach->LoopMask = mach->LoopStack[mach->LoopStackTop];
2774
2775         mach->ContStackTop = mach->CallStack[mach->CallStackTop].ContStackTop;
2776         mach->ContMask = mach->ContStack[mach->ContStackTop];
2777
2778         assert(mach->FuncStackTop > 0);
2779         mach->FuncMask = mach->FuncStack[--mach->FuncStackTop];
2780
2781         *pc = mach->CallStack[mach->CallStackTop].ReturnAddr;
2782
2783         UPDATE_EXEC_MASK(mach);
2784      }
2785      break;
2786
2787   case TGSI_OPCODE_SSG:
2788   /* TGSI_OPCODE_SGN */
2789      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2790         FETCH( &r[0], 0, chan_index );
2791         micro_sgn(&d[chan_index], &r[0]);
2792      }
2793      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2794         STORE(&d[chan_index], 0, chan_index);
2795      }
2796      break;
2797
2798   case TGSI_OPCODE_CMP:
2799      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2800         FETCH(&r[0], 0, chan_index);
2801         FETCH(&r[1], 1, chan_index);
2802         FETCH(&r[2], 2, chan_index);
2803         micro_lt(&d[chan_index], &r[0], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &r[1], &r[2]);
2804      }
2805      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2806         STORE(&d[chan_index], 0, chan_index);
2807      }
2808      break;
2809
2810   case TGSI_OPCODE_SCS:
2811      if( IS_CHANNEL_ENABLED( *inst, CHAN_X ) || IS_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
2812         FETCH( &r[0], 0, CHAN_X );
2813         if (IS_CHANNEL_ENABLED(*inst, CHAN_X)) {
2814            micro_cos(&r[1], &r[0]);
2815            STORE(&r[1], 0, CHAN_X);
2816         }
2817         if (IS_CHANNEL_ENABLED(*inst, CHAN_Y)) {
2818            micro_sin(&r[1], &r[0]);
2819            STORE(&r[1], 0, CHAN_Y);
2820         }
2821      }
2822      if( IS_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
2823         STORE( &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], 0, CHAN_Z );
2824      }
2825      if( IS_CHANNEL_ENABLED( *inst, CHAN_W ) ) {
2826         STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
2827      }
2828      break;
2829
2830   case TGSI_OPCODE_NRM:
2831      /* 3-component vector normalize */
2832      if(IS_CHANNEL_ENABLED(*inst, CHAN_X) ||
2833         IS_CHANNEL_ENABLED(*inst, CHAN_Y) ||
2834         IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2835         /* r3 = sqrt(dp3(src0, src0)) */
2836         FETCH(&r[0], 0, CHAN_X);
2837         micro_mul(&r[3], &r[0], &r[0]);
2838         FETCH(&r[1], 0, CHAN_Y);
2839         micro_mul(&r[4], &r[1], &r[1]);
2840         micro_add(&r[3], &r[3], &r[4]);
2841         FETCH(&r[2], 0, CHAN_Z);
2842         micro_mul(&r[4], &r[2], &r[2]);
2843         micro_add(&r[3], &r[3], &r[4]);
2844         micro_sqrt(&r[3], &r[3]);
2845
2846         if (IS_CHANNEL_ENABLED(*inst, CHAN_X)) {
2847            micro_div(&r[0], &r[0], &r[3]);
2848            STORE(&r[0], 0, CHAN_X);
2849         }
2850         if (IS_CHANNEL_ENABLED(*inst, CHAN_Y)) {
2851            micro_div(&r[1], &r[1], &r[3]);
2852            STORE(&r[1], 0, CHAN_Y);
2853         }
2854         if (IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2855            micro_div(&r[2], &r[2], &r[3]);
2856            STORE(&r[2], 0, CHAN_Z);
2857         }
2858      }
2859      if (IS_CHANNEL_ENABLED(*inst, CHAN_W)) {
2860         STORE(&mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W);
2861      }
2862      break;
2863
2864   case TGSI_OPCODE_NRM4:
2865      /* 4-component vector normalize */
2866      {
2867         union tgsi_exec_channel tmp, dot;
2868
2869         /* tmp = dp4(src0, src0): */
2870         FETCH( &r[0], 0, CHAN_X );
2871         micro_mul( &tmp, &r[0], &r[0] );
2872
2873         FETCH( &r[1], 0, CHAN_Y );
2874         micro_mul( &dot, &r[1], &r[1] );
2875         micro_add( &tmp, &tmp, &dot );
2876
2877         FETCH( &r[2], 0, CHAN_Z );
2878         micro_mul( &dot, &r[2], &r[2] );
2879         micro_add( &tmp, &tmp, &dot );
2880
2881         FETCH( &r[3], 0, CHAN_W );
2882         micro_mul( &dot, &r[3], &r[3] );
2883         micro_add( &tmp, &tmp, &dot );
2884
2885         /* tmp = 1 / sqrt(tmp) */
2886         micro_sqrt( &tmp, &tmp );
2887         micro_div( &tmp, &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &tmp );
2888
2889         FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2890            /* chan = chan * tmp */
2891            micro_mul( &r[chan_index], &tmp, &r[chan_index] );
2892            STORE( &r[chan_index], 0, chan_index );
2893         }
2894      }
2895      break;
2896
2897   case TGSI_OPCODE_DIV:
2898      assert( 0 );
2899      break;
2900
2901   case TGSI_OPCODE_DP2:
2902      FETCH( &r[0], 0, CHAN_X );
2903      FETCH( &r[1], 1, CHAN_X );
2904      micro_mul( &r[0], &r[0], &r[1] );
2905
2906      FETCH( &r[1], 0, CHAN_Y );
2907      FETCH( &r[2], 1, CHAN_Y );
2908      micro_mul( &r[1], &r[1], &r[2] );
2909      micro_add( &r[0], &r[0], &r[1] );
2910
2911      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2912         STORE( &r[0], 0, chan_index );
2913      }
2914      break;
2915
2916   case TGSI_OPCODE_IF:
2917      /* push CondMask */
2918      assert(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING);
2919      mach->CondStack[mach->CondStackTop++] = mach->CondMask;
2920      FETCH( &r[0], 0, CHAN_X );
2921      /* update CondMask */
2922      if( ! r[0].u[0] ) {
2923         mach->CondMask &= ~0x1;
2924      }
2925      if( ! r[0].u[1] ) {
2926         mach->CondMask &= ~0x2;
2927      }
2928      if( ! r[0].u[2] ) {
2929         mach->CondMask &= ~0x4;
2930      }
2931      if( ! r[0].u[3] ) {
2932         mach->CondMask &= ~0x8;
2933      }
2934      UPDATE_EXEC_MASK(mach);
2935      /* Todo: If CondMask==0, jump to ELSE */
2936      break;
2937
2938   case TGSI_OPCODE_ELSE:
2939      /* invert CondMask wrt previous mask */
2940      {
2941         uint prevMask;
2942         assert(mach->CondStackTop > 0);
2943         prevMask = mach->CondStack[mach->CondStackTop - 1];
2944         mach->CondMask = ~mach->CondMask & prevMask;
2945         UPDATE_EXEC_MASK(mach);
2946         /* Todo: If CondMask==0, jump to ENDIF */
2947      }
2948      break;
2949
2950   case TGSI_OPCODE_ENDIF:
2951      /* pop CondMask */
2952      assert(mach->CondStackTop > 0);
2953      mach->CondMask = mach->CondStack[--mach->CondStackTop];
2954      UPDATE_EXEC_MASK(mach);
2955      break;
2956
2957   case TGSI_OPCODE_END:
2958      /* halt execution */
2959      *pc = -1;
2960      break;
2961
2962   case TGSI_OPCODE_REP:
2963      assert (0);
2964      break;
2965
2966   case TGSI_OPCODE_ENDREP:
2967       assert (0);
2968       break;
2969
2970   case TGSI_OPCODE_PUSHA:
2971      assert (0);
2972      break;
2973
2974   case TGSI_OPCODE_POPA:
2975      assert (0);
2976      break;
2977
2978   case TGSI_OPCODE_CEIL:
2979      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2980         FETCH( &r[0], 0, chan_index );
2981         micro_ceil(&d[chan_index], &r[0]);
2982      }
2983      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2984         STORE(&d[chan_index], 0, chan_index);
2985      }
2986      break;
2987
2988   case TGSI_OPCODE_I2F:
2989      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2990         FETCH( &r[0], 0, chan_index );
2991         micro_i2f(&d[chan_index], &r[0]);
2992      }
2993      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2994         STORE(&d[chan_index], 0, chan_index);
2995      }
2996      break;
2997
2998   case TGSI_OPCODE_NOT:
2999      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
3000         FETCH( &r[0], 0, chan_index );
3001         micro_not(&d[chan_index], &r[0]);
3002      }
3003      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
3004         STORE(&d[chan_index], 0, chan_index);
3005      }
3006      break;
3007
3008   case TGSI_OPCODE_TRUNC:
3009      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
3010         FETCH( &r[0], 0, chan_index );
3011         micro_trunc(&d[chan_index], &r[0]);
3012      }
3013      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
3014         STORE(&d[chan_index], 0, chan_index);
3015      }
3016      break;
3017
3018   case TGSI_OPCODE_SHL:
3019      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
3020         FETCH( &r[0], 0, chan_index );
3021         FETCH( &r[1], 1, chan_index );
3022         micro_shl(&d[chan_index], &r[0], &r[1]);
3023      }
3024      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
3025         STORE(&d[chan_index], 0, chan_index);
3026      }
3027      break;
3028
3029   case TGSI_OPCODE_SHR:
3030      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
3031         FETCH( &r[0], 0, chan_index );
3032         FETCH( &r[1], 1, chan_index );
3033         micro_ishr(&d[chan_index], &r[0], &r[1]);
3034      }
3035      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
3036         STORE(&d[chan_index], 0, chan_index);
3037      }
3038      break;
3039
3040   case TGSI_OPCODE_AND:
3041      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
3042         FETCH( &r[0], 0, chan_index );
3043         FETCH( &r[1], 1, chan_index );
3044         micro_and(&d[chan_index], &r[0], &r[1]);
3045      }
3046      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
3047         STORE(&d[chan_index], 0, chan_index);
3048      }
3049      break;
3050
3051   case TGSI_OPCODE_OR:
3052      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
3053         FETCH( &r[0], 0, chan_index );
3054         FETCH( &r[1], 1, chan_index );
3055         micro_or(&d[chan_index], &r[0], &r[1]);
3056      }
3057      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
3058         STORE(&d[chan_index], 0, chan_index);
3059      }
3060      break;
3061
3062   case TGSI_OPCODE_MOD:
3063      assert (0);
3064      break;
3065
3066   case TGSI_OPCODE_XOR:
3067      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
3068         FETCH( &r[0], 0, chan_index );
3069         FETCH( &r[1], 1, chan_index );
3070         micro_xor(&d[chan_index], &r[0], &r[1]);
3071      }
3072      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
3073         STORE(&d[chan_index], 0, chan_index);
3074      }
3075      break;
3076
3077   case TGSI_OPCODE_SAD:
3078      assert (0);
3079      break;
3080
3081   case TGSI_OPCODE_TXF:
3082      assert (0);
3083      break;
3084
3085   case TGSI_OPCODE_TXQ:
3086      assert (0);
3087      break;
3088
3089   case TGSI_OPCODE_EMIT:
3090      mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] += 16;
3091      mach->Primitives[mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]]++;
3092      break;
3093
3094   case TGSI_OPCODE_ENDPRIM:
3095      mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]++;
3096      mach->Primitives[mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]] = 0;
3097      break;
3098
3099   case TGSI_OPCODE_BGNFOR:
3100      assert(mach->LoopCounterStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
3101      for (chan_index = 0; chan_index < 3; chan_index++) {
3102         FETCH( &mach->LoopCounterStack[mach->LoopCounterStackTop].xyzw[chan_index], 0, chan_index );
3103      }
3104      ++mach->LoopCounterStackTop;
3105      STORE(&mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_X], 0, CHAN_X);
3106      /* update LoopMask */
3107      if (mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y].f[0] <= 0.0f) {
3108         mach->LoopMask &= ~0x1;
3109      }
3110      if (mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y].f[1] <= 0.0f) {
3111         mach->LoopMask &= ~0x2;
3112      }
3113      if (mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y].f[2] <= 0.0f) {
3114         mach->LoopMask &= ~0x4;
3115      }
3116      if (mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y].f[3] <= 0.0f) {
3117         mach->LoopMask &= ~0x8;
3118      }
3119      /* TODO: if mach->LoopMask == 0, jump to end of loop */
3120      UPDATE_EXEC_MASK(mach);
3121      /* fall-through (for now) */
3122   case TGSI_OPCODE_BGNLOOP:
3123      /* push LoopMask and ContMasks */
3124      assert(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
3125      mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask;
3126      assert(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
3127      mach->ContStack[mach->ContStackTop++] = mach->ContMask;
3128      assert(mach->LoopLabelStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
3129      mach->LoopLabelStack[mach->LoopLabelStackTop++] = *pc - 1;
3130      break;
3131
3132   case TGSI_OPCODE_ENDFOR:
3133      assert(mach->LoopCounterStackTop > 0);
3134      micro_sub(&mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y],
3135                &mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y],
3136                &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C]);
3137      /* update LoopMask */
3138      if (mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y].f[0] <= 0.0f) {
3139         mach->LoopMask &= ~0x1;
3140      }
3141      if (mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y].f[1] <= 0.0f) {
3142         mach->LoopMask &= ~0x2;
3143      }
3144      if (mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y].f[2] <= 0.0f) {
3145         mach->LoopMask &= ~0x4;
3146      }
3147      if (mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y].f[3] <= 0.0f) {
3148         mach->LoopMask &= ~0x8;
3149      }
3150      micro_add(&mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_X],
3151                &mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_X],
3152                &mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Z]);
3153      assert(mach->LoopLabelStackTop > 0);
3154      inst = mach->Instructions + mach->LoopLabelStack[mach->LoopLabelStackTop - 1];
3155      STORE(&mach->LoopCounterStack[mach->LoopCounterStackTop].xyzw[CHAN_X], 0, CHAN_X);
3156      /* Restore ContMask, but don't pop */
3157      assert(mach->ContStackTop > 0);
3158      mach->ContMask = mach->ContStack[mach->ContStackTop - 1];
3159      UPDATE_EXEC_MASK(mach);
3160      if (mach->ExecMask) {
3161         /* repeat loop: jump to instruction just past BGNLOOP */
3162         assert(mach->LoopLabelStackTop > 0);
3163         *pc = mach->LoopLabelStack[mach->LoopLabelStackTop - 1] + 1;
3164      }
3165      else {
3166         /* exit loop: pop LoopMask */
3167         assert(mach->LoopStackTop > 0);
3168         mach->LoopMask = mach->LoopStack[--mach->LoopStackTop];
3169         /* pop ContMask */
3170         assert(mach->ContStackTop > 0);
3171         mach->ContMask = mach->ContStack[--mach->ContStackTop];
3172         assert(mach->LoopLabelStackTop > 0);
3173         --mach->LoopLabelStackTop;
3174         assert(mach->LoopCounterStackTop > 0);
3175         --mach->LoopCounterStackTop;
3176      }
3177      UPDATE_EXEC_MASK(mach);
3178      break;
3179
3180   case TGSI_OPCODE_ENDLOOP:
3181      /* Restore ContMask, but don't pop */
3182      assert(mach->ContStackTop > 0);
3183      mach->ContMask = mach->ContStack[mach->ContStackTop - 1];
3184      UPDATE_EXEC_MASK(mach);
3185      if (mach->ExecMask) {
3186         /* repeat loop: jump to instruction just past BGNLOOP */
3187         assert(mach->LoopLabelStackTop > 0);
3188         *pc = mach->LoopLabelStack[mach->LoopLabelStackTop - 1] + 1;
3189      }
3190      else {
3191         /* exit loop: pop LoopMask */
3192         assert(mach->LoopStackTop > 0);
3193         mach->LoopMask = mach->LoopStack[--mach->LoopStackTop];
3194         /* pop ContMask */
3195         assert(mach->ContStackTop > 0);
3196         mach->ContMask = mach->ContStack[--mach->ContStackTop];
3197         assert(mach->LoopLabelStackTop > 0);
3198         --mach->LoopLabelStackTop;
3199      }
3200      UPDATE_EXEC_MASK(mach);
3201      break;
3202
3203   case TGSI_OPCODE_BRK:
3204      /* turn off loop channels for each enabled exec channel */
3205      mach->LoopMask &= ~mach->ExecMask;
3206      /* Todo: if mach->LoopMask == 0, jump to end of loop */
3207      UPDATE_EXEC_MASK(mach);
3208      break;
3209
3210   case TGSI_OPCODE_CONT:
3211      /* turn off cont channels for each enabled exec channel */
3212      mach->ContMask &= ~mach->ExecMask;
3213      /* Todo: if mach->LoopMask == 0, jump to end of loop */
3214      UPDATE_EXEC_MASK(mach);
3215      break;
3216
3217   case TGSI_OPCODE_BGNSUB:
3218      /* no-op */
3219      break;
3220
3221   case TGSI_OPCODE_ENDSUB:
3222      /*
3223       * XXX: This really should be a no-op. We should never reach this opcode.
3224       */
3225
3226      assert(mach->CallStackTop > 0);
3227      mach->CallStackTop--;
3228
3229      mach->CondStackTop = mach->CallStack[mach->CallStackTop].CondStackTop;
3230      mach->CondMask = mach->CondStack[mach->CondStackTop];
3231
3232      mach->LoopStackTop = mach->CallStack[mach->CallStackTop].LoopStackTop;
3233      mach->LoopMask = mach->LoopStack[mach->LoopStackTop];
3234
3235      mach->ContStackTop = mach->CallStack[mach->CallStackTop].ContStackTop;
3236      mach->ContMask = mach->ContStack[mach->ContStackTop];
3237
3238      assert(mach->FuncStackTop > 0);
3239      mach->FuncMask = mach->FuncStack[--mach->FuncStackTop];
3240
3241      *pc = mach->CallStack[mach->CallStackTop].ReturnAddr;
3242
3243      UPDATE_EXEC_MASK(mach);
3244      break;
3245
3246   case TGSI_OPCODE_NOP:
3247      break;
3248
3249   default:
3250      assert( 0 );
3251   }
3252}
3253
3254#define DEBUG_EXECUTION 0
3255
3256
3257/**
3258 * Run TGSI interpreter.
3259 * \return bitmask of "alive" quad components
3260 */
3261uint
3262tgsi_exec_machine_run( struct tgsi_exec_machine *mach )
3263{
3264   uint i;
3265   int pc = 0;
3266
3267   mach->CondMask = 0xf;
3268   mach->LoopMask = 0xf;
3269   mach->ContMask = 0xf;
3270   mach->FuncMask = 0xf;
3271   mach->ExecMask = 0xf;
3272
3273   assert(mach->CondStackTop == 0);
3274   assert(mach->LoopStackTop == 0);
3275   assert(mach->ContStackTop == 0);
3276   assert(mach->CallStackTop == 0);
3277
3278   mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] = 0;
3279   mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] = 0;
3280
3281   if( mach->Processor == TGSI_PROCESSOR_GEOMETRY ) {
3282      mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0] = 0;
3283      mach->Primitives[0] = 0;
3284   }
3285
3286   for (i = 0; i < QUAD_SIZE; i++) {
3287      mach->Temps[TEMP_CC_I].xyzw[TEMP_CC_C].u[i] =
3288         (TGSI_EXEC_CC_EQ << TGSI_EXEC_CC_X_SHIFT) |
3289         (TGSI_EXEC_CC_EQ << TGSI_EXEC_CC_Y_SHIFT) |
3290         (TGSI_EXEC_CC_EQ << TGSI_EXEC_CC_Z_SHIFT) |
3291         (TGSI_EXEC_CC_EQ << TGSI_EXEC_CC_W_SHIFT);
3292   }
3293
3294   /* execute declarations (interpolants) */
3295   for (i = 0; i < mach->NumDeclarations; i++) {
3296      exec_declaration( mach, mach->Declarations+i );
3297   }
3298
3299   {
3300#if DEBUG_EXECUTION
3301      struct tgsi_exec_vector temps[TGSI_EXEC_NUM_TEMPS + TGSI_EXEC_NUM_TEMP_EXTRAS];
3302      struct tgsi_exec_vector outputs[PIPE_MAX_ATTRIBS];
3303      uint inst = 1;
3304
3305      memcpy(temps, mach->Temps, sizeof(temps));
3306      memcpy(outputs, mach->Outputs, sizeof(outputs));
3307#endif
3308
3309      /* execute instructions, until pc is set to -1 */
3310      while (pc != -1) {
3311
3312#if DEBUG_EXECUTION
3313         uint i;
3314
3315         tgsi_dump_instruction(&mach->Instructions[pc], inst++);
3316#endif
3317
3318         assert(pc < (int) mach->NumInstructions);
3319         exec_instruction(mach, mach->Instructions + pc, &pc);
3320
3321#if DEBUG_EXECUTION
3322         for (i = 0; i < TGSI_EXEC_NUM_TEMPS + TGSI_EXEC_NUM_TEMP_EXTRAS; i++) {
3323            if (memcmp(&temps[i], &mach->Temps[i], sizeof(temps[i]))) {
3324               uint j;
3325
3326               memcpy(&temps[i], &mach->Temps[i], sizeof(temps[i]));
3327               debug_printf("TEMP[%2u] = ", i);
3328               for (j = 0; j < 4; j++) {
3329                  if (j > 0) {
3330                     debug_printf("           ");
3331                  }
3332                  debug_printf("(%6f, %6f, %6f, %6f)\n",
3333                               temps[i].xyzw[0].f[j],
3334                               temps[i].xyzw[1].f[j],
3335                               temps[i].xyzw[2].f[j],
3336                               temps[i].xyzw[3].f[j]);
3337               }
3338            }
3339         }
3340         for (i = 0; i < PIPE_MAX_ATTRIBS; i++) {
3341            if (memcmp(&outputs[i], &mach->Outputs[i], sizeof(outputs[i]))) {
3342               uint j;
3343
3344               memcpy(&outputs[i], &mach->Outputs[i], sizeof(outputs[i]));
3345               debug_printf("OUT[%2u] =  ", i);
3346               for (j = 0; j < 4; j++) {
3347                  if (j > 0) {
3348                     debug_printf("           ");
3349                  }
3350                  debug_printf("{%6f, %6f, %6f, %6f}\n",
3351                               outputs[i].xyzw[0].f[j],
3352                               outputs[i].xyzw[1].f[j],
3353                               outputs[i].xyzw[2].f[j],
3354                               outputs[i].xyzw[3].f[j]);
3355               }
3356            }
3357         }
3358#endif
3359      }
3360   }
3361
3362#if 0
3363   /* we scale from floats in [0,1] to Zbuffer ints in sp_quad_depth_test.c */
3364   if (mach->Processor == TGSI_PROCESSOR_FRAGMENT) {
3365      /*
3366       * Scale back depth component.
3367       */
3368      for (i = 0; i < 4; i++)
3369         mach->Outputs[0].xyzw[2].f[i] *= ctx->DrawBuffer->_DepthMaxF;
3370   }
3371#endif
3372
3373   assert(mach->CondStackTop == 0);
3374   assert(mach->LoopStackTop == 0);
3375   assert(mach->ContStackTop == 0);
3376   assert(mach->CallStackTop == 0);
3377
3378   return ~mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
3379}
3380