tgsi_exec.c revision cde758a2b50da8d7a8db5467f5629ce366380c41
1/**************************************************************************
2 *
3 * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28/**
29 * TGSI interpreter/executor.
30 *
31 * Flow control information:
32 *
33 * Since we operate on 'quads' (4 pixels or 4 vertices in parallel)
34 * flow control statements (IF/ELSE/ENDIF, LOOP/ENDLOOP) require special
35 * care since a condition may be true for some quad components but false
36 * for other components.
37 *
38 * We basically execute all statements (even if they're in the part of
39 * an IF/ELSE clause that's "not taken") and use a special mask to
40 * control writing to destination registers.  This is the ExecMask.
41 * See store_dest().
42 *
 * The ExecMask is computed by ANDing four other masks (CondMask, LoopMask,
 * ContMask and FuncMask; see UPDATE_EXEC_MASK).  The first three are
 * controlled by the flow control instructions (namely: IF/ELSE/ENDIF,
 * LOOP/ENDLOOP and CONT).
46 *
47 *
48 * Authors:
49 *   Michal Krol
50 *   Brian Paul
51 */
52
53#include "pipe/p_compiler.h"
54#include "pipe/p_state.h"
55#include "pipe/p_shader_tokens.h"
56#include "tgsi/tgsi_dump.h"
57#include "tgsi/tgsi_parse.h"
58#include "tgsi/tgsi_util.h"
59#include "tgsi_exec.h"
60#include "util/u_memory.h"
61#include "util/u_math.h"
62
/*
 * When non-zero, micro_exp2(), micro_lg2() and micro_pow() use the
 * util_fast_* helpers instead of the libm calls in their #else branches.
 */
#define FAST_MATH 1

/* Lane indices within a quad; used by micro_ddx() and micro_ddy(). */
#define TILE_TOP_LEFT     0
#define TILE_TOP_RIGHT    1
#define TILE_BOTTOM_LEFT  2
#define TILE_BOTTOM_RIGHT 3

/* Channel (x/y/z/w) indices. */
#define CHAN_X  0
#define CHAN_Y  1
#define CHAN_Z  2
#define CHAN_W  3
74
/*
 * Shorthand locations of various utility registers (_I = Index, _C = Channel)
 *
 * Each name is a plain alias for the corresponding TGSI_EXEC_TEMP_* constant.
 * tgsi_exec_machine_create() stores constant values (0, 0x7FFFFFFF,
 * 0x80000000, 0xFFFFFFFF, 1, 2, 128, -128, 3 and 0.5) into the matching
 * mach->Temps[..._I].xyzw[..._C] locations.
 */
#define TEMP_0_I           TGSI_EXEC_TEMP_00000000_I
#define TEMP_0_C           TGSI_EXEC_TEMP_00000000_C
#define TEMP_7F_I          TGSI_EXEC_TEMP_7FFFFFFF_I
#define TEMP_7F_C          TGSI_EXEC_TEMP_7FFFFFFF_C
#define TEMP_80_I          TGSI_EXEC_TEMP_80000000_I
#define TEMP_80_C          TGSI_EXEC_TEMP_80000000_C
#define TEMP_FF_I          TGSI_EXEC_TEMP_FFFFFFFF_I
#define TEMP_FF_C          TGSI_EXEC_TEMP_FFFFFFFF_C
#define TEMP_1_I           TGSI_EXEC_TEMP_ONE_I
#define TEMP_1_C           TGSI_EXEC_TEMP_ONE_C
#define TEMP_2_I           TGSI_EXEC_TEMP_TWO_I
#define TEMP_2_C           TGSI_EXEC_TEMP_TWO_C
#define TEMP_128_I         TGSI_EXEC_TEMP_128_I
#define TEMP_128_C         TGSI_EXEC_TEMP_128_C
#define TEMP_M128_I        TGSI_EXEC_TEMP_MINUS_128_I
#define TEMP_M128_C        TGSI_EXEC_TEMP_MINUS_128_C
#define TEMP_KILMASK_I     TGSI_EXEC_TEMP_KILMASK_I
#define TEMP_KILMASK_C     TGSI_EXEC_TEMP_KILMASK_C
#define TEMP_OUTPUT_I      TGSI_EXEC_TEMP_OUTPUT_I
#define TEMP_OUTPUT_C      TGSI_EXEC_TEMP_OUTPUT_C
#define TEMP_PRIMITIVE_I   TGSI_EXEC_TEMP_PRIMITIVE_I
#define TEMP_PRIMITIVE_C   TGSI_EXEC_TEMP_PRIMITIVE_C
#define TEMP_CC_I          TGSI_EXEC_TEMP_CC_I
#define TEMP_CC_C          TGSI_EXEC_TEMP_CC_C
#define TEMP_3_I           TGSI_EXEC_TEMP_THREE_I
#define TEMP_3_C           TGSI_EXEC_TEMP_THREE_C
#define TEMP_HALF_I        TGSI_EXEC_TEMP_HALF_I
#define TEMP_HALF_C        TGSI_EXEC_TEMP_HALF_C
#define TEMP_R0            TGSI_EXEC_TEMP_R0
#define TEMP_P0            TGSI_EXEC_TEMP_P0
108
/**
 * Non-zero if bit CHAN is set in the write mask of INST's destination
 * register 0.  INST is a struct tgsi_full_instruction (not a pointer).
 */
#define IS_CHANNEL_ENABLED(INST, CHAN)\
   ((INST).FullDstRegisters[0].DstRegister.WriteMask & (1 << (CHAN)))

/** Same test as IS_CHANNEL_ENABLED, but for destination register 1. */
#define IS_CHANNEL_ENABLED2(INST, CHAN)\
   ((INST).FullDstRegisters[1].DstRegister.WriteMask & (1 << (CHAN)))

/**
 * Loop header: runs the following statement once for every channel index
 * CHAN in [0, NUM_CHANNELS) that is enabled in destination 0's write mask.
 * CHAN must be an assignable integer variable.
 * Note: expands to a 'for' followed by an unbraced 'if', so an 'else' placed
 * directly after the loop body would bind to the macro's 'if'.
 */
#define FOR_EACH_ENABLED_CHANNEL(INST, CHAN)\
   for (CHAN = 0; CHAN < NUM_CHANNELS; CHAN++)\
      if (IS_CHANNEL_ENABLED( INST, CHAN ))

/** Same as FOR_EACH_ENABLED_CHANNEL, but for destination register 1. */
#define FOR_EACH_ENABLED_CHANNEL2(INST, CHAN)\
   for (CHAN = 0; CHAN < NUM_CHANNELS; CHAN++)\
      if (IS_CHANNEL_ENABLED2( INST, CHAN ))
122
123
/**
 * Recompute the execution mask: ExecMask becomes the bitwise AND of
 * CondMask, LoopMask, ContMask and FuncMask.
 *
 * MACH is a pointer to the machine.  It is parenthesized at every use so
 * that expressions such as '&machine' or 'array + 1' expand correctly; note
 * that it is still evaluated several times, so it must have no side effects.
 */
#define UPDATE_EXEC_MASK(MACH) \
      ((MACH)->ExecMask = (MACH)->CondMask & (MACH)->LoopMask & \
                          (MACH)->ContMask & (MACH)->FuncMask)
127
128
/** A channel whose first union member is initialized to four zeros. */
static const union tgsi_exec_channel ZeroVec =
   { { 0.0, 0.0, 0.0, 0.0 } };
131
132
133#ifdef DEBUG
/**
 * Debug-only sanity check: asserts that none of the four float lanes of
 * \p chan is infinite or NaN (as reported by util_is_inf_or_nan()).
 */
static void
check_inf_or_nan(const union tgsi_exec_channel *chan)
{
   assert(!util_is_inf_or_nan(chan->f[0]));
   assert(!util_is_inf_or_nan(chan->f[1]));
   assert(!util_is_inf_or_nan(chan->f[2]));
   assert(!util_is_inf_or_nan(chan->f[3]));
}
142#endif
143
144
145#ifdef DEBUG
146static void
147print_chan(const char *msg, const union tgsi_exec_channel *chan)
148{
149   debug_printf("%s = {%f, %f, %f, %f}\n",
150                msg, chan->f[0], chan->f[1], chan->f[2], chan->f[3]);
151}
152#endif
153
154
155#ifdef DEBUG
156static void
157print_temp(const struct tgsi_exec_machine *mach, uint index)
158{
159   const struct tgsi_exec_vector *tmp = &mach->Temps[index];
160   int i;
161   debug_printf("Temp[%u] =\n", index);
162   for (i = 0; i < 4; i++) {
163      debug_printf("  %c: { %f, %f, %f, %f }\n",
164                   "XYZW"[i],
165                   tmp->xyzw[i].f[0],
166                   tmp->xyzw[i].f[1],
167                   tmp->xyzw[i].f[2],
168                   tmp->xyzw[i].f[3]);
169   }
170}
171#endif
172
173
174/**
175 * Check if there's a potential src/dst register data dependency when
176 * using SOA execution.
177 * Example:
178 *   MOV T, T.yxwz;
179 * This would expand into:
180 *   MOV t0, t1;
181 *   MOV t1, t0;
182 *   MOV t2, t3;
183 *   MOV t3, t2;
184 * The second instruction will have the wrong value for t0 if executed as-is.
185 */
186boolean
187tgsi_check_soa_dependencies(const struct tgsi_full_instruction *inst)
188{
189   uint i, chan;
190
191   uint writemask = inst->FullDstRegisters[0].DstRegister.WriteMask;
192   if (writemask == TGSI_WRITEMASK_X ||
193       writemask == TGSI_WRITEMASK_Y ||
194       writemask == TGSI_WRITEMASK_Z ||
195       writemask == TGSI_WRITEMASK_W ||
196       writemask == TGSI_WRITEMASK_NONE) {
197      /* no chance of data dependency */
198      return FALSE;
199   }
200
201   /* loop over src regs */
202   for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
203      if ((inst->FullSrcRegisters[i].SrcRegister.File ==
204           inst->FullDstRegisters[0].DstRegister.File) &&
205          (inst->FullSrcRegisters[i].SrcRegister.Index ==
206           inst->FullDstRegisters[0].DstRegister.Index)) {
207         /* loop over dest channels */
208         uint channelsWritten = 0x0;
209         FOR_EACH_ENABLED_CHANNEL(*inst, chan) {
210            /* check if we're reading a channel that's been written */
211            uint swizzle = tgsi_util_get_full_src_register_swizzle(&inst->FullSrcRegisters[i], chan);
212            if (channelsWritten & (1 << swizzle)) {
213               return TRUE;
214            }
215
216            channelsWritten |= (1 << chan);
217         }
218      }
219   }
220   return FALSE;
221}
222
223
224/**
225 * Initialize machine state by expanding tokens to full instructions,
226 * allocating temporary storage, setting up constants, etc.
227 * After this, we can call tgsi_exec_machine_run() many times.
228 */
229void
230tgsi_exec_machine_bind_shader(
231   struct tgsi_exec_machine *mach,
232   const struct tgsi_token *tokens,
233   uint numSamplers,
234   struct tgsi_sampler **samplers)
235{
236   uint k;
237   struct tgsi_parse_context parse;
238   struct tgsi_exec_labels *labels = &mach->Labels;
239   struct tgsi_full_instruction *instructions;
240   struct tgsi_full_declaration *declarations;
241   uint maxInstructions = 10, numInstructions = 0;
242   uint maxDeclarations = 10, numDeclarations = 0;
243   uint instno = 0;
244
245#if 0
246   tgsi_dump(tokens, 0);
247#endif
248
249   util_init_math();
250
251   mach->Tokens = tokens;
252   mach->Samplers = samplers;
253
254   k = tgsi_parse_init (&parse, mach->Tokens);
255   if (k != TGSI_PARSE_OK) {
256      debug_printf( "Problem parsing!\n" );
257      return;
258   }
259
260   mach->Processor = parse.FullHeader.Processor.Processor;
261   mach->ImmLimit = 0;
262   labels->count = 0;
263
264   declarations = (struct tgsi_full_declaration *)
265      MALLOC( maxDeclarations * sizeof(struct tgsi_full_declaration) );
266
267   if (!declarations) {
268      return;
269   }
270
271   instructions = (struct tgsi_full_instruction *)
272      MALLOC( maxInstructions * sizeof(struct tgsi_full_instruction) );
273
274   if (!instructions) {
275      FREE( declarations );
276      return;
277   }
278
279   while( !tgsi_parse_end_of_tokens( &parse ) ) {
280      uint pointer = parse.Position;
281      uint i;
282
283      tgsi_parse_token( &parse );
284      switch( parse.FullToken.Token.Type ) {
285      case TGSI_TOKEN_TYPE_DECLARATION:
286         /* save expanded declaration */
287         if (numDeclarations == maxDeclarations) {
288            declarations = REALLOC(declarations,
289                                   maxDeclarations
290                                   * sizeof(struct tgsi_full_declaration),
291                                   (maxDeclarations + 10)
292                                   * sizeof(struct tgsi_full_declaration));
293            maxDeclarations += 10;
294         }
295         memcpy(declarations + numDeclarations,
296                &parse.FullToken.FullDeclaration,
297                sizeof(declarations[0]));
298         numDeclarations++;
299         break;
300
301      case TGSI_TOKEN_TYPE_IMMEDIATE:
302         {
303            uint size = parse.FullToken.FullImmediate.Immediate.NrTokens - 1;
304            assert( size <= 4 );
305            assert( mach->ImmLimit + 1 <= TGSI_EXEC_NUM_IMMEDIATES );
306
307            for( i = 0; i < size; i++ ) {
308               mach->Imms[mach->ImmLimit][i] =
309		  parse.FullToken.FullImmediate.u[i].Float;
310            }
311            mach->ImmLimit += 1;
312         }
313         break;
314
315      case TGSI_TOKEN_TYPE_INSTRUCTION:
316         assert( labels->count < MAX_LABELS );
317
318         labels->labels[labels->count][0] = instno;
319         labels->labels[labels->count][1] = pointer;
320         labels->count++;
321
322         /* save expanded instruction */
323         if (numInstructions == maxInstructions) {
324            instructions = REALLOC(instructions,
325                                   maxInstructions
326                                   * sizeof(struct tgsi_full_instruction),
327                                   (maxInstructions + 10)
328                                   * sizeof(struct tgsi_full_instruction));
329            maxInstructions += 10;
330         }
331
332         memcpy(instructions + numInstructions,
333                &parse.FullToken.FullInstruction,
334                sizeof(instructions[0]));
335
336         numInstructions++;
337         break;
338
339      default:
340         assert( 0 );
341      }
342   }
343   tgsi_parse_free (&parse);
344
345   if (mach->Declarations) {
346      FREE( mach->Declarations );
347   }
348   mach->Declarations = declarations;
349   mach->NumDeclarations = numDeclarations;
350
351   if (mach->Instructions) {
352      FREE( mach->Instructions );
353   }
354   mach->Instructions = instructions;
355   mach->NumInstructions = numInstructions;
356}
357
358
359struct tgsi_exec_machine *
360tgsi_exec_machine_create( void )
361{
362   struct tgsi_exec_machine *mach;
363   uint i;
364
365   mach = align_malloc( sizeof *mach, 16 );
366   if (!mach)
367      goto fail;
368
369   memset(mach, 0, sizeof(*mach));
370
371   mach->Addrs = &mach->Temps[TGSI_EXEC_TEMP_ADDR];
372
373   /* Setup constants. */
374   for( i = 0; i < 4; i++ ) {
375      mach->Temps[TEMP_0_I].xyzw[TEMP_0_C].u[i] = 0x00000000;
376      mach->Temps[TEMP_7F_I].xyzw[TEMP_7F_C].u[i] = 0x7FFFFFFF;
377      mach->Temps[TEMP_80_I].xyzw[TEMP_80_C].u[i] = 0x80000000;
378      mach->Temps[TEMP_FF_I].xyzw[TEMP_FF_C].u[i] = 0xFFFFFFFF;
379      mach->Temps[TEMP_1_I].xyzw[TEMP_1_C].f[i] = 1.0f;
380      mach->Temps[TEMP_2_I].xyzw[TEMP_2_C].f[i] = 2.0f;
381      mach->Temps[TEMP_128_I].xyzw[TEMP_128_C].f[i] = 128.0f;
382      mach->Temps[TEMP_M128_I].xyzw[TEMP_M128_C].f[i] = -128.0f;
383      mach->Temps[TEMP_3_I].xyzw[TEMP_3_C].f[i] = 3.0f;
384      mach->Temps[TEMP_HALF_I].xyzw[TEMP_HALF_C].f[i] = 0.5f;
385   }
386
387#ifdef DEBUG
388   /* silence warnings */
389   (void) print_chan;
390   (void) print_temp;
391#endif
392
393   return mach;
394
395fail:
396   align_free(mach);
397   return NULL;
398}
399
400
401void
402tgsi_exec_machine_destroy(struct tgsi_exec_machine *mach)
403{
404   if (mach) {
405      FREE(mach->Instructions);
406      FREE(mach->Declarations);
407   }
408
409   align_free(mach);
410}
411
412
413static void
414micro_abs(
415   union tgsi_exec_channel *dst,
416   const union tgsi_exec_channel *src )
417{
418   dst->f[0] = fabsf( src->f[0] );
419   dst->f[1] = fabsf( src->f[1] );
420   dst->f[2] = fabsf( src->f[2] );
421   dst->f[3] = fabsf( src->f[3] );
422}
423
424static void
425micro_add(
426   union tgsi_exec_channel *dst,
427   const union tgsi_exec_channel *src0,
428   const union tgsi_exec_channel *src1 )
429{
430   dst->f[0] = src0->f[0] + src1->f[0];
431   dst->f[1] = src0->f[1] + src1->f[1];
432   dst->f[2] = src0->f[2] + src1->f[2];
433   dst->f[3] = src0->f[3] + src1->f[3];
434}
435
436#if 0
437static void
438micro_iadd(
439   union tgsi_exec_channel *dst,
440   const union tgsi_exec_channel *src0,
441   const union tgsi_exec_channel *src1 )
442{
443   dst->i[0] = src0->i[0] + src1->i[0];
444   dst->i[1] = src0->i[1] + src1->i[1];
445   dst->i[2] = src0->i[2] + src1->i[2];
446   dst->i[3] = src0->i[3] + src1->i[3];
447}
448#endif
449
450static void
451micro_and(
452   union tgsi_exec_channel *dst,
453   const union tgsi_exec_channel *src0,
454   const union tgsi_exec_channel *src1 )
455{
456   dst->u[0] = src0->u[0] & src1->u[0];
457   dst->u[1] = src0->u[1] & src1->u[1];
458   dst->u[2] = src0->u[2] & src1->u[2];
459   dst->u[3] = src0->u[3] & src1->u[3];
460}
461
462static void
463micro_ceil(
464   union tgsi_exec_channel *dst,
465   const union tgsi_exec_channel *src )
466{
467   dst->f[0] = ceilf( src->f[0] );
468   dst->f[1] = ceilf( src->f[1] );
469   dst->f[2] = ceilf( src->f[2] );
470   dst->f[3] = ceilf( src->f[3] );
471}
472
473static void
474micro_cos(
475   union tgsi_exec_channel *dst,
476   const union tgsi_exec_channel *src )
477{
478   dst->f[0] = cosf( src->f[0] );
479   dst->f[1] = cosf( src->f[1] );
480   dst->f[2] = cosf( src->f[2] );
481   dst->f[3] = cosf( src->f[3] );
482}
483
484static void
485micro_ddx(
486   union tgsi_exec_channel *dst,
487   const union tgsi_exec_channel *src )
488{
489   dst->f[0] =
490   dst->f[1] =
491   dst->f[2] =
492   dst->f[3] = src->f[TILE_BOTTOM_RIGHT] - src->f[TILE_BOTTOM_LEFT];
493}
494
495static void
496micro_ddy(
497   union tgsi_exec_channel *dst,
498   const union tgsi_exec_channel *src )
499{
500   dst->f[0] =
501   dst->f[1] =
502   dst->f[2] =
503   dst->f[3] = src->f[TILE_TOP_LEFT] - src->f[TILE_BOTTOM_LEFT];
504}
505
506static void
507micro_div(
508   union tgsi_exec_channel *dst,
509   const union tgsi_exec_channel *src0,
510   const union tgsi_exec_channel *src1 )
511{
512   if (src1->f[0] != 0) {
513      dst->f[0] = src0->f[0] / src1->f[0];
514   }
515   if (src1->f[1] != 0) {
516      dst->f[1] = src0->f[1] / src1->f[1];
517   }
518   if (src1->f[2] != 0) {
519      dst->f[2] = src0->f[2] / src1->f[2];
520   }
521   if (src1->f[3] != 0) {
522      dst->f[3] = src0->f[3] / src1->f[3];
523   }
524}
525
526#if 0
527static void
528micro_udiv(
529   union tgsi_exec_channel *dst,
530   const union tgsi_exec_channel *src0,
531   const union tgsi_exec_channel *src1 )
532{
533   dst->u[0] = src0->u[0] / src1->u[0];
534   dst->u[1] = src0->u[1] / src1->u[1];
535   dst->u[2] = src0->u[2] / src1->u[2];
536   dst->u[3] = src0->u[3] / src1->u[3];
537}
538#endif
539
540static void
541micro_eq(
542   union tgsi_exec_channel *dst,
543   const union tgsi_exec_channel *src0,
544   const union tgsi_exec_channel *src1,
545   const union tgsi_exec_channel *src2,
546   const union tgsi_exec_channel *src3 )
547{
548   dst->f[0] = src0->f[0] == src1->f[0] ? src2->f[0] : src3->f[0];
549   dst->f[1] = src0->f[1] == src1->f[1] ? src2->f[1] : src3->f[1];
550   dst->f[2] = src0->f[2] == src1->f[2] ? src2->f[2] : src3->f[2];
551   dst->f[3] = src0->f[3] == src1->f[3] ? src2->f[3] : src3->f[3];
552}
553
554#if 0
555static void
556micro_ieq(
557   union tgsi_exec_channel *dst,
558   const union tgsi_exec_channel *src0,
559   const union tgsi_exec_channel *src1,
560   const union tgsi_exec_channel *src2,
561   const union tgsi_exec_channel *src3 )
562{
563   dst->i[0] = src0->i[0] == src1->i[0] ? src2->i[0] : src3->i[0];
564   dst->i[1] = src0->i[1] == src1->i[1] ? src2->i[1] : src3->i[1];
565   dst->i[2] = src0->i[2] == src1->i[2] ? src2->i[2] : src3->i[2];
566   dst->i[3] = src0->i[3] == src1->i[3] ? src2->i[3] : src3->i[3];
567}
568#endif
569
570static void
571micro_exp2(
572   union tgsi_exec_channel *dst,
573   const union tgsi_exec_channel *src)
574{
575#if FAST_MATH
576   dst->f[0] = util_fast_exp2( src->f[0] );
577   dst->f[1] = util_fast_exp2( src->f[1] );
578   dst->f[2] = util_fast_exp2( src->f[2] );
579   dst->f[3] = util_fast_exp2( src->f[3] );
580#else
581   dst->f[0] = powf( 2.0f, src->f[0] );
582   dst->f[1] = powf( 2.0f, src->f[1] );
583   dst->f[2] = powf( 2.0f, src->f[2] );
584   dst->f[3] = powf( 2.0f, src->f[3] );
585#endif
586}
587
588#if 0
589static void
590micro_f2ut(
591   union tgsi_exec_channel *dst,
592   const union tgsi_exec_channel *src )
593{
594   dst->u[0] = (uint) src->f[0];
595   dst->u[1] = (uint) src->f[1];
596   dst->u[2] = (uint) src->f[2];
597   dst->u[3] = (uint) src->f[3];
598}
599#endif
600
601static void
602micro_float_clamp(union tgsi_exec_channel *dst,
603                  const union tgsi_exec_channel *src)
604{
605   uint i;
606
607   for (i = 0; i < 4; i++) {
608      if (src->f[i] > 0.0f) {
609         if (src->f[i] > 1.884467e+019f)
610            dst->f[i] = 1.884467e+019f;
611         else if (src->f[i] < 5.42101e-020f)
612            dst->f[i] = 5.42101e-020f;
613         else
614            dst->f[i] = src->f[i];
615      }
616      else {
617         if (src->f[i] < -1.884467e+019f)
618            dst->f[i] = -1.884467e+019f;
619         else if (src->f[i] > -5.42101e-020f)
620            dst->f[i] = -5.42101e-020f;
621         else
622            dst->f[i] = src->f[i];
623      }
624   }
625}
626
627static void
628micro_flr(
629   union tgsi_exec_channel *dst,
630   const union tgsi_exec_channel *src )
631{
632   dst->f[0] = floorf( src->f[0] );
633   dst->f[1] = floorf( src->f[1] );
634   dst->f[2] = floorf( src->f[2] );
635   dst->f[3] = floorf( src->f[3] );
636}
637
638static void
639micro_frc(
640   union tgsi_exec_channel *dst,
641   const union tgsi_exec_channel *src )
642{
643   dst->f[0] = src->f[0] - floorf( src->f[0] );
644   dst->f[1] = src->f[1] - floorf( src->f[1] );
645   dst->f[2] = src->f[2] - floorf( src->f[2] );
646   dst->f[3] = src->f[3] - floorf( src->f[3] );
647}
648
649static void
650micro_i2f(
651   union tgsi_exec_channel *dst,
652   const union tgsi_exec_channel *src )
653{
654   dst->f[0] = (float) src->i[0];
655   dst->f[1] = (float) src->i[1];
656   dst->f[2] = (float) src->i[2];
657   dst->f[3] = (float) src->i[3];
658}
659
660static void
661micro_lg2(
662   union tgsi_exec_channel *dst,
663   const union tgsi_exec_channel *src )
664{
665#if FAST_MATH
666   dst->f[0] = util_fast_log2( src->f[0] );
667   dst->f[1] = util_fast_log2( src->f[1] );
668   dst->f[2] = util_fast_log2( src->f[2] );
669   dst->f[3] = util_fast_log2( src->f[3] );
670#else
671   dst->f[0] = logf( src->f[0] ) * 1.442695f;
672   dst->f[1] = logf( src->f[1] ) * 1.442695f;
673   dst->f[2] = logf( src->f[2] ) * 1.442695f;
674   dst->f[3] = logf( src->f[3] ) * 1.442695f;
675#endif
676}
677
678static void
679micro_le(
680   union tgsi_exec_channel *dst,
681   const union tgsi_exec_channel *src0,
682   const union tgsi_exec_channel *src1,
683   const union tgsi_exec_channel *src2,
684   const union tgsi_exec_channel *src3 )
685{
686   dst->f[0] = src0->f[0] <= src1->f[0] ? src2->f[0] : src3->f[0];
687   dst->f[1] = src0->f[1] <= src1->f[1] ? src2->f[1] : src3->f[1];
688   dst->f[2] = src0->f[2] <= src1->f[2] ? src2->f[2] : src3->f[2];
689   dst->f[3] = src0->f[3] <= src1->f[3] ? src2->f[3] : src3->f[3];
690}
691
692static void
693micro_lt(
694   union tgsi_exec_channel *dst,
695   const union tgsi_exec_channel *src0,
696   const union tgsi_exec_channel *src1,
697   const union tgsi_exec_channel *src2,
698   const union tgsi_exec_channel *src3 )
699{
700   dst->f[0] = src0->f[0] < src1->f[0] ? src2->f[0] : src3->f[0];
701   dst->f[1] = src0->f[1] < src1->f[1] ? src2->f[1] : src3->f[1];
702   dst->f[2] = src0->f[2] < src1->f[2] ? src2->f[2] : src3->f[2];
703   dst->f[3] = src0->f[3] < src1->f[3] ? src2->f[3] : src3->f[3];
704}
705
706#if 0
707static void
708micro_ilt(
709   union tgsi_exec_channel *dst,
710   const union tgsi_exec_channel *src0,
711   const union tgsi_exec_channel *src1,
712   const union tgsi_exec_channel *src2,
713   const union tgsi_exec_channel *src3 )
714{
715   dst->i[0] = src0->i[0] < src1->i[0] ? src2->i[0] : src3->i[0];
716   dst->i[1] = src0->i[1] < src1->i[1] ? src2->i[1] : src3->i[1];
717   dst->i[2] = src0->i[2] < src1->i[2] ? src2->i[2] : src3->i[2];
718   dst->i[3] = src0->i[3] < src1->i[3] ? src2->i[3] : src3->i[3];
719}
720#endif
721
722#if 0
723static void
724micro_ult(
725   union tgsi_exec_channel *dst,
726   const union tgsi_exec_channel *src0,
727   const union tgsi_exec_channel *src1,
728   const union tgsi_exec_channel *src2,
729   const union tgsi_exec_channel *src3 )
730{
731   dst->u[0] = src0->u[0] < src1->u[0] ? src2->u[0] : src3->u[0];
732   dst->u[1] = src0->u[1] < src1->u[1] ? src2->u[1] : src3->u[1];
733   dst->u[2] = src0->u[2] < src1->u[2] ? src2->u[2] : src3->u[2];
734   dst->u[3] = src0->u[3] < src1->u[3] ? src2->u[3] : src3->u[3];
735}
736#endif
737
738static void
739micro_max(
740   union tgsi_exec_channel *dst,
741   const union tgsi_exec_channel *src0,
742   const union tgsi_exec_channel *src1 )
743{
744   dst->f[0] = src0->f[0] > src1->f[0] ? src0->f[0] : src1->f[0];
745   dst->f[1] = src0->f[1] > src1->f[1] ? src0->f[1] : src1->f[1];
746   dst->f[2] = src0->f[2] > src1->f[2] ? src0->f[2] : src1->f[2];
747   dst->f[3] = src0->f[3] > src1->f[3] ? src0->f[3] : src1->f[3];
748}
749
750#if 0
751static void
752micro_imax(
753   union tgsi_exec_channel *dst,
754   const union tgsi_exec_channel *src0,
755   const union tgsi_exec_channel *src1 )
756{
757   dst->i[0] = src0->i[0] > src1->i[0] ? src0->i[0] : src1->i[0];
758   dst->i[1] = src0->i[1] > src1->i[1] ? src0->i[1] : src1->i[1];
759   dst->i[2] = src0->i[2] > src1->i[2] ? src0->i[2] : src1->i[2];
760   dst->i[3] = src0->i[3] > src1->i[3] ? src0->i[3] : src1->i[3];
761}
762#endif
763
764#if 0
765static void
766micro_umax(
767   union tgsi_exec_channel *dst,
768   const union tgsi_exec_channel *src0,
769   const union tgsi_exec_channel *src1 )
770{
771   dst->u[0] = src0->u[0] > src1->u[0] ? src0->u[0] : src1->u[0];
772   dst->u[1] = src0->u[1] > src1->u[1] ? src0->u[1] : src1->u[1];
773   dst->u[2] = src0->u[2] > src1->u[2] ? src0->u[2] : src1->u[2];
774   dst->u[3] = src0->u[3] > src1->u[3] ? src0->u[3] : src1->u[3];
775}
776#endif
777
778static void
779micro_min(
780   union tgsi_exec_channel *dst,
781   const union tgsi_exec_channel *src0,
782   const union tgsi_exec_channel *src1 )
783{
784   dst->f[0] = src0->f[0] < src1->f[0] ? src0->f[0] : src1->f[0];
785   dst->f[1] = src0->f[1] < src1->f[1] ? src0->f[1] : src1->f[1];
786   dst->f[2] = src0->f[2] < src1->f[2] ? src0->f[2] : src1->f[2];
787   dst->f[3] = src0->f[3] < src1->f[3] ? src0->f[3] : src1->f[3];
788}
789
790#if 0
791static void
792micro_imin(
793   union tgsi_exec_channel *dst,
794   const union tgsi_exec_channel *src0,
795   const union tgsi_exec_channel *src1 )
796{
797   dst->i[0] = src0->i[0] < src1->i[0] ? src0->i[0] : src1->i[0];
798   dst->i[1] = src0->i[1] < src1->i[1] ? src0->i[1] : src1->i[1];
799   dst->i[2] = src0->i[2] < src1->i[2] ? src0->i[2] : src1->i[2];
800   dst->i[3] = src0->i[3] < src1->i[3] ? src0->i[3] : src1->i[3];
801}
802#endif
803
804#if 0
805static void
806micro_umin(
807   union tgsi_exec_channel *dst,
808   const union tgsi_exec_channel *src0,
809   const union tgsi_exec_channel *src1 )
810{
811   dst->u[0] = src0->u[0] < src1->u[0] ? src0->u[0] : src1->u[0];
812   dst->u[1] = src0->u[1] < src1->u[1] ? src0->u[1] : src1->u[1];
813   dst->u[2] = src0->u[2] < src1->u[2] ? src0->u[2] : src1->u[2];
814   dst->u[3] = src0->u[3] < src1->u[3] ? src0->u[3] : src1->u[3];
815}
816#endif
817
818#if 0
819static void
820micro_umod(
821   union tgsi_exec_channel *dst,
822   const union tgsi_exec_channel *src0,
823   const union tgsi_exec_channel *src1 )
824{
825   dst->u[0] = src0->u[0] % src1->u[0];
826   dst->u[1] = src0->u[1] % src1->u[1];
827   dst->u[2] = src0->u[2] % src1->u[2];
828   dst->u[3] = src0->u[3] % src1->u[3];
829}
830#endif
831
832static void
833micro_mul(
834   union tgsi_exec_channel *dst,
835   const union tgsi_exec_channel *src0,
836   const union tgsi_exec_channel *src1 )
837{
838   dst->f[0] = src0->f[0] * src1->f[0];
839   dst->f[1] = src0->f[1] * src1->f[1];
840   dst->f[2] = src0->f[2] * src1->f[2];
841   dst->f[3] = src0->f[3] * src1->f[3];
842}
843
844#if 0
845static void
846micro_imul(
847   union tgsi_exec_channel *dst,
848   const union tgsi_exec_channel *src0,
849   const union tgsi_exec_channel *src1 )
850{
851   dst->i[0] = src0->i[0] * src1->i[0];
852   dst->i[1] = src0->i[1] * src1->i[1];
853   dst->i[2] = src0->i[2] * src1->i[2];
854   dst->i[3] = src0->i[3] * src1->i[3];
855}
856#endif
857
858#if 0
859static void
860micro_imul64(
861   union tgsi_exec_channel *dst0,
862   union tgsi_exec_channel *dst1,
863   const union tgsi_exec_channel *src0,
864   const union tgsi_exec_channel *src1 )
865{
866   dst1->i[0] = src0->i[0] * src1->i[0];
867   dst1->i[1] = src0->i[1] * src1->i[1];
868   dst1->i[2] = src0->i[2] * src1->i[2];
869   dst1->i[3] = src0->i[3] * src1->i[3];
870   dst0->i[0] = 0;
871   dst0->i[1] = 0;
872   dst0->i[2] = 0;
873   dst0->i[3] = 0;
874}
875#endif
876
877#if 0
878static void
879micro_umul64(
880   union tgsi_exec_channel *dst0,
881   union tgsi_exec_channel *dst1,
882   const union tgsi_exec_channel *src0,
883   const union tgsi_exec_channel *src1 )
884{
885   dst1->u[0] = src0->u[0] * src1->u[0];
886   dst1->u[1] = src0->u[1] * src1->u[1];
887   dst1->u[2] = src0->u[2] * src1->u[2];
888   dst1->u[3] = src0->u[3] * src1->u[3];
889   dst0->u[0] = 0;
890   dst0->u[1] = 0;
891   dst0->u[2] = 0;
892   dst0->u[3] = 0;
893}
894#endif
895
896
897#if 0
898static void
899micro_movc(
900   union tgsi_exec_channel *dst,
901   const union tgsi_exec_channel *src0,
902   const union tgsi_exec_channel *src1,
903   const union tgsi_exec_channel *src2 )
904{
905   dst->u[0] = src0->u[0] ? src1->u[0] : src2->u[0];
906   dst->u[1] = src0->u[1] ? src1->u[1] : src2->u[1];
907   dst->u[2] = src0->u[2] ? src1->u[2] : src2->u[2];
908   dst->u[3] = src0->u[3] ? src1->u[3] : src2->u[3];
909}
910#endif
911
912static void
913micro_neg(
914   union tgsi_exec_channel *dst,
915   const union tgsi_exec_channel *src )
916{
917   dst->f[0] = -src->f[0];
918   dst->f[1] = -src->f[1];
919   dst->f[2] = -src->f[2];
920   dst->f[3] = -src->f[3];
921}
922
923#if 0
924static void
925micro_ineg(
926   union tgsi_exec_channel *dst,
927   const union tgsi_exec_channel *src )
928{
929   dst->i[0] = -src->i[0];
930   dst->i[1] = -src->i[1];
931   dst->i[2] = -src->i[2];
932   dst->i[3] = -src->i[3];
933}
934#endif
935
936static void
937micro_not(
938   union tgsi_exec_channel *dst,
939   const union tgsi_exec_channel *src )
940{
941   dst->u[0] = ~src->u[0];
942   dst->u[1] = ~src->u[1];
943   dst->u[2] = ~src->u[2];
944   dst->u[3] = ~src->u[3];
945}
946
947static void
948micro_or(
949   union tgsi_exec_channel *dst,
950   const union tgsi_exec_channel *src0,
951   const union tgsi_exec_channel *src1 )
952{
953   dst->u[0] = src0->u[0] | src1->u[0];
954   dst->u[1] = src0->u[1] | src1->u[1];
955   dst->u[2] = src0->u[2] | src1->u[2];
956   dst->u[3] = src0->u[3] | src1->u[3];
957}
958
959static void
960micro_pow(
961   union tgsi_exec_channel *dst,
962   const union tgsi_exec_channel *src0,
963   const union tgsi_exec_channel *src1 )
964{
965#if FAST_MATH
966   dst->f[0] = util_fast_pow( src0->f[0], src1->f[0] );
967   dst->f[1] = util_fast_pow( src0->f[1], src1->f[1] );
968   dst->f[2] = util_fast_pow( src0->f[2], src1->f[2] );
969   dst->f[3] = util_fast_pow( src0->f[3], src1->f[3] );
970#else
971   dst->f[0] = powf( src0->f[0], src1->f[0] );
972   dst->f[1] = powf( src0->f[1], src1->f[1] );
973   dst->f[2] = powf( src0->f[2], src1->f[2] );
974   dst->f[3] = powf( src0->f[3], src1->f[3] );
975#endif
976}
977
978static void
979micro_rnd(
980   union tgsi_exec_channel *dst,
981   const union tgsi_exec_channel *src )
982{
983   dst->f[0] = floorf( src->f[0] + 0.5f );
984   dst->f[1] = floorf( src->f[1] + 0.5f );
985   dst->f[2] = floorf( src->f[2] + 0.5f );
986   dst->f[3] = floorf( src->f[3] + 0.5f );
987}
988
989static void
990micro_sgn(
991   union tgsi_exec_channel *dst,
992   const union tgsi_exec_channel *src )
993{
994   dst->f[0] = src->f[0] < 0.0f ? -1.0f : src->f[0] > 0.0f ? 1.0f : 0.0f;
995   dst->f[1] = src->f[1] < 0.0f ? -1.0f : src->f[1] > 0.0f ? 1.0f : 0.0f;
996   dst->f[2] = src->f[2] < 0.0f ? -1.0f : src->f[2] > 0.0f ? 1.0f : 0.0f;
997   dst->f[3] = src->f[3] < 0.0f ? -1.0f : src->f[3] > 0.0f ? 1.0f : 0.0f;
998}
999
1000static void
1001micro_shl(
1002   union tgsi_exec_channel *dst,
1003   const union tgsi_exec_channel *src0,
1004   const union tgsi_exec_channel *src1 )
1005{
1006   dst->i[0] = src0->i[0] << src1->i[0];
1007   dst->i[1] = src0->i[1] << src1->i[1];
1008   dst->i[2] = src0->i[2] << src1->i[2];
1009   dst->i[3] = src0->i[3] << src1->i[3];
1010}
1011
1012static void
1013micro_ishr(
1014   union tgsi_exec_channel *dst,
1015   const union tgsi_exec_channel *src0,
1016   const union tgsi_exec_channel *src1 )
1017{
1018   dst->i[0] = src0->i[0] >> src1->i[0];
1019   dst->i[1] = src0->i[1] >> src1->i[1];
1020   dst->i[2] = src0->i[2] >> src1->i[2];
1021   dst->i[3] = src0->i[3] >> src1->i[3];
1022}
1023
1024static void
1025micro_trunc(
1026   union tgsi_exec_channel *dst,
1027   const union tgsi_exec_channel *src0 )
1028{
1029   dst->f[0] = (float) (int) src0->f[0];
1030   dst->f[1] = (float) (int) src0->f[1];
1031   dst->f[2] = (float) (int) src0->f[2];
1032   dst->f[3] = (float) (int) src0->f[3];
1033}
1034
#if 0
/**
 * Per-lane unsigned shift right: dst = src0 >> src1.
 *
 * Currently compiled out.  The shift count is reduced to its low five
 * bits because shifting by the full width of the type or more is
 * undefined in C; counts in [0, 31] are unaffected.
 */
static void
micro_ushr(
   union tgsi_exec_channel *dst,
   const union tgsi_exec_channel *src0,
   const union tgsi_exec_channel *src1 )
{
   unsigned lane;

   for (lane = 0; lane < 4; lane++) {
      const unsigned count = src1->u[lane] & 0x1f;

      dst->u[lane] = src0->u[lane] >> count;
   }
}
#endif
1048
1049static void
1050micro_sin(
1051   union tgsi_exec_channel *dst,
1052   const union tgsi_exec_channel *src )
1053{
1054   dst->f[0] = sinf( src->f[0] );
1055   dst->f[1] = sinf( src->f[1] );
1056   dst->f[2] = sinf( src->f[2] );
1057   dst->f[3] = sinf( src->f[3] );
1058}
1059
1060static void
1061micro_sqrt( union tgsi_exec_channel *dst,
1062            const union tgsi_exec_channel *src )
1063{
1064   dst->f[0] = sqrtf( src->f[0] );
1065   dst->f[1] = sqrtf( src->f[1] );
1066   dst->f[2] = sqrtf( src->f[2] );
1067   dst->f[3] = sqrtf( src->f[3] );
1068}
1069
1070static void
1071micro_sub(
1072   union tgsi_exec_channel *dst,
1073   const union tgsi_exec_channel *src0,
1074   const union tgsi_exec_channel *src1 )
1075{
1076   dst->f[0] = src0->f[0] - src1->f[0];
1077   dst->f[1] = src0->f[1] - src1->f[1];
1078   dst->f[2] = src0->f[2] - src1->f[2];
1079   dst->f[3] = src0->f[3] - src1->f[3];
1080}
1081
#if 0
/**
 * Per-lane conversion of an unsigned integer to float.
 * Currently compiled out.
 */
static void
micro_u2f(
   union tgsi_exec_channel *dst,
   const union tgsi_exec_channel *src )
{
   unsigned lane;

   for (lane = 0; lane < 4; lane++) {
      dst->f[lane] = (float) src->u[lane];
   }
}
#endif
1094
1095static void
1096micro_xor(
1097   union tgsi_exec_channel *dst,
1098   const union tgsi_exec_channel *src0,
1099   const union tgsi_exec_channel *src1 )
1100{
1101   dst->u[0] = src0->u[0] ^ src1->u[0];
1102   dst->u[1] = src0->u[1] ^ src1->u[1];
1103   dst->u[2] = src0->u[2] ^ src1->u[2];
1104   dst->u[3] = src0->u[3] ^ src1->u[3];
1105}
1106
1107static void
1108fetch_src_file_channel(
1109   const struct tgsi_exec_machine *mach,
1110   const uint file,
1111   const uint swizzle,
1112   const union tgsi_exec_channel *index,
1113   union tgsi_exec_channel *chan )
1114{
1115   switch( swizzle ) {
1116   case TGSI_SWIZZLE_X:
1117   case TGSI_SWIZZLE_Y:
1118   case TGSI_SWIZZLE_Z:
1119   case TGSI_SWIZZLE_W:
1120      switch( file ) {
1121      case TGSI_FILE_CONSTANT:
1122         assert(mach->Consts);
1123         if (index->i[0] < 0)
1124            chan->f[0] = 0.0f;
1125         else
1126            chan->f[0] = mach->Consts[index->i[0]][swizzle];
1127         if (index->i[1] < 0)
1128            chan->f[1] = 0.0f;
1129         else
1130            chan->f[1] = mach->Consts[index->i[1]][swizzle];
1131         if (index->i[2] < 0)
1132            chan->f[2] = 0.0f;
1133         else
1134            chan->f[2] = mach->Consts[index->i[2]][swizzle];
1135         if (index->i[3] < 0)
1136            chan->f[3] = 0.0f;
1137         else
1138            chan->f[3] = mach->Consts[index->i[3]][swizzle];
1139         break;
1140
1141      case TGSI_FILE_INPUT:
1142         chan->u[0] = mach->Inputs[index->i[0]].xyzw[swizzle].u[0];
1143         chan->u[1] = mach->Inputs[index->i[1]].xyzw[swizzle].u[1];
1144         chan->u[2] = mach->Inputs[index->i[2]].xyzw[swizzle].u[2];
1145         chan->u[3] = mach->Inputs[index->i[3]].xyzw[swizzle].u[3];
1146         break;
1147
1148      case TGSI_FILE_TEMPORARY:
1149         assert(index->i[0] < TGSI_EXEC_NUM_TEMPS);
1150         chan->u[0] = mach->Temps[index->i[0]].xyzw[swizzle].u[0];
1151         chan->u[1] = mach->Temps[index->i[1]].xyzw[swizzle].u[1];
1152         chan->u[2] = mach->Temps[index->i[2]].xyzw[swizzle].u[2];
1153         chan->u[3] = mach->Temps[index->i[3]].xyzw[swizzle].u[3];
1154         break;
1155
1156      case TGSI_FILE_IMMEDIATE:
1157         assert( index->i[0] < (int) mach->ImmLimit );
1158         chan->f[0] = mach->Imms[index->i[0]][swizzle];
1159         assert( index->i[1] < (int) mach->ImmLimit );
1160         chan->f[1] = mach->Imms[index->i[1]][swizzle];
1161         assert( index->i[2] < (int) mach->ImmLimit );
1162         chan->f[2] = mach->Imms[index->i[2]][swizzle];
1163         assert( index->i[3] < (int) mach->ImmLimit );
1164         chan->f[3] = mach->Imms[index->i[3]][swizzle];
1165         break;
1166
1167      case TGSI_FILE_ADDRESS:
1168         chan->u[0] = mach->Addrs[index->i[0]].xyzw[swizzle].u[0];
1169         chan->u[1] = mach->Addrs[index->i[1]].xyzw[swizzle].u[1];
1170         chan->u[2] = mach->Addrs[index->i[2]].xyzw[swizzle].u[2];
1171         chan->u[3] = mach->Addrs[index->i[3]].xyzw[swizzle].u[3];
1172         break;
1173
1174      case TGSI_FILE_PREDICATE:
1175         assert(index->i[0] < TGSI_EXEC_NUM_PREDS);
1176         assert(index->i[1] < TGSI_EXEC_NUM_PREDS);
1177         assert(index->i[2] < TGSI_EXEC_NUM_PREDS);
1178         assert(index->i[3] < TGSI_EXEC_NUM_PREDS);
1179         chan->u[0] = mach->Addrs[0].xyzw[swizzle].u[0];
1180         chan->u[1] = mach->Addrs[0].xyzw[swizzle].u[1];
1181         chan->u[2] = mach->Addrs[0].xyzw[swizzle].u[2];
1182         chan->u[3] = mach->Addrs[0].xyzw[swizzle].u[3];
1183         break;
1184
1185      case TGSI_FILE_OUTPUT:
1186         /* vertex/fragment output vars can be read too */
1187         chan->u[0] = mach->Outputs[index->i[0]].xyzw[swizzle].u[0];
1188         chan->u[1] = mach->Outputs[index->i[1]].xyzw[swizzle].u[1];
1189         chan->u[2] = mach->Outputs[index->i[2]].xyzw[swizzle].u[2];
1190         chan->u[3] = mach->Outputs[index->i[3]].xyzw[swizzle].u[3];
1191         break;
1192
1193      default:
1194         assert( 0 );
1195      }
1196      break;
1197
1198   default:
1199      assert( 0 );
1200   }
1201}
1202
1203static void
1204fetch_source(
1205   const struct tgsi_exec_machine *mach,
1206   union tgsi_exec_channel *chan,
1207   const struct tgsi_full_src_register *reg,
1208   const uint chan_index )
1209{
1210   union tgsi_exec_channel index;
1211   uint swizzle;
1212
1213   /* We start with a direct index into a register file.
1214    *
1215    *    file[1],
1216    *    where:
1217    *       file = SrcRegister.File
1218    *       [1] = SrcRegister.Index
1219    */
1220   index.i[0] =
1221   index.i[1] =
1222   index.i[2] =
1223   index.i[3] = reg->SrcRegister.Index;
1224
1225   /* There is an extra source register that indirectly subscripts
1226    * a register file. The direct index now becomes an offset
1227    * that is being added to the indirect register.
1228    *
1229    *    file[ind[2].x+1],
1230    *    where:
1231    *       ind = SrcRegisterInd.File
1232    *       [2] = SrcRegisterInd.Index
1233    *       .x = SrcRegisterInd.SwizzleX
1234    */
1235   if (reg->SrcRegister.Indirect) {
1236      union tgsi_exec_channel index2;
1237      union tgsi_exec_channel indir_index;
1238      const uint execmask = mach->ExecMask;
1239      uint i;
1240
1241      /* which address register (always zero now) */
1242      index2.i[0] =
1243      index2.i[1] =
1244      index2.i[2] =
1245      index2.i[3] = reg->SrcRegisterInd.Index;
1246
1247      /* get current value of address register[swizzle] */
1248      swizzle = tgsi_util_get_src_register_swizzle( &reg->SrcRegisterInd, CHAN_X );
1249      fetch_src_file_channel(
1250         mach,
1251         reg->SrcRegisterInd.File,
1252         swizzle,
1253         &index2,
1254         &indir_index );
1255
1256      /* add value of address register to the offset */
1257      index.i[0] += (int) indir_index.f[0];
1258      index.i[1] += (int) indir_index.f[1];
1259      index.i[2] += (int) indir_index.f[2];
1260      index.i[3] += (int) indir_index.f[3];
1261
1262      /* for disabled execution channels, zero-out the index to
1263       * avoid using a potential garbage value.
1264       */
1265      for (i = 0; i < QUAD_SIZE; i++) {
1266         if ((execmask & (1 << i)) == 0)
1267            index.i[i] = 0;
1268      }
1269   }
1270
1271   /* There is an extra source register that is a second
1272    * subscript to a register file. Effectively it means that
1273    * the register file is actually a 2D array of registers.
1274    *
1275    *    file[1][3] == file[1*sizeof(file[1])+3],
1276    *    where:
1277    *       [3] = SrcRegisterDim.Index
1278    */
1279   if (reg->SrcRegister.Dimension) {
1280      /* The size of the first-order array depends on the register file type.
1281       * We need to multiply the index to the first array to get an effective,
1282       * "flat" index that points to the beginning of the second-order array.
1283       */
1284      switch (reg->SrcRegister.File) {
1285      case TGSI_FILE_INPUT:
1286         index.i[0] *= TGSI_EXEC_MAX_INPUT_ATTRIBS;
1287         index.i[1] *= TGSI_EXEC_MAX_INPUT_ATTRIBS;
1288         index.i[2] *= TGSI_EXEC_MAX_INPUT_ATTRIBS;
1289         index.i[3] *= TGSI_EXEC_MAX_INPUT_ATTRIBS;
1290         break;
1291      case TGSI_FILE_CONSTANT:
1292         index.i[0] *= TGSI_EXEC_MAX_CONST_BUFFER;
1293         index.i[1] *= TGSI_EXEC_MAX_CONST_BUFFER;
1294         index.i[2] *= TGSI_EXEC_MAX_CONST_BUFFER;
1295         index.i[3] *= TGSI_EXEC_MAX_CONST_BUFFER;
1296         break;
1297      default:
1298         assert( 0 );
1299      }
1300
1301      index.i[0] += reg->SrcRegisterDim.Index;
1302      index.i[1] += reg->SrcRegisterDim.Index;
1303      index.i[2] += reg->SrcRegisterDim.Index;
1304      index.i[3] += reg->SrcRegisterDim.Index;
1305
1306      /* Again, the second subscript index can be addressed indirectly
1307       * identically to the first one.
1308       * Nothing stops us from indirectly addressing the indirect register,
1309       * but there is no need for that, so we won't exercise it.
1310       *
1311       *    file[1][ind[4].y+3],
1312       *    where:
1313       *       ind = SrcRegisterDimInd.File
1314       *       [4] = SrcRegisterDimInd.Index
1315       *       .y = SrcRegisterDimInd.SwizzleX
1316       */
1317      if (reg->SrcRegisterDim.Indirect) {
1318         union tgsi_exec_channel index2;
1319         union tgsi_exec_channel indir_index;
1320         const uint execmask = mach->ExecMask;
1321         uint i;
1322
1323         index2.i[0] =
1324         index2.i[1] =
1325         index2.i[2] =
1326         index2.i[3] = reg->SrcRegisterDimInd.Index;
1327
1328         swizzle = tgsi_util_get_src_register_swizzle( &reg->SrcRegisterDimInd, CHAN_X );
1329         fetch_src_file_channel(
1330            mach,
1331            reg->SrcRegisterDimInd.File,
1332            swizzle,
1333            &index2,
1334            &indir_index );
1335
1336         index.i[0] += (int) indir_index.f[0];
1337         index.i[1] += (int) indir_index.f[1];
1338         index.i[2] += (int) indir_index.f[2];
1339         index.i[3] += (int) indir_index.f[3];
1340
1341         /* for disabled execution channels, zero-out the index to
1342          * avoid using a potential garbage value.
1343          */
1344         for (i = 0; i < QUAD_SIZE; i++) {
1345            if ((execmask & (1 << i)) == 0)
1346               index.i[i] = 0;
1347         }
1348      }
1349
1350      /* If by any chance there was a need for a 3D array of register
1351       * files, we would have to check whether SrcRegisterDim is followed
1352       * by a dimension register and continue the saga.
1353       */
1354   }
1355
1356   swizzle = tgsi_util_get_full_src_register_swizzle( reg, chan_index );
1357   fetch_src_file_channel(
1358      mach,
1359      reg->SrcRegister.File,
1360      swizzle,
1361      &index,
1362      chan );
1363
1364   switch (tgsi_util_get_full_src_register_sign_mode( reg, chan_index )) {
1365   case TGSI_UTIL_SIGN_CLEAR:
1366      micro_abs( chan, chan );
1367      break;
1368
1369   case TGSI_UTIL_SIGN_SET:
1370      micro_abs( chan, chan );
1371      micro_neg( chan, chan );
1372      break;
1373
1374   case TGSI_UTIL_SIGN_TOGGLE:
1375      micro_neg( chan, chan );
1376      break;
1377
1378   case TGSI_UTIL_SIGN_KEEP:
1379      break;
1380   }
1381
1382   if (reg->SrcRegisterExtMod.Complement) {
1383      micro_sub( chan, &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], chan );
1384   }
1385}
1386
1387static void
1388store_dest(
1389   struct tgsi_exec_machine *mach,
1390   const union tgsi_exec_channel *chan,
1391   const struct tgsi_full_dst_register *reg,
1392   const struct tgsi_full_instruction *inst,
1393   uint chan_index )
1394{
1395   uint i;
1396   union tgsi_exec_channel null;
1397   union tgsi_exec_channel *dst;
1398   uint execmask = mach->ExecMask;
1399   int offset = 0;  /* indirection offset */
1400   int index;
1401
1402#ifdef DEBUG
1403   check_inf_or_nan(chan);
1404#endif
1405
1406   /* There is an extra source register that indirectly subscripts
1407    * a register file. The direct index now becomes an offset
1408    * that is being added to the indirect register.
1409    *
1410    *    file[ind[2].x+1],
1411    *    where:
1412    *       ind = DstRegisterInd.File
1413    *       [2] = DstRegisterInd.Index
1414    *       .x = DstRegisterInd.SwizzleX
1415    */
1416   if (reg->DstRegister.Indirect) {
1417      union tgsi_exec_channel index;
1418      union tgsi_exec_channel indir_index;
1419      uint swizzle;
1420
1421      /* which address register (always zero for now) */
1422      index.i[0] =
1423      index.i[1] =
1424      index.i[2] =
1425      index.i[3] = reg->DstRegisterInd.Index;
1426
1427      /* get current value of address register[swizzle] */
1428      swizzle = tgsi_util_get_src_register_swizzle( &reg->DstRegisterInd, CHAN_X );
1429
1430      /* fetch values from the address/indirection register */
1431      fetch_src_file_channel(
1432         mach,
1433         reg->DstRegisterInd.File,
1434         swizzle,
1435         &index,
1436         &indir_index );
1437
1438      /* save indirection offset */
1439      offset = (int) indir_index.f[0];
1440   }
1441
1442   switch (reg->DstRegister.File) {
1443   case TGSI_FILE_NULL:
1444      dst = &null;
1445      break;
1446
1447   case TGSI_FILE_OUTPUT:
1448      index = mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0]
1449         + reg->DstRegister.Index;
1450      dst = &mach->Outputs[offset + index].xyzw[chan_index];
1451      break;
1452
1453   case TGSI_FILE_TEMPORARY:
1454      index = reg->DstRegister.Index;
1455      assert( index < TGSI_EXEC_NUM_TEMPS );
1456      dst = &mach->Temps[offset + index].xyzw[chan_index];
1457      break;
1458
1459   case TGSI_FILE_ADDRESS:
1460      index = reg->DstRegister.Index;
1461      dst = &mach->Addrs[index].xyzw[chan_index];
1462      break;
1463
1464   case TGSI_FILE_PREDICATE:
1465      index = reg->DstRegister.Index;
1466      assert(index < TGSI_EXEC_NUM_PREDS);
1467      dst = &mach->Addrs[index].xyzw[chan_index];
1468      break;
1469
1470   default:
1471      assert( 0 );
1472      return;
1473   }
1474
1475   switch (inst->Instruction.Saturate) {
1476   case TGSI_SAT_NONE:
1477      for (i = 0; i < QUAD_SIZE; i++)
1478         if (execmask & (1 << i))
1479            dst->i[i] = chan->i[i];
1480      break;
1481
1482   case TGSI_SAT_ZERO_ONE:
1483      for (i = 0; i < QUAD_SIZE; i++)
1484         if (execmask & (1 << i)) {
1485            if (chan->f[i] < 0.0f)
1486               dst->f[i] = 0.0f;
1487            else if (chan->f[i] > 1.0f)
1488               dst->f[i] = 1.0f;
1489            else
1490               dst->i[i] = chan->i[i];
1491         }
1492      break;
1493
1494   case TGSI_SAT_MINUS_PLUS_ONE:
1495      for (i = 0; i < QUAD_SIZE; i++)
1496         if (execmask & (1 << i)) {
1497            if (chan->f[i] < -1.0f)
1498               dst->f[i] = -1.0f;
1499            else if (chan->f[i] > 1.0f)
1500               dst->f[i] = 1.0f;
1501            else
1502               dst->i[i] = chan->i[i];
1503         }
1504      break;
1505
1506   default:
1507      assert( 0 );
1508   }
1509}
1510
/*
 * Convenience wrappers around fetch_source() and store_dest() for use
 * inside instruction handlers.  Both expect variables named `mach` and
 * `inst` to be in scope at the point of use.  Arguments are parenthesized
 * so that arbitrary expressions can be passed safely.
 */

/* Fetch channel CHAN of source operand INDEX of `inst` into *VAL. */
#define FETCH(VAL,INDEX,CHAN)\
    fetch_source (mach, (VAL), &inst->FullSrcRegisters[(INDEX)], (CHAN))

/* Store *VAL to channel CHAN of destination operand INDEX of `inst`. */
#define STORE(VAL,INDEX,CHAN)\
    store_dest (mach, (VAL), &inst->FullDstRegisters[(INDEX)], inst, (CHAN) )
1516
1517
1518/**
1519 * Execute ARB-style KIL which is predicated by a src register.
1520 * Kill fragment if any of the four values is less than zero.
1521 */
1522static void
1523exec_kil(struct tgsi_exec_machine *mach,
1524         const struct tgsi_full_instruction *inst)
1525{
1526   uint uniquemask;
1527   uint chan_index;
1528   uint kilmask = 0; /* bit 0 = pixel 0, bit 1 = pixel 1, etc */
1529   union tgsi_exec_channel r[1];
1530
1531   /* This mask stores component bits that were already tested. */
1532   uniquemask = 0;
1533
1534   for (chan_index = 0; chan_index < 4; chan_index++)
1535   {
1536      uint swizzle;
1537      uint i;
1538
1539      /* unswizzle channel */
1540      swizzle = tgsi_util_get_full_src_register_swizzle (
1541                        &inst->FullSrcRegisters[0],
1542                        chan_index);
1543
1544      /* check if the component has not been already tested */
1545      if (uniquemask & (1 << swizzle))
1546         continue;
1547      uniquemask |= 1 << swizzle;
1548
1549      FETCH(&r[0], 0, chan_index);
1550      for (i = 0; i < 4; i++)
1551         if (r[0].f[i] < 0.0f)
1552            kilmask |= 1 << i;
1553   }
1554
1555   mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] |= kilmask;
1556}
1557
1558/**
1559 * Execute NVIDIA-style KIL which is predicated by a condition code.
1560 * Kill fragment if the condition code is TRUE.
1561 */
1562static void
1563exec_kilp(struct tgsi_exec_machine *mach,
1564          const struct tgsi_full_instruction *inst)
1565{
1566   uint kilmask; /* bit 0 = pixel 0, bit 1 = pixel 1, etc */
1567
1568   /* "unconditional" kil */
1569   kilmask = mach->ExecMask;
1570   mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] |= kilmask;
1571}
1572
1573
1574/*
1575 * Fetch a four texture samples using STR texture coordinates.
1576 */
1577static void
1578fetch_texel( struct tgsi_sampler *sampler,
1579             const union tgsi_exec_channel *s,
1580             const union tgsi_exec_channel *t,
1581             const union tgsi_exec_channel *p,
1582             float lodbias,  /* XXX should be float[4] */
1583             union tgsi_exec_channel *r,
1584             union tgsi_exec_channel *g,
1585             union tgsi_exec_channel *b,
1586             union tgsi_exec_channel *a )
1587{
1588   uint j;
1589   float rgba[NUM_CHANNELS][QUAD_SIZE];
1590
1591   sampler->get_samples(sampler, s->f, t->f, p->f, lodbias, rgba);
1592
1593   for (j = 0; j < 4; j++) {
1594      r->f[j] = rgba[0][j];
1595      g->f[j] = rgba[1][j];
1596      b->f[j] = rgba[2][j];
1597      a->f[j] = rgba[3][j];
1598   }
1599}
1600
1601
1602static void
1603exec_tex(struct tgsi_exec_machine *mach,
1604         const struct tgsi_full_instruction *inst,
1605         boolean biasLod,
1606         boolean projected)
1607{
1608   const uint unit = inst->FullSrcRegisters[1].SrcRegister.Index;
1609   union tgsi_exec_channel r[4];
1610   uint chan_index;
1611   float lodBias;
1612
1613   /*   debug_printf("Sampler %u unit %u\n", sampler, unit); */
1614
1615   switch (inst->InstructionExtTexture.Texture) {
1616   case TGSI_TEXTURE_1D:
1617   case TGSI_TEXTURE_SHADOW1D:
1618
1619      FETCH(&r[0], 0, CHAN_X);
1620
1621      if (projected) {
1622         FETCH(&r[1], 0, CHAN_W);
1623         micro_div( &r[0], &r[0], &r[1] );
1624      }
1625
1626      if (biasLod) {
1627         FETCH(&r[1], 0, CHAN_W);
1628         lodBias = r[2].f[0];
1629      }
1630      else
1631         lodBias = 0.0;
1632
1633      fetch_texel(mach->Samplers[unit],
1634                  &r[0], &ZeroVec, &ZeroVec, lodBias,  /* S, T, P, BIAS */
1635                  &r[0], &r[1], &r[2], &r[3]); /* R, G, B, A */
1636      break;
1637
1638   case TGSI_TEXTURE_2D:
1639   case TGSI_TEXTURE_RECT:
1640   case TGSI_TEXTURE_SHADOW2D:
1641   case TGSI_TEXTURE_SHADOWRECT:
1642
1643      FETCH(&r[0], 0, CHAN_X);
1644      FETCH(&r[1], 0, CHAN_Y);
1645      FETCH(&r[2], 0, CHAN_Z);
1646
1647      if (projected) {
1648         FETCH(&r[3], 0, CHAN_W);
1649         micro_div( &r[0], &r[0], &r[3] );
1650         micro_div( &r[1], &r[1], &r[3] );
1651         micro_div( &r[2], &r[2], &r[3] );
1652      }
1653
1654      if (biasLod) {
1655         FETCH(&r[3], 0, CHAN_W);
1656         lodBias = r[3].f[0];
1657      }
1658      else
1659         lodBias = 0.0;
1660
1661      fetch_texel(mach->Samplers[unit],
1662                  &r[0], &r[1], &r[2], lodBias,  /* inputs */
1663                  &r[0], &r[1], &r[2], &r[3]);  /* outputs */
1664      break;
1665
1666   case TGSI_TEXTURE_3D:
1667   case TGSI_TEXTURE_CUBE:
1668
1669      FETCH(&r[0], 0, CHAN_X);
1670      FETCH(&r[1], 0, CHAN_Y);
1671      FETCH(&r[2], 0, CHAN_Z);
1672
1673      if (projected) {
1674         FETCH(&r[3], 0, CHAN_W);
1675         micro_div( &r[0], &r[0], &r[3] );
1676         micro_div( &r[1], &r[1], &r[3] );
1677         micro_div( &r[2], &r[2], &r[3] );
1678      }
1679
1680      if (biasLod) {
1681         FETCH(&r[3], 0, CHAN_W);
1682         lodBias = r[3].f[0];
1683      }
1684      else
1685         lodBias = 0.0;
1686
1687      fetch_texel(mach->Samplers[unit],
1688                  &r[0], &r[1], &r[2], lodBias,
1689                  &r[0], &r[1], &r[2], &r[3]);
1690      break;
1691
1692   default:
1693      assert (0);
1694   }
1695
1696   FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
1697      STORE( &r[chan_index], 0, chan_index );
1698   }
1699}
1700
1701static void
1702exec_txd(struct tgsi_exec_machine *mach,
1703         const struct tgsi_full_instruction *inst)
1704{
1705   const uint unit = inst->FullSrcRegisters[3].SrcRegister.Index;
1706   union tgsi_exec_channel r[4];
1707   uint chan_index;
1708
1709   /*
1710    * XXX: This is fake TXD -- the derivatives are not taken into account, yet.
1711    */
1712
1713   switch (inst->InstructionExtTexture.Texture) {
1714   case TGSI_TEXTURE_1D:
1715   case TGSI_TEXTURE_SHADOW1D:
1716
1717      FETCH(&r[0], 0, CHAN_X);
1718
1719      fetch_texel(mach->Samplers[unit],
1720                  &r[0], &ZeroVec, &ZeroVec, 0.0f,  /* S, T, P, BIAS */
1721                  &r[0], &r[1], &r[2], &r[3]);      /* R, G, B, A */
1722      break;
1723
1724   case TGSI_TEXTURE_2D:
1725   case TGSI_TEXTURE_RECT:
1726   case TGSI_TEXTURE_SHADOW2D:
1727   case TGSI_TEXTURE_SHADOWRECT:
1728
1729      FETCH(&r[0], 0, CHAN_X);
1730      FETCH(&r[1], 0, CHAN_Y);
1731      FETCH(&r[2], 0, CHAN_Z);
1732
1733      fetch_texel(mach->Samplers[unit],
1734                  &r[0], &r[1], &r[2], 0.0f,    /* inputs */
1735                  &r[0], &r[1], &r[2], &r[3]);  /* outputs */
1736      break;
1737
1738   case TGSI_TEXTURE_3D:
1739   case TGSI_TEXTURE_CUBE:
1740
1741      FETCH(&r[0], 0, CHAN_X);
1742      FETCH(&r[1], 0, CHAN_Y);
1743      FETCH(&r[2], 0, CHAN_Z);
1744
1745      fetch_texel(mach->Samplers[unit],
1746                  &r[0], &r[1], &r[2], 0.0f,
1747                  &r[0], &r[1], &r[2], &r[3]);
1748      break;
1749
1750   default:
1751      assert(0);
1752   }
1753
1754   FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
1755      STORE(&r[chan_index], 0, chan_index);
1756   }
1757}
1758
1759
1760/**
1761 * Evaluate a constant-valued coefficient at the position of the
1762 * current quad.
1763 */
1764static void
1765eval_constant_coef(
1766   struct tgsi_exec_machine *mach,
1767   unsigned attrib,
1768   unsigned chan )
1769{
1770   unsigned i;
1771
1772   for( i = 0; i < QUAD_SIZE; i++ ) {
1773      mach->Inputs[attrib].xyzw[chan].f[i] = mach->InterpCoefs[attrib].a0[chan];
1774   }
1775}
1776
1777/**
1778 * Evaluate a linear-valued coefficient at the position of the
1779 * current quad.
1780 */
1781static void
1782eval_linear_coef(
1783   struct tgsi_exec_machine *mach,
1784   unsigned attrib,
1785   unsigned chan )
1786{
1787   const float x = mach->QuadPos.xyzw[0].f[0];
1788   const float y = mach->QuadPos.xyzw[1].f[0];
1789   const float dadx = mach->InterpCoefs[attrib].dadx[chan];
1790   const float dady = mach->InterpCoefs[attrib].dady[chan];
1791   const float a0 = mach->InterpCoefs[attrib].a0[chan] + dadx * x + dady * y;
1792   mach->Inputs[attrib].xyzw[chan].f[0] = a0;
1793   mach->Inputs[attrib].xyzw[chan].f[1] = a0 + dadx;
1794   mach->Inputs[attrib].xyzw[chan].f[2] = a0 + dady;
1795   mach->Inputs[attrib].xyzw[chan].f[3] = a0 + dadx + dady;
1796}
1797
1798/**
1799 * Evaluate a perspective-valued coefficient at the position of the
1800 * current quad.
1801 */
1802static void
1803eval_perspective_coef(
1804   struct tgsi_exec_machine *mach,
1805   unsigned attrib,
1806   unsigned chan )
1807{
1808   const float x = mach->QuadPos.xyzw[0].f[0];
1809   const float y = mach->QuadPos.xyzw[1].f[0];
1810   const float dadx = mach->InterpCoefs[attrib].dadx[chan];
1811   const float dady = mach->InterpCoefs[attrib].dady[chan];
1812   const float a0 = mach->InterpCoefs[attrib].a0[chan] + dadx * x + dady * y;
1813   const float *w = mach->QuadPos.xyzw[3].f;
1814   /* divide by W here */
1815   mach->Inputs[attrib].xyzw[chan].f[0] = a0 / w[0];
1816   mach->Inputs[attrib].xyzw[chan].f[1] = (a0 + dadx) / w[1];
1817   mach->Inputs[attrib].xyzw[chan].f[2] = (a0 + dady) / w[2];
1818   mach->Inputs[attrib].xyzw[chan].f[3] = (a0 + dadx + dady) / w[3];
1819}
1820
1821
/**
 * Signature shared by the eval_*_coef() functions: evaluate channel
 * `chan` of input attribute `attrib` for the current quad.
 */
typedef void (*eval_coef_func)(struct tgsi_exec_machine *mach,
                               unsigned attrib,
                               unsigned chan);
1826
1827static void
1828exec_declaration(
1829   struct tgsi_exec_machine *mach,
1830   const struct tgsi_full_declaration *decl )
1831{
1832   if( mach->Processor == TGSI_PROCESSOR_FRAGMENT ) {
1833      if( decl->Declaration.File == TGSI_FILE_INPUT ) {
1834         unsigned first, last, mask;
1835         eval_coef_func eval;
1836
1837         first = decl->DeclarationRange.First;
1838         last = decl->DeclarationRange.Last;
1839         mask = decl->Declaration.UsageMask;
1840
1841         switch( decl->Declaration.Interpolate ) {
1842         case TGSI_INTERPOLATE_CONSTANT:
1843            eval = eval_constant_coef;
1844            break;
1845
1846         case TGSI_INTERPOLATE_LINEAR:
1847            eval = eval_linear_coef;
1848            break;
1849
1850         case TGSI_INTERPOLATE_PERSPECTIVE:
1851            eval = eval_perspective_coef;
1852            break;
1853
1854         default:
1855            assert( 0 );
1856            return;
1857         }
1858
1859         if( mask == TGSI_WRITEMASK_XYZW ) {
1860            unsigned i, j;
1861
1862            for( i = first; i <= last; i++ ) {
1863               for( j = 0; j < NUM_CHANNELS; j++ ) {
1864                  eval( mach, i, j );
1865               }
1866            }
1867         }
1868         else {
1869            unsigned i, j;
1870
1871            for( j = 0; j < NUM_CHANNELS; j++ ) {
1872               if( mask & (1 << j) ) {
1873                  for( i = first; i <= last; i++ ) {
1874                     eval( mach, i, j );
1875                  }
1876               }
1877            }
1878         }
1879      }
1880   }
1881}
1882
1883static void
1884exec_instruction(
1885   struct tgsi_exec_machine *mach,
1886   const struct tgsi_full_instruction *inst,
1887   int *pc )
1888{
1889   uint chan_index;
1890   union tgsi_exec_channel r[10];
1891   union tgsi_exec_channel d[8];
1892
1893   (*pc)++;
1894
1895   switch (inst->Instruction.Opcode) {
1896   case TGSI_OPCODE_ARL:
1897   case TGSI_OPCODE_FLR:
1898      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
1899         FETCH( &r[0], 0, chan_index );
1900         micro_flr(&d[chan_index], &r[0]);
1901      }
1902      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
1903         STORE(&d[chan_index], 0, chan_index);
1904      }
1905      break;
1906
1907   case TGSI_OPCODE_MOV:
1908      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
1909         FETCH(&d[chan_index], 0, chan_index);
1910      }
1911      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
1912         STORE(&d[chan_index], 0, chan_index);
1913      }
1914      break;
1915
1916   case TGSI_OPCODE_LIT:
1917      if (IS_CHANNEL_ENABLED( *inst, CHAN_Y ) || IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1918         FETCH( &r[0], 0, CHAN_X );
1919         if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1920            micro_max(&d[CHAN_Y], &r[0], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C]);
1921         }
1922
1923         if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1924            FETCH( &r[1], 0, CHAN_Y );
1925            micro_max( &r[1], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] );
1926
1927            FETCH( &r[2], 0, CHAN_W );
1928            micro_min( &r[2], &r[2], &mach->Temps[TEMP_128_I].xyzw[TEMP_128_C] );
1929            micro_max( &r[2], &r[2], &mach->Temps[TEMP_M128_I].xyzw[TEMP_M128_C] );
1930            micro_pow( &r[1], &r[1], &r[2] );
1931            micro_lt(&d[CHAN_Z], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &r[0], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C]);
1932         }
1933
1934         if (IS_CHANNEL_ENABLED(*inst, CHAN_Y)) {
1935            STORE(&d[CHAN_Y], 0, CHAN_Y);
1936         }
1937         if (IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
1938            STORE(&d[CHAN_Z], 0, CHAN_Z);
1939         }
1940      }
1941      if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) {
1942         STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_X );
1943      }
1944      if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
1945         STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
1946      }
1947      break;
1948
1949   case TGSI_OPCODE_RCP:
1950   /* TGSI_OPCODE_RECIP */
1951      FETCH( &r[0], 0, CHAN_X );
1952      micro_div( &r[0], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &r[0] );
1953      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
1954         STORE( &r[0], 0, chan_index );
1955      }
1956      break;
1957
1958   case TGSI_OPCODE_RSQ:
1959   /* TGSI_OPCODE_RECIPSQRT */
1960      FETCH( &r[0], 0, CHAN_X );
1961      micro_abs( &r[0], &r[0] );
1962      micro_sqrt( &r[0], &r[0] );
1963      micro_div( &r[0], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &r[0] );
1964      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
1965         STORE( &r[0], 0, chan_index );
1966      }
1967      break;
1968
1969   case TGSI_OPCODE_EXP:
1970      FETCH( &r[0], 0, CHAN_X );
1971      micro_flr( &r[1], &r[0] );  /* r1 = floor(r0) */
1972      if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) {
1973         micro_exp2( &r[2], &r[1] );       /* r2 = 2 ^ r1 */
1974         STORE( &r[2], 0, CHAN_X );        /* store r2 */
1975      }
1976      if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1977         micro_sub( &r[2], &r[0], &r[1] ); /* r2 = r0 - r1 */
1978         STORE( &r[2], 0, CHAN_Y );        /* store r2 */
1979      }
1980      if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1981         micro_exp2( &r[2], &r[0] );       /* r2 = 2 ^ r0 */
1982         STORE( &r[2], 0, CHAN_Z );        /* store r2 */
1983      }
1984      if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
1985         STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
1986      }
1987      break;
1988
1989   case TGSI_OPCODE_LOG:
1990      FETCH( &r[0], 0, CHAN_X );
1991      micro_abs( &r[2], &r[0] );  /* r2 = abs(r0) */
1992      micro_lg2( &r[1], &r[2] );  /* r1 = lg2(r2) */
1993      micro_flr( &r[0], &r[1] );  /* r0 = floor(r1) */
1994      if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) {
1995         STORE( &r[0], 0, CHAN_X );
1996      }
1997      if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1998         micro_exp2( &r[0], &r[0] );       /* r0 = 2 ^ r0 */
1999         micro_div( &r[0], &r[2], &r[0] ); /* r0 = r2 / r0 */
2000         STORE( &r[0], 0, CHAN_Y );
2001      }
2002      if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
2003         STORE( &r[1], 0, CHAN_Z );
2004      }
2005      if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
2006         STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
2007      }
2008      break;
2009
2010   case TGSI_OPCODE_MUL:
2011      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2012         FETCH(&r[0], 0, chan_index);
2013         FETCH(&r[1], 1, chan_index);
2014         micro_mul(&d[chan_index], &r[0], &r[1]);
2015      }
2016      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2017         STORE(&d[chan_index], 0, chan_index);
2018      }
2019      break;
2020
2021   case TGSI_OPCODE_ADD:
2022      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2023         FETCH( &r[0], 0, chan_index );
2024         FETCH( &r[1], 1, chan_index );
2025         micro_add(&d[chan_index], &r[0], &r[1]);
2026      }
2027      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2028         STORE(&d[chan_index], 0, chan_index);
2029      }
2030      break;
2031
2032   case TGSI_OPCODE_DP3:
2033   /* TGSI_OPCODE_DOT3 */
2034      FETCH( &r[0], 0, CHAN_X );
2035      FETCH( &r[1], 1, CHAN_X );
2036      micro_mul( &r[0], &r[0], &r[1] );
2037
2038      FETCH( &r[1], 0, CHAN_Y );
2039      FETCH( &r[2], 1, CHAN_Y );
2040      micro_mul( &r[1], &r[1], &r[2] );
2041      micro_add( &r[0], &r[0], &r[1] );
2042
2043      FETCH( &r[1], 0, CHAN_Z );
2044      FETCH( &r[2], 1, CHAN_Z );
2045      micro_mul( &r[1], &r[1], &r[2] );
2046      micro_add( &r[0], &r[0], &r[1] );
2047
2048      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2049         STORE( &r[0], 0, chan_index );
2050      }
2051      break;
2052
2053    case TGSI_OPCODE_DP4:
2054    /* TGSI_OPCODE_DOT4 */
2055       FETCH(&r[0], 0, CHAN_X);
2056       FETCH(&r[1], 1, CHAN_X);
2057
2058       micro_mul( &r[0], &r[0], &r[1] );
2059
2060       FETCH(&r[1], 0, CHAN_Y);
2061       FETCH(&r[2], 1, CHAN_Y);
2062
2063       micro_mul( &r[1], &r[1], &r[2] );
2064       micro_add( &r[0], &r[0], &r[1] );
2065
2066       FETCH(&r[1], 0, CHAN_Z);
2067       FETCH(&r[2], 1, CHAN_Z);
2068
2069       micro_mul( &r[1], &r[1], &r[2] );
2070       micro_add( &r[0], &r[0], &r[1] );
2071
2072       FETCH(&r[1], 0, CHAN_W);
2073       FETCH(&r[2], 1, CHAN_W);
2074
2075       micro_mul( &r[1], &r[1], &r[2] );
2076       micro_add( &r[0], &r[0], &r[1] );
2077
2078      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2079         STORE( &r[0], 0, chan_index );
2080      }
2081      break;
2082
2083   case TGSI_OPCODE_DST:
2084      if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
2085         FETCH( &r[0], 0, CHAN_Y );
2086         FETCH( &r[1], 1, CHAN_Y);
2087         micro_mul(&d[CHAN_Y], &r[0], &r[1]);
2088      }
2089      if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
2090         FETCH(&d[CHAN_Z], 0, CHAN_Z);
2091      }
2092      if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
2093         FETCH(&d[CHAN_W], 1, CHAN_W);
2094      }
2095
2096      if (IS_CHANNEL_ENABLED(*inst, CHAN_X)) {
2097         STORE(&mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_X);
2098      }
2099      if (IS_CHANNEL_ENABLED(*inst, CHAN_Y)) {
2100         STORE(&d[CHAN_Y], 0, CHAN_Y);
2101      }
2102      if (IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2103         STORE(&d[CHAN_Z], 0, CHAN_Z);
2104      }
2105      if (IS_CHANNEL_ENABLED(*inst, CHAN_W)) {
2106         STORE(&d[CHAN_W], 0, CHAN_W);
2107      }
2108      break;
2109
2110   case TGSI_OPCODE_MIN:
2111      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2112         FETCH(&r[0], 0, chan_index);
2113         FETCH(&r[1], 1, chan_index);
2114
2115         /* XXX use micro_min()?? */
2116         micro_lt(&d[chan_index], &r[0], &r[1], &r[0], &r[1]);
2117      }
2118      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2119         STORE(&d[chan_index], 0, chan_index);
2120      }
2121      break;
2122
2123   case TGSI_OPCODE_MAX:
2124      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2125         FETCH(&r[0], 0, chan_index);
2126         FETCH(&r[1], 1, chan_index);
2127
2128         /* XXX use micro_max()?? */
2129         micro_lt(&d[chan_index], &r[0], &r[1], &r[1], &r[0] );
2130      }
2131      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2132         STORE(&d[chan_index], 0, chan_index);
2133      }
2134      break;
2135
2136   case TGSI_OPCODE_SLT:
2137   /* TGSI_OPCODE_SETLT */
2138      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2139         FETCH( &r[0], 0, chan_index );
2140         FETCH( &r[1], 1, chan_index );
2141         micro_lt(&d[chan_index], &r[0], &r[1], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C]);
2142      }
2143      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2144         STORE(&d[chan_index], 0, chan_index);
2145      }
2146      break;
2147
2148   case TGSI_OPCODE_SGE:
2149   /* TGSI_OPCODE_SETGE */
2150      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2151         FETCH( &r[0], 0, chan_index );
2152         FETCH( &r[1], 1, chan_index );
2153         micro_le(&d[chan_index], &r[1], &r[0], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C]);
2154      }
2155      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2156         STORE(&d[chan_index], 0, chan_index);
2157      }
2158      break;
2159
2160   case TGSI_OPCODE_MAD:
2161   /* TGSI_OPCODE_MADD */
2162      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2163         FETCH( &r[0], 0, chan_index );
2164         FETCH( &r[1], 1, chan_index );
2165         micro_mul( &r[0], &r[0], &r[1] );
2166         FETCH( &r[1], 2, chan_index );
2167         micro_add(&d[chan_index], &r[0], &r[1]);
2168      }
2169      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2170         STORE(&d[chan_index], 0, chan_index);
2171      }
2172      break;
2173
2174   case TGSI_OPCODE_SUB:
2175      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2176         FETCH(&r[0], 0, chan_index);
2177         FETCH(&r[1], 1, chan_index);
2178         micro_sub(&d[chan_index], &r[0], &r[1]);
2179      }
2180      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2181         STORE(&d[chan_index], 0, chan_index);
2182      }
2183      break;
2184
2185   case TGSI_OPCODE_LRP:
2186      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2187         FETCH(&r[0], 0, chan_index);
2188         FETCH(&r[1], 1, chan_index);
2189         FETCH(&r[2], 2, chan_index);
2190         micro_sub( &r[1], &r[1], &r[2] );
2191         micro_mul( &r[0], &r[0], &r[1] );
2192         micro_add(&d[chan_index], &r[0], &r[2]);
2193      }
2194      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2195         STORE(&d[chan_index], 0, chan_index);
2196      }
2197      break;
2198
2199   case TGSI_OPCODE_CND:
2200      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2201         FETCH(&r[0], 0, chan_index);
2202         FETCH(&r[1], 1, chan_index);
2203         FETCH(&r[2], 2, chan_index);
2204         micro_lt(&d[chan_index], &mach->Temps[TEMP_HALF_I].xyzw[TEMP_HALF_C], &r[2], &r[0], &r[1]);
2205      }
2206      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2207         STORE(&d[chan_index], 0, chan_index);
2208      }
2209      break;
2210
2211   case TGSI_OPCODE_DP2A:
2212      FETCH( &r[0], 0, CHAN_X );
2213      FETCH( &r[1], 1, CHAN_X );
2214      micro_mul( &r[0], &r[0], &r[1] );
2215
2216      FETCH( &r[1], 0, CHAN_Y );
2217      FETCH( &r[2], 1, CHAN_Y );
2218      micro_mul( &r[1], &r[1], &r[2] );
2219      micro_add( &r[0], &r[0], &r[1] );
2220
2221      FETCH( &r[2], 2, CHAN_X );
2222      micro_add( &r[0], &r[0], &r[2] );
2223
2224      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2225         STORE( &r[0], 0, chan_index );
2226      }
2227      break;
2228
2229   case TGSI_OPCODE_FRC:
2230      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2231         FETCH( &r[0], 0, chan_index );
2232         micro_frc(&d[chan_index], &r[0]);
2233      }
2234      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2235         STORE(&d[chan_index], 0, chan_index);
2236      }
2237      break;
2238
2239   case TGSI_OPCODE_CLAMP:
2240      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2241         FETCH(&r[0], 0, chan_index);
2242         FETCH(&r[1], 1, chan_index);
2243         micro_max(&r[0], &r[0], &r[1]);
2244         FETCH(&r[1], 2, chan_index);
2245         micro_min(&d[chan_index], &r[0], &r[1]);
2246      }
2247      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2248         STORE(&d[chan_index], 0, chan_index);
2249      }
2250      break;
2251
2252   case TGSI_OPCODE_ROUND:
2253   case TGSI_OPCODE_ARR:
2254      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2255         FETCH( &r[0], 0, chan_index );
2256         micro_rnd(&d[chan_index], &r[0]);
2257      }
2258      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2259         STORE(&d[chan_index], 0, chan_index);
2260      }
2261      break;
2262
2263   case TGSI_OPCODE_EX2:
2264      FETCH(&r[0], 0, CHAN_X);
2265
2266#if FAST_MATH
2267      micro_exp2( &r[0], &r[0] );
2268#else
2269      micro_pow( &r[0], &mach->Temps[TEMP_2_I].xyzw[TEMP_2_C], &r[0] );
2270#endif
2271
2272      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2273         STORE( &r[0], 0, chan_index );
2274      }
2275      break;
2276
2277   case TGSI_OPCODE_LG2:
2278      FETCH( &r[0], 0, CHAN_X );
2279      micro_lg2( &r[0], &r[0] );
2280      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2281         STORE( &r[0], 0, chan_index );
2282      }
2283      break;
2284
2285   case TGSI_OPCODE_POW:
2286      FETCH(&r[0], 0, CHAN_X);
2287      FETCH(&r[1], 1, CHAN_X);
2288
2289      micro_pow( &r[0], &r[0], &r[1] );
2290
2291      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2292         STORE( &r[0], 0, chan_index );
2293      }
2294      break;
2295
2296   case TGSI_OPCODE_XPD:
2297      FETCH(&r[0], 0, CHAN_Y);
2298      FETCH(&r[1], 1, CHAN_Z);
2299
2300      micro_mul( &r[2], &r[0], &r[1] );
2301
2302      FETCH(&r[3], 0, CHAN_Z);
2303      FETCH(&r[4], 1, CHAN_Y);
2304
2305      micro_mul( &r[5], &r[3], &r[4] );
2306      micro_sub(&d[CHAN_X], &r[2], &r[5]);
2307
2308      FETCH(&r[2], 1, CHAN_X);
2309
2310      micro_mul( &r[3], &r[3], &r[2] );
2311
2312      FETCH(&r[5], 0, CHAN_X);
2313
2314      micro_mul( &r[1], &r[1], &r[5] );
2315      micro_sub(&d[CHAN_Y], &r[3], &r[1]);
2316
2317      micro_mul( &r[5], &r[5], &r[4] );
2318      micro_mul( &r[0], &r[0], &r[2] );
2319      micro_sub(&d[CHAN_Z], &r[5], &r[0]);
2320
2321      if (IS_CHANNEL_ENABLED(*inst, CHAN_X)) {
2322         STORE(&d[CHAN_X], 0, CHAN_X);
2323      }
2324      if (IS_CHANNEL_ENABLED(*inst, CHAN_Y)) {
2325         STORE(&d[CHAN_Y], 0, CHAN_Y);
2326      }
2327      if (IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2328         STORE(&d[CHAN_Z], 0, CHAN_Z);
2329      }
2330      if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
2331         STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
2332      }
2333      break;
2334
2335    case TGSI_OPCODE_ABS:
2336       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2337          FETCH(&r[0], 0, chan_index);
2338          micro_abs(&d[chan_index], &r[0]);
2339       }
2340       FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2341         STORE(&d[chan_index], 0, chan_index);
2342      }
2343       break;
2344
2345   case TGSI_OPCODE_RCC:
2346      FETCH(&r[0], 0, CHAN_X);
2347      micro_div(&r[0], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &r[0]);
2348      micro_float_clamp(&r[0], &r[0]);
2349      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2350         STORE(&r[0], 0, chan_index);
2351      }
2352      break;
2353
2354   case TGSI_OPCODE_DPH:
2355      FETCH(&r[0], 0, CHAN_X);
2356      FETCH(&r[1], 1, CHAN_X);
2357
2358      micro_mul( &r[0], &r[0], &r[1] );
2359
2360      FETCH(&r[1], 0, CHAN_Y);
2361      FETCH(&r[2], 1, CHAN_Y);
2362
2363      micro_mul( &r[1], &r[1], &r[2] );
2364      micro_add( &r[0], &r[0], &r[1] );
2365
2366      FETCH(&r[1], 0, CHAN_Z);
2367      FETCH(&r[2], 1, CHAN_Z);
2368
2369      micro_mul( &r[1], &r[1], &r[2] );
2370      micro_add( &r[0], &r[0], &r[1] );
2371
2372      FETCH(&r[1], 1, CHAN_W);
2373
2374      micro_add( &r[0], &r[0], &r[1] );
2375
2376      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2377         STORE( &r[0], 0, chan_index );
2378      }
2379      break;
2380
2381   case TGSI_OPCODE_COS:
2382      FETCH(&r[0], 0, CHAN_X);
2383
2384      micro_cos( &r[0], &r[0] );
2385
2386      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2387         STORE( &r[0], 0, chan_index );
2388      }
2389      break;
2390
2391   case TGSI_OPCODE_DDX:
2392      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2393         FETCH( &r[0], 0, chan_index );
2394         micro_ddx(&d[chan_index], &r[0]);
2395      }
2396      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2397         STORE(&d[chan_index], 0, chan_index);
2398      }
2399      break;
2400
2401   case TGSI_OPCODE_DDY:
2402      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2403         FETCH( &r[0], 0, chan_index );
2404         micro_ddy(&d[chan_index], &r[0]);
2405      }
2406      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2407         STORE(&d[chan_index], 0, chan_index);
2408      }
2409      break;
2410
2411   case TGSI_OPCODE_KILP:
2412      exec_kilp (mach, inst);
2413      break;
2414
2415   case TGSI_OPCODE_KIL:
2416      exec_kil (mach, inst);
2417      break;
2418
2419   case TGSI_OPCODE_PK2H:
2420      assert (0);
2421      break;
2422
2423   case TGSI_OPCODE_PK2US:
2424      assert (0);
2425      break;
2426
2427   case TGSI_OPCODE_PK4B:
2428      assert (0);
2429      break;
2430
2431   case TGSI_OPCODE_PK4UB:
2432      assert (0);
2433      break;
2434
2435   case TGSI_OPCODE_RFL:
2436      if (IS_CHANNEL_ENABLED(*inst, CHAN_X) ||
2437          IS_CHANNEL_ENABLED(*inst, CHAN_Y) ||
2438          IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2439         /* r0 = dp3(src0, src0) */
2440         FETCH(&r[2], 0, CHAN_X);
2441         micro_mul(&r[0], &r[2], &r[2]);
2442         FETCH(&r[4], 0, CHAN_Y);
2443         micro_mul(&r[8], &r[4], &r[4]);
2444         micro_add(&r[0], &r[0], &r[8]);
2445         FETCH(&r[6], 0, CHAN_Z);
2446         micro_mul(&r[8], &r[6], &r[6]);
2447         micro_add(&r[0], &r[0], &r[8]);
2448
2449         /* r1 = dp3(src0, src1) */
2450         FETCH(&r[3], 1, CHAN_X);
2451         micro_mul(&r[1], &r[2], &r[3]);
2452         FETCH(&r[5], 1, CHAN_Y);
2453         micro_mul(&r[8], &r[4], &r[5]);
2454         micro_add(&r[1], &r[1], &r[8]);
2455         FETCH(&r[7], 1, CHAN_Z);
2456         micro_mul(&r[8], &r[6], &r[7]);
2457         micro_add(&r[1], &r[1], &r[8]);
2458
2459         /* r1 = 2 * r1 / r0 */
2460         micro_add(&r[1], &r[1], &r[1]);
2461         micro_div(&r[1], &r[1], &r[0]);
2462
2463         if (IS_CHANNEL_ENABLED(*inst, CHAN_X)) {
2464            micro_mul(&r[2], &r[2], &r[1]);
2465            micro_sub(&r[2], &r[2], &r[3]);
2466            STORE(&r[2], 0, CHAN_X);
2467         }
2468         if (IS_CHANNEL_ENABLED(*inst, CHAN_Y)) {
2469            micro_mul(&r[4], &r[4], &r[1]);
2470            micro_sub(&r[4], &r[4], &r[5]);
2471            STORE(&r[4], 0, CHAN_Y);
2472         }
2473         if (IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2474            micro_mul(&r[6], &r[6], &r[1]);
2475            micro_sub(&r[6], &r[6], &r[7]);
2476            STORE(&r[6], 0, CHAN_Z);
2477         }
2478      }
2479      if (IS_CHANNEL_ENABLED(*inst, CHAN_W)) {
2480         STORE(&mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W);
2481      }
2482      break;
2483
2484   case TGSI_OPCODE_SEQ:
2485      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2486         FETCH( &r[0], 0, chan_index );
2487         FETCH( &r[1], 1, chan_index );
2488         micro_eq(&d[chan_index], &r[0], &r[1], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C]);
2489      }
2490      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2491         STORE(&d[chan_index], 0, chan_index);
2492      }
2493      break;
2494
2495   case TGSI_OPCODE_SFL:
2496      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2497         STORE(&mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], 0, chan_index);
2498      }
2499      break;
2500
2501   case TGSI_OPCODE_SGT:
2502      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2503         FETCH( &r[0], 0, chan_index );
2504         FETCH( &r[1], 1, chan_index );
2505         micro_le(&d[chan_index], &r[0], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C]);
2506      }
2507      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2508         STORE(&d[chan_index], 0, chan_index);
2509      }
2510      break;
2511
2512   case TGSI_OPCODE_SIN:
2513      FETCH( &r[0], 0, CHAN_X );
2514      micro_sin( &r[0], &r[0] );
2515      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2516         STORE( &r[0], 0, chan_index );
2517      }
2518      break;
2519
2520   case TGSI_OPCODE_SLE:
2521      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2522         FETCH( &r[0], 0, chan_index );
2523         FETCH( &r[1], 1, chan_index );
2524         micro_le(&d[chan_index], &r[0], &r[1], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C]);
2525      }
2526      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2527         STORE(&d[chan_index], 0, chan_index);
2528      }
2529      break;
2530
2531   case TGSI_OPCODE_SNE:
2532      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2533         FETCH( &r[0], 0, chan_index );
2534         FETCH( &r[1], 1, chan_index );
2535         micro_eq(&d[chan_index], &r[0], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C]);
2536      }
2537      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2538         STORE(&d[chan_index], 0, chan_index);
2539      }
2540      break;
2541
2542   case TGSI_OPCODE_STR:
2543      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2544         STORE(&mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, chan_index);
2545      }
2546      break;
2547
2548   case TGSI_OPCODE_TEX:
2549      /* simple texture lookup */
2550      /* src[0] = texcoord */
2551      /* src[1] = sampler unit */
2552      exec_tex(mach, inst, FALSE, FALSE);
2553      break;
2554
2555   case TGSI_OPCODE_TXB:
2556      /* Texture lookup with lod bias */
2557      /* src[0] = texcoord (src[0].w = LOD bias) */
2558      /* src[1] = sampler unit */
2559      exec_tex(mach, inst, TRUE, FALSE);
2560      break;
2561
2562   case TGSI_OPCODE_TXD:
2563      /* Texture lookup with explicit partial derivatives */
2564      /* src[0] = texcoord */
2565      /* src[1] = d[strq]/dx */
2566      /* src[2] = d[strq]/dy */
2567      /* src[3] = sampler unit */
2568      exec_txd(mach, inst);
2569      break;
2570
2571   case TGSI_OPCODE_TXL:
2572      /* Texture lookup with explicit LOD */
2573      /* src[0] = texcoord (src[0].w = LOD) */
2574      /* src[1] = sampler unit */
2575      exec_tex(mach, inst, TRUE, FALSE);
2576      break;
2577
2578   case TGSI_OPCODE_TXP:
2579      /* Texture lookup with projection */
2580      /* src[0] = texcoord (src[0].w = projection) */
2581      /* src[1] = sampler unit */
2582      exec_tex(mach, inst, FALSE, TRUE);
2583      break;
2584
2585   case TGSI_OPCODE_UP2H:
2586      assert (0);
2587      break;
2588
2589   case TGSI_OPCODE_UP2US:
2590      assert (0);
2591      break;
2592
2593   case TGSI_OPCODE_UP4B:
2594      assert (0);
2595      break;
2596
2597   case TGSI_OPCODE_UP4UB:
2598      assert (0);
2599      break;
2600
2601   case TGSI_OPCODE_X2D:
2602      FETCH(&r[0], 1, CHAN_X);
2603      FETCH(&r[1], 1, CHAN_Y);
2604      if (IS_CHANNEL_ENABLED(*inst, CHAN_X) ||
2605          IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2606         FETCH(&r[2], 2, CHAN_X);
2607         micro_mul(&r[2], &r[2], &r[0]);
2608         FETCH(&r[3], 2, CHAN_Y);
2609         micro_mul(&r[3], &r[3], &r[1]);
2610         micro_add(&r[2], &r[2], &r[3]);
2611         FETCH(&r[3], 0, CHAN_X);
2612         micro_add(&d[CHAN_X], &r[2], &r[3]);
2613
2614      }
2615      if (IS_CHANNEL_ENABLED(*inst, CHAN_Y) ||
2616          IS_CHANNEL_ENABLED(*inst, CHAN_W)) {
2617         FETCH(&r[2], 2, CHAN_Z);
2618         micro_mul(&r[2], &r[2], &r[0]);
2619         FETCH(&r[3], 2, CHAN_W);
2620         micro_mul(&r[3], &r[3], &r[1]);
2621         micro_add(&r[2], &r[2], &r[3]);
2622         FETCH(&r[3], 0, CHAN_Y);
2623         micro_add(&d[CHAN_Y], &r[2], &r[3]);
2624
2625      }
2626      if (IS_CHANNEL_ENABLED(*inst, CHAN_X)) {
2627         STORE(&d[CHAN_X], 0, CHAN_X);
2628      }
2629      if (IS_CHANNEL_ENABLED(*inst, CHAN_Y)) {
2630         STORE(&d[CHAN_Y], 0, CHAN_Y);
2631      }
2632      if (IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2633         STORE(&d[CHAN_X], 0, CHAN_Z);
2634      }
2635      if (IS_CHANNEL_ENABLED(*inst, CHAN_W)) {
2636         STORE(&d[CHAN_Y], 0, CHAN_W);
2637      }
2638      break;
2639
2640   case TGSI_OPCODE_ARA:
2641      assert (0);
2642      break;
2643
2644   case TGSI_OPCODE_BRA:
2645      assert (0);
2646      break;
2647
2648   case TGSI_OPCODE_CAL:
2649      /* skip the call if no execution channels are enabled */
2650      if (mach->ExecMask) {
2651         /* do the call */
2652
2653         /* First, record the depths of the execution stacks.
2654          * This is important for deeply nested/looped return statements.
2655          * We have to unwind the stacks by the correct amount.  For a
2656          * real code generator, we could determine the number of entries
2657          * to pop off each stack with simple static analysis and avoid
2658          * implementing this data structure at run time.
2659          */
2660         mach->CallStack[mach->CallStackTop].CondStackTop = mach->CondStackTop;
2661         mach->CallStack[mach->CallStackTop].LoopStackTop = mach->LoopStackTop;
2662         mach->CallStack[mach->CallStackTop].ContStackTop = mach->ContStackTop;
2663         /* note that PC was already incremented above */
2664         mach->CallStack[mach->CallStackTop].ReturnAddr = *pc;
2665
2666         mach->CallStackTop++;
2667
2668         /* Second, push the Cond, Loop, Cont, Func stacks */
2669         assert(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING);
2670         mach->CondStack[mach->CondStackTop++] = mach->CondMask;
2671         assert(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
2672         mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask;
2673         assert(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
2674         mach->ContStack[mach->ContStackTop++] = mach->ContMask;
2675         assert(mach->FuncStackTop < TGSI_EXEC_MAX_CALL_NESTING);
2676         mach->FuncStack[mach->FuncStackTop++] = mach->FuncMask;
2677
2678         /* Finally, jump to the subroutine */
2679         *pc = inst->InstructionExtLabel.Label;
2680      }
2681      break;
2682
2683   case TGSI_OPCODE_RET:
2684      mach->FuncMask &= ~mach->ExecMask;
2685      UPDATE_EXEC_MASK(mach);
2686
2687      if (mach->FuncMask == 0x0) {
2688         /* really return now (otherwise, keep executing) */
2689
2690         if (mach->CallStackTop == 0) {
2691            /* returning from main() */
2692            *pc = -1;
2693            return;
2694         }
2695
2696         assert(mach->CallStackTop > 0);
2697         mach->CallStackTop--;
2698
2699         mach->CondStackTop = mach->CallStack[mach->CallStackTop].CondStackTop;
2700         mach->CondMask = mach->CondStack[mach->CondStackTop];
2701
2702         mach->LoopStackTop = mach->CallStack[mach->CallStackTop].LoopStackTop;
2703         mach->LoopMask = mach->LoopStack[mach->LoopStackTop];
2704
2705         mach->ContStackTop = mach->CallStack[mach->CallStackTop].ContStackTop;
2706         mach->ContMask = mach->ContStack[mach->ContStackTop];
2707
2708         assert(mach->FuncStackTop > 0);
2709         mach->FuncMask = mach->FuncStack[--mach->FuncStackTop];
2710
2711         *pc = mach->CallStack[mach->CallStackTop].ReturnAddr;
2712
2713         UPDATE_EXEC_MASK(mach);
2714      }
2715      break;
2716
2717   case TGSI_OPCODE_SSG:
2718   /* TGSI_OPCODE_SGN */
2719      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2720         FETCH( &r[0], 0, chan_index );
2721         micro_sgn(&d[chan_index], &r[0]);
2722      }
2723      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2724         STORE(&d[chan_index], 0, chan_index);
2725      }
2726      break;
2727
2728   case TGSI_OPCODE_CMP:
2729      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2730         FETCH(&r[0], 0, chan_index);
2731         FETCH(&r[1], 1, chan_index);
2732         FETCH(&r[2], 2, chan_index);
2733         micro_lt(&d[chan_index], &r[0], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &r[1], &r[2]);
2734      }
2735      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2736         STORE(&d[chan_index], 0, chan_index);
2737      }
2738      break;
2739
2740   case TGSI_OPCODE_SCS:
2741      if( IS_CHANNEL_ENABLED( *inst, CHAN_X ) || IS_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
2742         FETCH( &r[0], 0, CHAN_X );
2743         if (IS_CHANNEL_ENABLED(*inst, CHAN_X)) {
2744            micro_cos(&r[1], &r[0]);
2745            STORE(&r[1], 0, CHAN_X);
2746         }
2747         if (IS_CHANNEL_ENABLED(*inst, CHAN_Y)) {
2748            micro_sin(&r[1], &r[0]);
2749            STORE(&r[1], 0, CHAN_Y);
2750         }
2751      }
2752      if( IS_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
2753         STORE( &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], 0, CHAN_Z );
2754      }
2755      if( IS_CHANNEL_ENABLED( *inst, CHAN_W ) ) {
2756         STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
2757      }
2758      break;
2759
2760   case TGSI_OPCODE_NRM:
2761      /* 3-component vector normalize */
2762      if(IS_CHANNEL_ENABLED(*inst, CHAN_X) ||
2763         IS_CHANNEL_ENABLED(*inst, CHAN_Y) ||
2764         IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2765         /* r3 = sqrt(dp3(src0, src0)) */
2766         FETCH(&r[0], 0, CHAN_X);
2767         micro_mul(&r[3], &r[0], &r[0]);
2768         FETCH(&r[1], 0, CHAN_Y);
2769         micro_mul(&r[4], &r[1], &r[1]);
2770         micro_add(&r[3], &r[3], &r[4]);
2771         FETCH(&r[2], 0, CHAN_Z);
2772         micro_mul(&r[4], &r[2], &r[2]);
2773         micro_add(&r[3], &r[3], &r[4]);
2774         micro_sqrt(&r[3], &r[3]);
2775
2776         if (IS_CHANNEL_ENABLED(*inst, CHAN_X)) {
2777            micro_div(&r[0], &r[0], &r[3]);
2778            STORE(&r[0], 0, CHAN_X);
2779         }
2780         if (IS_CHANNEL_ENABLED(*inst, CHAN_Y)) {
2781            micro_div(&r[1], &r[1], &r[3]);
2782            STORE(&r[1], 0, CHAN_Y);
2783         }
2784         if (IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2785            micro_div(&r[2], &r[2], &r[3]);
2786            STORE(&r[2], 0, CHAN_Z);
2787         }
2788      }
2789      if (IS_CHANNEL_ENABLED(*inst, CHAN_W)) {
2790         STORE(&mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W);
2791      }
2792      break;
2793
2794   case TGSI_OPCODE_NRM4:
2795      /* 4-component vector normalize */
2796      {
2797         union tgsi_exec_channel tmp, dot;
2798
2799         /* tmp = dp4(src0, src0): */
2800         FETCH( &r[0], 0, CHAN_X );
2801         micro_mul( &tmp, &r[0], &r[0] );
2802
2803         FETCH( &r[1], 0, CHAN_Y );
2804         micro_mul( &dot, &r[1], &r[1] );
2805         micro_add( &tmp, &tmp, &dot );
2806
2807         FETCH( &r[2], 0, CHAN_Z );
2808         micro_mul( &dot, &r[2], &r[2] );
2809         micro_add( &tmp, &tmp, &dot );
2810
2811         FETCH( &r[3], 0, CHAN_W );
2812         micro_mul( &dot, &r[3], &r[3] );
2813         micro_add( &tmp, &tmp, &dot );
2814
2815         /* tmp = 1 / sqrt(tmp) */
2816         micro_sqrt( &tmp, &tmp );
2817         micro_div( &tmp, &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &tmp );
2818
2819         FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2820            /* chan = chan * tmp */
2821            micro_mul( &r[chan_index], &tmp, &r[chan_index] );
2822            STORE( &r[chan_index], 0, chan_index );
2823         }
2824      }
2825      break;
2826
2827   case TGSI_OPCODE_DIV:
2828      assert( 0 );
2829      break;
2830
2831   case TGSI_OPCODE_DP2:
2832      FETCH( &r[0], 0, CHAN_X );
2833      FETCH( &r[1], 1, CHAN_X );
2834      micro_mul( &r[0], &r[0], &r[1] );
2835
2836      FETCH( &r[1], 0, CHAN_Y );
2837      FETCH( &r[2], 1, CHAN_Y );
2838      micro_mul( &r[1], &r[1], &r[2] );
2839      micro_add( &r[0], &r[0], &r[1] );
2840
2841      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2842         STORE( &r[0], 0, chan_index );
2843      }
2844      break;
2845
2846   case TGSI_OPCODE_IF:
2847      /* push CondMask */
2848      assert(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING);
2849      mach->CondStack[mach->CondStackTop++] = mach->CondMask;
2850      FETCH( &r[0], 0, CHAN_X );
2851      /* update CondMask */
2852      if( ! r[0].u[0] ) {
2853         mach->CondMask &= ~0x1;
2854      }
2855      if( ! r[0].u[1] ) {
2856         mach->CondMask &= ~0x2;
2857      }
2858      if( ! r[0].u[2] ) {
2859         mach->CondMask &= ~0x4;
2860      }
2861      if( ! r[0].u[3] ) {
2862         mach->CondMask &= ~0x8;
2863      }
2864      UPDATE_EXEC_MASK(mach);
2865      /* Todo: If CondMask==0, jump to ELSE */
2866      break;
2867
2868   case TGSI_OPCODE_ELSE:
2869      /* invert CondMask wrt previous mask */
2870      {
2871         uint prevMask;
2872         assert(mach->CondStackTop > 0);
2873         prevMask = mach->CondStack[mach->CondStackTop - 1];
2874         mach->CondMask = ~mach->CondMask & prevMask;
2875         UPDATE_EXEC_MASK(mach);
2876         /* Todo: If CondMask==0, jump to ENDIF */
2877      }
2878      break;
2879
2880   case TGSI_OPCODE_ENDIF:
2881      /* pop CondMask */
2882      assert(mach->CondStackTop > 0);
2883      mach->CondMask = mach->CondStack[--mach->CondStackTop];
2884      UPDATE_EXEC_MASK(mach);
2885      break;
2886
2887   case TGSI_OPCODE_END:
2888      /* halt execution */
2889      *pc = -1;
2890      break;
2891
2892   case TGSI_OPCODE_REP:
2893      assert (0);
2894      break;
2895
2896   case TGSI_OPCODE_ENDREP:
2897       assert (0);
2898       break;
2899
2900   case TGSI_OPCODE_PUSHA:
2901      assert (0);
2902      break;
2903
2904   case TGSI_OPCODE_POPA:
2905      assert (0);
2906      break;
2907
2908   case TGSI_OPCODE_CEIL:
2909      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2910         FETCH( &r[0], 0, chan_index );
2911         micro_ceil(&d[chan_index], &r[0]);
2912      }
2913      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2914         STORE(&d[chan_index], 0, chan_index);
2915      }
2916      break;
2917
2918   case TGSI_OPCODE_I2F:
2919      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2920         FETCH( &r[0], 0, chan_index );
2921         micro_i2f(&d[chan_index], &r[0]);
2922      }
2923      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2924         STORE(&d[chan_index], 0, chan_index);
2925      }
2926      break;
2927
2928   case TGSI_OPCODE_NOT:
2929      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2930         FETCH( &r[0], 0, chan_index );
2931         micro_not(&d[chan_index], &r[0]);
2932      }
2933      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2934         STORE(&d[chan_index], 0, chan_index);
2935      }
2936      break;
2937
2938   case TGSI_OPCODE_TRUNC:
2939      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2940         FETCH( &r[0], 0, chan_index );
2941         micro_trunc(&d[chan_index], &r[0]);
2942      }
2943      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2944         STORE(&d[chan_index], 0, chan_index);
2945      }
2946      break;
2947
2948   case TGSI_OPCODE_SHL:
2949      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2950         FETCH( &r[0], 0, chan_index );
2951         FETCH( &r[1], 1, chan_index );
2952         micro_shl(&d[chan_index], &r[0], &r[1]);
2953      }
2954      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2955         STORE(&d[chan_index], 0, chan_index);
2956      }
2957      break;
2958
2959   case TGSI_OPCODE_SHR:
2960      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2961         FETCH( &r[0], 0, chan_index );
2962         FETCH( &r[1], 1, chan_index );
2963         micro_ishr(&d[chan_index], &r[0], &r[1]);
2964      }
2965      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2966         STORE(&d[chan_index], 0, chan_index);
2967      }
2968      break;
2969
2970   case TGSI_OPCODE_AND:
2971      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2972         FETCH( &r[0], 0, chan_index );
2973         FETCH( &r[1], 1, chan_index );
2974         micro_and(&d[chan_index], &r[0], &r[1]);
2975      }
2976      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2977         STORE(&d[chan_index], 0, chan_index);
2978      }
2979      break;
2980
2981   case TGSI_OPCODE_OR:
2982      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2983         FETCH( &r[0], 0, chan_index );
2984         FETCH( &r[1], 1, chan_index );
2985         micro_or(&d[chan_index], &r[0], &r[1]);
2986      }
2987      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2988         STORE(&d[chan_index], 0, chan_index);
2989      }
2990      break;
2991
2992   case TGSI_OPCODE_MOD:
2993      assert (0);
2994      break;
2995
2996   case TGSI_OPCODE_XOR:
2997      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2998         FETCH( &r[0], 0, chan_index );
2999         FETCH( &r[1], 1, chan_index );
3000         micro_xor(&d[chan_index], &r[0], &r[1]);
3001      }
3002      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
3003         STORE(&d[chan_index], 0, chan_index);
3004      }
3005      break;
3006
3007   case TGSI_OPCODE_SAD:
3008      assert (0);
3009      break;
3010
3011   case TGSI_OPCODE_TXF:
3012      assert (0);
3013      break;
3014
3015   case TGSI_OPCODE_TXQ:
3016      assert (0);
3017      break;
3018
3019   case TGSI_OPCODE_EMIT:
3020      mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] += 16;
3021      mach->Primitives[mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]]++;
3022      break;
3023
3024   case TGSI_OPCODE_ENDPRIM:
3025      mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]++;
3026      mach->Primitives[mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]] = 0;
3027      break;
3028
3029   case TGSI_OPCODE_BGNFOR:
3030      assert(mach->LoopCounterStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
3031      for (chan_index = 0; chan_index < 3; chan_index++) {
3032         FETCH( &mach->LoopCounterStack[mach->LoopCounterStackTop].xyzw[chan_index], 0, chan_index );
3033      }
3034      STORE( &mach->LoopCounterStack[mach->LoopCounterStackTop].xyzw[CHAN_Y], 0, CHAN_X );
3035      ++mach->LoopCounterStackTop;
3036      /* fall-through (for now) */
3037   case TGSI_OPCODE_BGNLOOP:
3038      /* push LoopMask and ContMasks */
3039      assert(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
3040      mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask;
3041      assert(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
3042      mach->ContStack[mach->ContStackTop++] = mach->ContMask;
3043      assert(mach->LoopLabelStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
3044      mach->LoopLabelStack[mach->LoopLabelStackTop++] = *pc - 1;
3045      break;
3046
3047   case TGSI_OPCODE_ENDFOR:
3048      assert(mach->LoopCounterStackTop > 0);
3049      micro_sub( &mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_X],
3050                 &mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_X],
3051                 &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C] );
3052      /* update LoopMask */
3053      if( mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_X].f[0] <= 0) {
3054         mach->LoopMask &= ~0x1;
3055      }
3056      if( mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_X].f[1] <= 0 ) {
3057         mach->LoopMask &= ~0x2;
3058      }
3059      if( mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_X].f[2] <= 0 ) {
3060         mach->LoopMask &= ~0x4;
3061      }
3062      if( mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_X].f[3] <= 0 ) {
3063         mach->LoopMask &= ~0x8;
3064      }
3065      micro_add( &mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y],
3066                 &mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y],
3067                 &mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Z]);
3068      assert(mach->LoopLabelStackTop > 0);
3069      inst = mach->Instructions + mach->LoopLabelStack[mach->LoopLabelStackTop - 1];
3070      STORE( &mach->LoopCounterStack[mach->LoopCounterStackTop].xyzw[CHAN_Y], 0, CHAN_X );
3071      /* Restore ContMask, but don't pop */
3072      assert(mach->ContStackTop > 0);
3073      mach->ContMask = mach->ContStack[mach->ContStackTop - 1];
3074      UPDATE_EXEC_MASK(mach);
3075      if (mach->ExecMask) {
3076         /* repeat loop: jump to instruction just past BGNLOOP */
3077         assert(mach->LoopLabelStackTop > 0);
3078         *pc = mach->LoopLabelStack[mach->LoopLabelStackTop - 1] + 1;
3079      }
3080      else {
3081         /* exit loop: pop LoopMask */
3082         assert(mach->LoopStackTop > 0);
3083         mach->LoopMask = mach->LoopStack[--mach->LoopStackTop];
3084         /* pop ContMask */
3085         assert(mach->ContStackTop > 0);
3086         mach->ContMask = mach->ContStack[--mach->ContStackTop];
3087         assert(mach->LoopLabelStackTop > 0);
3088         --mach->LoopLabelStackTop;
3089         assert(mach->LoopCounterStackTop > 0);
3090         --mach->LoopCounterStackTop;
3091      }
3092      UPDATE_EXEC_MASK(mach);
3093      break;
3094
3095   case TGSI_OPCODE_ENDLOOP:
3096      /* Restore ContMask, but don't pop */
3097      assert(mach->ContStackTop > 0);
3098      mach->ContMask = mach->ContStack[mach->ContStackTop - 1];
3099      UPDATE_EXEC_MASK(mach);
3100      if (mach->ExecMask) {
3101         /* repeat loop: jump to instruction just past BGNLOOP */
3102         assert(mach->LoopLabelStackTop > 0);
3103         *pc = mach->LoopLabelStack[mach->LoopLabelStackTop - 1] + 1;
3104      }
3105      else {
3106         /* exit loop: pop LoopMask */
3107         assert(mach->LoopStackTop > 0);
3108         mach->LoopMask = mach->LoopStack[--mach->LoopStackTop];
3109         /* pop ContMask */
3110         assert(mach->ContStackTop > 0);
3111         mach->ContMask = mach->ContStack[--mach->ContStackTop];
3112         assert(mach->LoopLabelStackTop > 0);
3113         --mach->LoopLabelStackTop;
3114      }
3115      UPDATE_EXEC_MASK(mach);
3116      break;
3117
3118   case TGSI_OPCODE_BRK:
3119      /* turn off loop channels for each enabled exec channel */
3120      mach->LoopMask &= ~mach->ExecMask;
3121      /* Todo: if mach->LoopMask == 0, jump to end of loop */
3122      UPDATE_EXEC_MASK(mach);
3123      break;
3124
3125   case TGSI_OPCODE_CONT:
3126      /* turn off cont channels for each enabled exec channel */
3127      mach->ContMask &= ~mach->ExecMask;
3128      /* Todo: if mach->LoopMask == 0, jump to end of loop */
3129      UPDATE_EXEC_MASK(mach);
3130      break;
3131
3132   case TGSI_OPCODE_BGNSUB:
3133      /* no-op */
3134      break;
3135
3136   case TGSI_OPCODE_ENDSUB:
3137      /* no-op */
3138      break;
3139
3140   case TGSI_OPCODE_NOP:
3141      break;
3142
3143   default:
3144      assert( 0 );
3145   }
3146}
3147
3148#define DEBUG_EXECUTION 0
3149
3150
3151/**
3152 * Run TGSI interpreter.
3153 * \return bitmask of "alive" quad components
3154 */
3155uint
3156tgsi_exec_machine_run( struct tgsi_exec_machine *mach )
3157{
3158   uint i;
3159   int pc = 0;
3160
3161   mach->CondMask = 0xf;
3162   mach->LoopMask = 0xf;
3163   mach->ContMask = 0xf;
3164   mach->FuncMask = 0xf;
3165   mach->ExecMask = 0xf;
3166
3167   assert(mach->CondStackTop == 0);
3168   assert(mach->LoopStackTop == 0);
3169   assert(mach->ContStackTop == 0);
3170   assert(mach->CallStackTop == 0);
3171
3172   mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] = 0;
3173   mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] = 0;
3174
3175   if( mach->Processor == TGSI_PROCESSOR_GEOMETRY ) {
3176      mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0] = 0;
3177      mach->Primitives[0] = 0;
3178   }
3179
3180   for (i = 0; i < QUAD_SIZE; i++) {
3181      mach->Temps[TEMP_CC_I].xyzw[TEMP_CC_C].u[i] =
3182         (TGSI_EXEC_CC_EQ << TGSI_EXEC_CC_X_SHIFT) |
3183         (TGSI_EXEC_CC_EQ << TGSI_EXEC_CC_Y_SHIFT) |
3184         (TGSI_EXEC_CC_EQ << TGSI_EXEC_CC_Z_SHIFT) |
3185         (TGSI_EXEC_CC_EQ << TGSI_EXEC_CC_W_SHIFT);
3186   }
3187
3188   /* execute declarations (interpolants) */
3189   for (i = 0; i < mach->NumDeclarations; i++) {
3190      exec_declaration( mach, mach->Declarations+i );
3191   }
3192
3193   {
3194#if DEBUG_EXECUTION
3195      struct tgsi_exec_vector temps[TGSI_EXEC_NUM_TEMPS + TGSI_EXEC_NUM_TEMP_EXTRAS];
3196      struct tgsi_exec_vector outputs[PIPE_MAX_ATTRIBS];
3197      uint inst = 1;
3198
3199      memcpy(temps, mach->Temps, sizeof(temps));
3200      memcpy(outputs, mach->Outputs, sizeof(outputs));
3201#endif
3202
3203      /* execute instructions, until pc is set to -1 */
3204      while (pc != -1) {
3205
3206#if DEBUG_EXECUTION
3207         uint i;
3208
3209         tgsi_dump_instruction(&mach->Instructions[pc], inst++);
3210#endif
3211
3212         assert(pc < (int) mach->NumInstructions);
3213         exec_instruction(mach, mach->Instructions + pc, &pc);
3214
3215#if DEBUG_EXECUTION
3216         for (i = 0; i < TGSI_EXEC_NUM_TEMPS + TGSI_EXEC_NUM_TEMP_EXTRAS; i++) {
3217            if (memcmp(&temps[i], &mach->Temps[i], sizeof(temps[i]))) {
3218               uint j;
3219
3220               memcpy(&temps[i], &mach->Temps[i], sizeof(temps[i]));
3221               debug_printf("TEMP[%2u] = ", i);
3222               for (j = 0; j < 4; j++) {
3223                  if (j > 0) {
3224                     debug_printf("           ");
3225                  }
3226                  debug_printf("(%6f, %6f, %6f, %6f)\n",
3227                               temps[i].xyzw[0].f[j],
3228                               temps[i].xyzw[1].f[j],
3229                               temps[i].xyzw[2].f[j],
3230                               temps[i].xyzw[3].f[j]);
3231               }
3232            }
3233         }
3234         for (i = 0; i < PIPE_MAX_ATTRIBS; i++) {
3235            if (memcmp(&outputs[i], &mach->Outputs[i], sizeof(outputs[i]))) {
3236               uint j;
3237
3238               memcpy(&outputs[i], &mach->Outputs[i], sizeof(outputs[i]));
3239               debug_printf("OUT[%2u] =  ", i);
3240               for (j = 0; j < 4; j++) {
3241                  if (j > 0) {
3242                     debug_printf("           ");
3243                  }
3244                  debug_printf("{%6f, %6f, %6f, %6f}\n",
3245                               outputs[i].xyzw[0].f[j],
3246                               outputs[i].xyzw[1].f[j],
3247                               outputs[i].xyzw[2].f[j],
3248                               outputs[i].xyzw[3].f[j]);
3249               }
3250            }
3251         }
3252#endif
3253      }
3254   }
3255
3256#if 0
3257   /* we scale from floats in [0,1] to Zbuffer ints in sp_quad_depth_test.c */
3258   if (mach->Processor == TGSI_PROCESSOR_FRAGMENT) {
3259      /*
3260       * Scale back depth component.
3261       */
3262      for (i = 0; i < 4; i++)
3263         mach->Outputs[0].xyzw[2].f[i] *= ctx->DrawBuffer->_DepthMaxF;
3264   }
3265#endif
3266
3267   return ~mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
3268}
3269