tgsi_exec.c revision 4e1d51786e0657c7430d731ac464f2a73e32eddf
1/**************************************************************************
2 *
3 * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28/**
29 * TGSI interpreter/executor.
30 *
31 * Flow control information:
32 *
33 * Since we operate on 'quads' (4 pixels or 4 vertices in parallel)
34 * flow control statements (IF/ELSE/ENDIF, LOOP/ENDLOOP) require special
35 * care since a condition may be true for some quad components but false
36 * for other components.
37 *
38 * We basically execute all statements (even if they're in the part of
39 * an IF/ELSE clause that's "not taken") and use a special mask to
40 * control writing to destination registers.  This is the ExecMask.
41 * See store_dest().
42 *
 * The ExecMask is computed from four other masks (CondMask, LoopMask,
 * ContMask and FuncMask; see UPDATE_EXEC_MASK).  The first three are
 * controlled by the flow control instructions (namely: IF/ELSE/ENDIF,
 * LOOP/ENDLOOP and CONT).
46 *
47 *
48 * Authors:
49 *   Michal Krol
50 *   Brian Paul
51 */
52
53#include "pipe/p_compiler.h"
54#include "pipe/p_state.h"
55#include "pipe/p_shader_tokens.h"
56#include "tgsi/tgsi_dump.h"
57#include "tgsi/tgsi_parse.h"
58#include "tgsi/tgsi_util.h"
59#include "tgsi_exec.h"
60#include "util/u_memory.h"
61#include "util/u_math.h"
62
/**
 * When non-zero, micro_exp2(), micro_lg2() and micro_pow() use the
 * util_fast_*() routines instead of powf()/logf().
 */
#define FAST_MATH 1

/** for tgsi_full_instruction::Flags */
#define SOA_DEPENDENCY_FLAG 0x1

/* Lane indices within a 4-wide channel; used by micro_ddx()/micro_ddy(). */
#define TILE_TOP_LEFT     0
#define TILE_TOP_RIGHT    1
#define TILE_BOTTOM_LEFT  2
#define TILE_BOTTOM_RIGHT 3

/* Component indices (presumably into tgsi_exec_vector::xyzw[] -- confirm). */
#define CHAN_X  0
#define CHAN_Y  1
#define CHAN_Z  2
#define CHAN_W  3
77
/*
 * Shorthand locations of various utility registers (_I = Index, _C = Channel)
 *
 * A pair is used as mach->Temps[X_I].xyzw[X_C]; see
 * tgsi_exec_machine_create(), which fills in the constant-valued ones.
 */
#define TEMP_0_I           TGSI_EXEC_TEMP_00000000_I
#define TEMP_0_C           TGSI_EXEC_TEMP_00000000_C
#define TEMP_7F_I          TGSI_EXEC_TEMP_7FFFFFFF_I
#define TEMP_7F_C          TGSI_EXEC_TEMP_7FFFFFFF_C
#define TEMP_80_I          TGSI_EXEC_TEMP_80000000_I
#define TEMP_80_C          TGSI_EXEC_TEMP_80000000_C
#define TEMP_FF_I          TGSI_EXEC_TEMP_FFFFFFFF_I
#define TEMP_FF_C          TGSI_EXEC_TEMP_FFFFFFFF_C
#define TEMP_1_I           TGSI_EXEC_TEMP_ONE_I
#define TEMP_1_C           TGSI_EXEC_TEMP_ONE_C
#define TEMP_2_I           TGSI_EXEC_TEMP_TWO_I
#define TEMP_2_C           TGSI_EXEC_TEMP_TWO_C
#define TEMP_128_I         TGSI_EXEC_TEMP_128_I
#define TEMP_128_C         TGSI_EXEC_TEMP_128_C
#define TEMP_M128_I        TGSI_EXEC_TEMP_MINUS_128_I
#define TEMP_M128_C        TGSI_EXEC_TEMP_MINUS_128_C
#define TEMP_KILMASK_I     TGSI_EXEC_TEMP_KILMASK_I
#define TEMP_KILMASK_C     TGSI_EXEC_TEMP_KILMASK_C
#define TEMP_OUTPUT_I      TGSI_EXEC_TEMP_OUTPUT_I
#define TEMP_OUTPUT_C      TGSI_EXEC_TEMP_OUTPUT_C
#define TEMP_PRIMITIVE_I   TGSI_EXEC_TEMP_PRIMITIVE_I
#define TEMP_PRIMITIVE_C   TGSI_EXEC_TEMP_PRIMITIVE_C
#define TEMP_CC_I          TGSI_EXEC_TEMP_CC_I
#define TEMP_CC_C          TGSI_EXEC_TEMP_CC_C
#define TEMP_3_I           TGSI_EXEC_TEMP_THREE_I
#define TEMP_3_C           TGSI_EXEC_TEMP_THREE_C
#define TEMP_HALF_I        TGSI_EXEC_TEMP_HALF_I
#define TEMP_HALF_C        TGSI_EXEC_TEMP_HALF_C
#define TEMP_R0            TGSI_EXEC_TEMP_R0
110
/** Non-zero if channel CHAN is set in the write mask of INST's first dest. */
#define IS_CHANNEL_ENABLED(INST, CHAN)\
   ((INST).FullDstRegisters[0].DstRegister.WriteMask & (1 << (CHAN)))

/** Same test as IS_CHANNEL_ENABLED, but for INST's second dest register. */
#define IS_CHANNEL_ENABLED2(INST, CHAN)\
   ((INST).FullDstRegisters[1].DstRegister.WriteMask & (1 << (CHAN)))

/**
 * Loop header: runs the following statement once for each CHAN in
 * [0, NUM_CHANNELS) that is enabled in the first dest's write mask.
 * Expands to a bare for + if, so an 'else' placed directly after the
 * body would bind to the hidden 'if'.
 */
#define FOR_EACH_ENABLED_CHANNEL(INST, CHAN)\
   for (CHAN = 0; CHAN < NUM_CHANNELS; CHAN++)\
      if (IS_CHANNEL_ENABLED( INST, CHAN ))

/** As FOR_EACH_ENABLED_CHANNEL, but tests the second dest's write mask. */
#define FOR_EACH_ENABLED_CHANNEL2(INST, CHAN)\
   for (CHAN = 0; CHAN < NUM_CHANNELS; CHAN++)\
      if (IS_CHANNEL_ENABLED2( INST, CHAN ))
124
125
/**
 * Recompute the execution mask as the bitwise AND of the conditional,
 * loop, continue and function masks.
 *
 * MACH is a pointer expression; it is parenthesized so that arguments such
 * as &machine expand correctly.  Note that MACH is evaluated several times,
 * so it must not have side effects.
 */
#define UPDATE_EXEC_MASK(MACH) \
      ((MACH)->ExecMask = (MACH)->CondMask & (MACH)->LoopMask & \
                          (MACH)->ContMask & (MACH)->FuncMask)
129
130
/**
 * Channel constant whose initializer sets the union's first member to
 * four 0.0 values.
 */
static const union tgsi_exec_channel ZeroVec =
   { { 0.0, 0.0, 0.0, 0.0 } };
133
134
135#ifdef DEBUG
/**
 * Debug helper: assert that none of the four float lanes of \p chan is
 * flagged by util_is_inf_or_nan().
 */
static void
check_inf_or_nan(const union tgsi_exec_channel *chan)
{
   assert(!util_is_inf_or_nan(chan->f[0]));
   assert(!util_is_inf_or_nan(chan->f[1]));
   assert(!util_is_inf_or_nan(chan->f[2]));
   assert(!util_is_inf_or_nan(chan->f[3]));
}
144#endif
145
146
147#ifdef DEBUG
148static void
149print_chan(const char *msg, const union tgsi_exec_channel *chan)
150{
151   debug_printf("%s = {%f, %f, %f, %f}\n",
152                msg, chan->f[0], chan->f[1], chan->f[2], chan->f[3]);
153}
154#endif
155
156
157#ifdef DEBUG
158static void
159print_temp(const struct tgsi_exec_machine *mach, uint index)
160{
161   const struct tgsi_exec_vector *tmp = &mach->Temps[index];
162   int i;
163   debug_printf("Temp[%u] =\n", index);
164   for (i = 0; i < 4; i++) {
165      debug_printf("  %c: { %f, %f, %f, %f }\n",
166                   "XYZW"[i],
167                   tmp->xyzw[i].f[0],
168                   tmp->xyzw[i].f[1],
169                   tmp->xyzw[i].f[2],
170                   tmp->xyzw[i].f[3]);
171   }
172}
173#endif
174
175
176/**
177 * Check if there's a potential src/dst register data dependency when
178 * using SOA execution.
179 * Example:
180 *   MOV T, T.yxwz;
181 * This would expand into:
182 *   MOV t0, t1;
183 *   MOV t1, t0;
184 *   MOV t2, t3;
185 *   MOV t3, t2;
186 * The second instruction will have the wrong value for t0 if executed as-is.
187 */
188boolean
189tgsi_check_soa_dependencies(const struct tgsi_full_instruction *inst)
190{
191   uint i, chan;
192
193   uint writemask = inst->FullDstRegisters[0].DstRegister.WriteMask;
194   if (writemask == TGSI_WRITEMASK_X ||
195       writemask == TGSI_WRITEMASK_Y ||
196       writemask == TGSI_WRITEMASK_Z ||
197       writemask == TGSI_WRITEMASK_W ||
198       writemask == TGSI_WRITEMASK_NONE) {
199      /* no chance of data dependency */
200      return FALSE;
201   }
202
203   /* loop over src regs */
204   for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
205      if ((inst->FullSrcRegisters[i].SrcRegister.File ==
206           inst->FullDstRegisters[0].DstRegister.File) &&
207          (inst->FullSrcRegisters[i].SrcRegister.Index ==
208           inst->FullDstRegisters[0].DstRegister.Index)) {
209         /* loop over dest channels */
210         uint channelsWritten = 0x0;
211         FOR_EACH_ENABLED_CHANNEL(*inst, chan) {
212            /* check if we're reading a channel that's been written */
213            uint swizzle = tgsi_util_get_full_src_register_extswizzle(&inst->FullSrcRegisters[i], chan);
214            if (swizzle <= TGSI_SWIZZLE_W &&
215                (channelsWritten & (1 << swizzle))) {
216               return TRUE;
217            }
218
219            channelsWritten |= (1 << chan);
220         }
221      }
222   }
223   return FALSE;
224}
225
226
227/**
228 * Initialize machine state by expanding tokens to full instructions,
229 * allocating temporary storage, setting up constants, etc.
230 * After this, we can call tgsi_exec_machine_run() many times.
231 */
232void
233tgsi_exec_machine_bind_shader(
234   struct tgsi_exec_machine *mach,
235   const struct tgsi_token *tokens,
236   uint numSamplers,
237   struct tgsi_sampler **samplers)
238{
239   uint k;
240   struct tgsi_parse_context parse;
241   struct tgsi_exec_labels *labels = &mach->Labels;
242   struct tgsi_full_instruction *instructions;
243   struct tgsi_full_declaration *declarations;
244   uint maxInstructions = 10, numInstructions = 0;
245   uint maxDeclarations = 10, numDeclarations = 0;
246   uint instno = 0;
247
248#if 0
249   tgsi_dump(tokens, 0);
250#endif
251
252   util_init_math();
253
254   mach->Tokens = tokens;
255   mach->Samplers = samplers;
256
257   k = tgsi_parse_init (&parse, mach->Tokens);
258   if (k != TGSI_PARSE_OK) {
259      debug_printf( "Problem parsing!\n" );
260      return;
261   }
262
263   mach->Processor = parse.FullHeader.Processor.Processor;
264   mach->ImmLimit = 0;
265   labels->count = 0;
266
267   declarations = (struct tgsi_full_declaration *)
268      MALLOC( maxDeclarations * sizeof(struct tgsi_full_declaration) );
269
270   if (!declarations) {
271      return;
272   }
273
274   instructions = (struct tgsi_full_instruction *)
275      MALLOC( maxInstructions * sizeof(struct tgsi_full_instruction) );
276
277   if (!instructions) {
278      FREE( declarations );
279      return;
280   }
281
282   while( !tgsi_parse_end_of_tokens( &parse ) ) {
283      uint pointer = parse.Position;
284      uint i;
285
286      tgsi_parse_token( &parse );
287      switch( parse.FullToken.Token.Type ) {
288      case TGSI_TOKEN_TYPE_DECLARATION:
289         /* save expanded declaration */
290         if (numDeclarations == maxDeclarations) {
291            declarations = REALLOC(declarations,
292                                   maxDeclarations
293                                   * sizeof(struct tgsi_full_declaration),
294                                   (maxDeclarations + 10)
295                                   * sizeof(struct tgsi_full_declaration));
296            maxDeclarations += 10;
297         }
298         memcpy(declarations + numDeclarations,
299                &parse.FullToken.FullDeclaration,
300                sizeof(declarations[0]));
301         numDeclarations++;
302         break;
303
304      case TGSI_TOKEN_TYPE_IMMEDIATE:
305         {
306            uint size = parse.FullToken.FullImmediate.Immediate.NrTokens - 1;
307            assert( size <= 4 );
308            assert( mach->ImmLimit + 1 <= TGSI_EXEC_NUM_IMMEDIATES );
309
310            for( i = 0; i < size; i++ ) {
311               mach->Imms[mach->ImmLimit][i] =
312		  parse.FullToken.FullImmediate.u[i].Float;
313            }
314            mach->ImmLimit += 1;
315         }
316         break;
317
318      case TGSI_TOKEN_TYPE_INSTRUCTION:
319         assert( labels->count < MAX_LABELS );
320
321         labels->labels[labels->count][0] = instno;
322         labels->labels[labels->count][1] = pointer;
323         labels->count++;
324
325         /* save expanded instruction */
326         if (numInstructions == maxInstructions) {
327            instructions = REALLOC(instructions,
328                                   maxInstructions
329                                   * sizeof(struct tgsi_full_instruction),
330                                   (maxInstructions + 10)
331                                   * sizeof(struct tgsi_full_instruction));
332            maxInstructions += 10;
333         }
334
335         if (tgsi_check_soa_dependencies(&parse.FullToken.FullInstruction)) {
336            uint opcode = parse.FullToken.FullInstruction.Instruction.Opcode;
337            parse.FullToken.FullInstruction.Flags = SOA_DEPENDENCY_FLAG;
338            /* XXX we only handle SOA dependencies properly for MOV/SWZ
339             * at this time!
340             */
341            if (opcode != TGSI_OPCODE_MOV && opcode != TGSI_OPCODE_SWZ) {
342               debug_printf("Warning: SOA dependency in instruction"
343                            " is not handled:\n");
344               tgsi_dump_instruction(&parse.FullToken.FullInstruction,
345                                     numInstructions);
346            }
347         }
348
349         memcpy(instructions + numInstructions,
350                &parse.FullToken.FullInstruction,
351                sizeof(instructions[0]));
352
353         numInstructions++;
354         break;
355
356      default:
357         assert( 0 );
358      }
359   }
360   tgsi_parse_free (&parse);
361
362   if (mach->Declarations) {
363      FREE( mach->Declarations );
364   }
365   mach->Declarations = declarations;
366   mach->NumDeclarations = numDeclarations;
367
368   if (mach->Instructions) {
369      FREE( mach->Instructions );
370   }
371   mach->Instructions = instructions;
372   mach->NumInstructions = numInstructions;
373}
374
375
376struct tgsi_exec_machine *
377tgsi_exec_machine_create( void )
378{
379   struct tgsi_exec_machine *mach;
380   uint i;
381
382   mach = align_malloc( sizeof *mach, 16 );
383   if (!mach)
384      goto fail;
385
386   memset(mach, 0, sizeof(*mach));
387
388   mach->Addrs = &mach->Temps[TGSI_EXEC_TEMP_ADDR];
389
390   /* Setup constants. */
391   for( i = 0; i < 4; i++ ) {
392      mach->Temps[TEMP_0_I].xyzw[TEMP_0_C].u[i] = 0x00000000;
393      mach->Temps[TEMP_7F_I].xyzw[TEMP_7F_C].u[i] = 0x7FFFFFFF;
394      mach->Temps[TEMP_80_I].xyzw[TEMP_80_C].u[i] = 0x80000000;
395      mach->Temps[TEMP_FF_I].xyzw[TEMP_FF_C].u[i] = 0xFFFFFFFF;
396      mach->Temps[TEMP_1_I].xyzw[TEMP_1_C].f[i] = 1.0f;
397      mach->Temps[TEMP_2_I].xyzw[TEMP_2_C].f[i] = 2.0f;
398      mach->Temps[TEMP_128_I].xyzw[TEMP_128_C].f[i] = 128.0f;
399      mach->Temps[TEMP_M128_I].xyzw[TEMP_M128_C].f[i] = -128.0f;
400      mach->Temps[TEMP_3_I].xyzw[TEMP_3_C].f[i] = 3.0f;
401      mach->Temps[TEMP_HALF_I].xyzw[TEMP_HALF_C].f[i] = 0.5f;
402   }
403
404#ifdef DEBUG
405   /* silence warnings */
406   (void) print_chan;
407   (void) print_temp;
408#endif
409
410   return mach;
411
412fail:
413   align_free(mach);
414   return NULL;
415}
416
417
418void
419tgsi_exec_machine_destroy(struct tgsi_exec_machine *mach)
420{
421   if (mach) {
422      FREE(mach->Instructions);
423      FREE(mach->Declarations);
424   }
425
426   align_free(mach);
427}
428
429
430static void
431micro_abs(
432   union tgsi_exec_channel *dst,
433   const union tgsi_exec_channel *src )
434{
435   dst->f[0] = fabsf( src->f[0] );
436   dst->f[1] = fabsf( src->f[1] );
437   dst->f[2] = fabsf( src->f[2] );
438   dst->f[3] = fabsf( src->f[3] );
439}
440
441static void
442micro_add(
443   union tgsi_exec_channel *dst,
444   const union tgsi_exec_channel *src0,
445   const union tgsi_exec_channel *src1 )
446{
447   dst->f[0] = src0->f[0] + src1->f[0];
448   dst->f[1] = src0->f[1] + src1->f[1];
449   dst->f[2] = src0->f[2] + src1->f[2];
450   dst->f[3] = src0->f[3] + src1->f[3];
451}
452
453#if 0
454static void
455micro_iadd(
456   union tgsi_exec_channel *dst,
457   const union tgsi_exec_channel *src0,
458   const union tgsi_exec_channel *src1 )
459{
460   dst->i[0] = src0->i[0] + src1->i[0];
461   dst->i[1] = src0->i[1] + src1->i[1];
462   dst->i[2] = src0->i[2] + src1->i[2];
463   dst->i[3] = src0->i[3] + src1->i[3];
464}
465#endif
466
467static void
468micro_and(
469   union tgsi_exec_channel *dst,
470   const union tgsi_exec_channel *src0,
471   const union tgsi_exec_channel *src1 )
472{
473   dst->u[0] = src0->u[0] & src1->u[0];
474   dst->u[1] = src0->u[1] & src1->u[1];
475   dst->u[2] = src0->u[2] & src1->u[2];
476   dst->u[3] = src0->u[3] & src1->u[3];
477}
478
479static void
480micro_ceil(
481   union tgsi_exec_channel *dst,
482   const union tgsi_exec_channel *src )
483{
484   dst->f[0] = ceilf( src->f[0] );
485   dst->f[1] = ceilf( src->f[1] );
486   dst->f[2] = ceilf( src->f[2] );
487   dst->f[3] = ceilf( src->f[3] );
488}
489
490static void
491micro_cos(
492   union tgsi_exec_channel *dst,
493   const union tgsi_exec_channel *src )
494{
495   dst->f[0] = cosf( src->f[0] );
496   dst->f[1] = cosf( src->f[1] );
497   dst->f[2] = cosf( src->f[2] );
498   dst->f[3] = cosf( src->f[3] );
499}
500
501static void
502micro_ddx(
503   union tgsi_exec_channel *dst,
504   const union tgsi_exec_channel *src )
505{
506   dst->f[0] =
507   dst->f[1] =
508   dst->f[2] =
509   dst->f[3] = src->f[TILE_BOTTOM_RIGHT] - src->f[TILE_BOTTOM_LEFT];
510}
511
512static void
513micro_ddy(
514   union tgsi_exec_channel *dst,
515   const union tgsi_exec_channel *src )
516{
517   dst->f[0] =
518   dst->f[1] =
519   dst->f[2] =
520   dst->f[3] = src->f[TILE_TOP_LEFT] - src->f[TILE_BOTTOM_LEFT];
521}
522
523static void
524micro_div(
525   union tgsi_exec_channel *dst,
526   const union tgsi_exec_channel *src0,
527   const union tgsi_exec_channel *src1 )
528{
529   if (src1->f[0] != 0) {
530      dst->f[0] = src0->f[0] / src1->f[0];
531   }
532   if (src1->f[1] != 0) {
533      dst->f[1] = src0->f[1] / src1->f[1];
534   }
535   if (src1->f[2] != 0) {
536      dst->f[2] = src0->f[2] / src1->f[2];
537   }
538   if (src1->f[3] != 0) {
539      dst->f[3] = src0->f[3] / src1->f[3];
540   }
541}
542
543#if 0
544static void
545micro_udiv(
546   union tgsi_exec_channel *dst,
547   const union tgsi_exec_channel *src0,
548   const union tgsi_exec_channel *src1 )
549{
550   dst->u[0] = src0->u[0] / src1->u[0];
551   dst->u[1] = src0->u[1] / src1->u[1];
552   dst->u[2] = src0->u[2] / src1->u[2];
553   dst->u[3] = src0->u[3] / src1->u[3];
554}
555#endif
556
557static void
558micro_eq(
559   union tgsi_exec_channel *dst,
560   const union tgsi_exec_channel *src0,
561   const union tgsi_exec_channel *src1,
562   const union tgsi_exec_channel *src2,
563   const union tgsi_exec_channel *src3 )
564{
565   dst->f[0] = src0->f[0] == src1->f[0] ? src2->f[0] : src3->f[0];
566   dst->f[1] = src0->f[1] == src1->f[1] ? src2->f[1] : src3->f[1];
567   dst->f[2] = src0->f[2] == src1->f[2] ? src2->f[2] : src3->f[2];
568   dst->f[3] = src0->f[3] == src1->f[3] ? src2->f[3] : src3->f[3];
569}
570
571#if 0
572static void
573micro_ieq(
574   union tgsi_exec_channel *dst,
575   const union tgsi_exec_channel *src0,
576   const union tgsi_exec_channel *src1,
577   const union tgsi_exec_channel *src2,
578   const union tgsi_exec_channel *src3 )
579{
580   dst->i[0] = src0->i[0] == src1->i[0] ? src2->i[0] : src3->i[0];
581   dst->i[1] = src0->i[1] == src1->i[1] ? src2->i[1] : src3->i[1];
582   dst->i[2] = src0->i[2] == src1->i[2] ? src2->i[2] : src3->i[2];
583   dst->i[3] = src0->i[3] == src1->i[3] ? src2->i[3] : src3->i[3];
584}
585#endif
586
587static void
588micro_exp2(
589   union tgsi_exec_channel *dst,
590   const union tgsi_exec_channel *src)
591{
592#if FAST_MATH
593   dst->f[0] = util_fast_exp2( src->f[0] );
594   dst->f[1] = util_fast_exp2( src->f[1] );
595   dst->f[2] = util_fast_exp2( src->f[2] );
596   dst->f[3] = util_fast_exp2( src->f[3] );
597#else
598   dst->f[0] = powf( 2.0f, src->f[0] );
599   dst->f[1] = powf( 2.0f, src->f[1] );
600   dst->f[2] = powf( 2.0f, src->f[2] );
601   dst->f[3] = powf( 2.0f, src->f[3] );
602#endif
603}
604
605#if 0
606static void
607micro_f2ut(
608   union tgsi_exec_channel *dst,
609   const union tgsi_exec_channel *src )
610{
611   dst->u[0] = (uint) src->f[0];
612   dst->u[1] = (uint) src->f[1];
613   dst->u[2] = (uint) src->f[2];
614   dst->u[3] = (uint) src->f[3];
615}
616#endif
617
618static void
619micro_float_clamp(union tgsi_exec_channel *dst,
620                  const union tgsi_exec_channel *src)
621{
622   uint i;
623
624   for (i = 0; i < 4; i++) {
625      if (src->f[i] > 0.0f) {
626         if (src->f[i] > 1.884467e+019f)
627            dst->f[i] = 1.884467e+019f;
628         else if (src->f[i] < 5.42101e-020f)
629            dst->f[i] = 5.42101e-020f;
630         else
631            dst->f[i] = src->f[i];
632      }
633      else {
634         if (src->f[i] < -1.884467e+019f)
635            dst->f[i] = -1.884467e+019f;
636         else if (src->f[i] > -5.42101e-020f)
637            dst->f[i] = -5.42101e-020f;
638         else
639            dst->f[i] = src->f[i];
640      }
641   }
642}
643
644static void
645micro_flr(
646   union tgsi_exec_channel *dst,
647   const union tgsi_exec_channel *src )
648{
649   dst->f[0] = floorf( src->f[0] );
650   dst->f[1] = floorf( src->f[1] );
651   dst->f[2] = floorf( src->f[2] );
652   dst->f[3] = floorf( src->f[3] );
653}
654
655static void
656micro_frc(
657   union tgsi_exec_channel *dst,
658   const union tgsi_exec_channel *src )
659{
660   dst->f[0] = src->f[0] - floorf( src->f[0] );
661   dst->f[1] = src->f[1] - floorf( src->f[1] );
662   dst->f[2] = src->f[2] - floorf( src->f[2] );
663   dst->f[3] = src->f[3] - floorf( src->f[3] );
664}
665
666static void
667micro_i2f(
668   union tgsi_exec_channel *dst,
669   const union tgsi_exec_channel *src )
670{
671   dst->f[0] = (float) src->i[0];
672   dst->f[1] = (float) src->i[1];
673   dst->f[2] = (float) src->i[2];
674   dst->f[3] = (float) src->i[3];
675}
676
677static void
678micro_lg2(
679   union tgsi_exec_channel *dst,
680   const union tgsi_exec_channel *src )
681{
682#if FAST_MATH
683   dst->f[0] = util_fast_log2( src->f[0] );
684   dst->f[1] = util_fast_log2( src->f[1] );
685   dst->f[2] = util_fast_log2( src->f[2] );
686   dst->f[3] = util_fast_log2( src->f[3] );
687#else
688   dst->f[0] = logf( src->f[0] ) * 1.442695f;
689   dst->f[1] = logf( src->f[1] ) * 1.442695f;
690   dst->f[2] = logf( src->f[2] ) * 1.442695f;
691   dst->f[3] = logf( src->f[3] ) * 1.442695f;
692#endif
693}
694
695static void
696micro_le(
697   union tgsi_exec_channel *dst,
698   const union tgsi_exec_channel *src0,
699   const union tgsi_exec_channel *src1,
700   const union tgsi_exec_channel *src2,
701   const union tgsi_exec_channel *src3 )
702{
703   dst->f[0] = src0->f[0] <= src1->f[0] ? src2->f[0] : src3->f[0];
704   dst->f[1] = src0->f[1] <= src1->f[1] ? src2->f[1] : src3->f[1];
705   dst->f[2] = src0->f[2] <= src1->f[2] ? src2->f[2] : src3->f[2];
706   dst->f[3] = src0->f[3] <= src1->f[3] ? src2->f[3] : src3->f[3];
707}
708
709static void
710micro_lt(
711   union tgsi_exec_channel *dst,
712   const union tgsi_exec_channel *src0,
713   const union tgsi_exec_channel *src1,
714   const union tgsi_exec_channel *src2,
715   const union tgsi_exec_channel *src3 )
716{
717   dst->f[0] = src0->f[0] < src1->f[0] ? src2->f[0] : src3->f[0];
718   dst->f[1] = src0->f[1] < src1->f[1] ? src2->f[1] : src3->f[1];
719   dst->f[2] = src0->f[2] < src1->f[2] ? src2->f[2] : src3->f[2];
720   dst->f[3] = src0->f[3] < src1->f[3] ? src2->f[3] : src3->f[3];
721}
722
723#if 0
724static void
725micro_ilt(
726   union tgsi_exec_channel *dst,
727   const union tgsi_exec_channel *src0,
728   const union tgsi_exec_channel *src1,
729   const union tgsi_exec_channel *src2,
730   const union tgsi_exec_channel *src3 )
731{
732   dst->i[0] = src0->i[0] < src1->i[0] ? src2->i[0] : src3->i[0];
733   dst->i[1] = src0->i[1] < src1->i[1] ? src2->i[1] : src3->i[1];
734   dst->i[2] = src0->i[2] < src1->i[2] ? src2->i[2] : src3->i[2];
735   dst->i[3] = src0->i[3] < src1->i[3] ? src2->i[3] : src3->i[3];
736}
737#endif
738
739#if 0
740static void
741micro_ult(
742   union tgsi_exec_channel *dst,
743   const union tgsi_exec_channel *src0,
744   const union tgsi_exec_channel *src1,
745   const union tgsi_exec_channel *src2,
746   const union tgsi_exec_channel *src3 )
747{
748   dst->u[0] = src0->u[0] < src1->u[0] ? src2->u[0] : src3->u[0];
749   dst->u[1] = src0->u[1] < src1->u[1] ? src2->u[1] : src3->u[1];
750   dst->u[2] = src0->u[2] < src1->u[2] ? src2->u[2] : src3->u[2];
751   dst->u[3] = src0->u[3] < src1->u[3] ? src2->u[3] : src3->u[3];
752}
753#endif
754
755static void
756micro_max(
757   union tgsi_exec_channel *dst,
758   const union tgsi_exec_channel *src0,
759   const union tgsi_exec_channel *src1 )
760{
761   dst->f[0] = src0->f[0] > src1->f[0] ? src0->f[0] : src1->f[0];
762   dst->f[1] = src0->f[1] > src1->f[1] ? src0->f[1] : src1->f[1];
763   dst->f[2] = src0->f[2] > src1->f[2] ? src0->f[2] : src1->f[2];
764   dst->f[3] = src0->f[3] > src1->f[3] ? src0->f[3] : src1->f[3];
765}
766
767#if 0
768static void
769micro_imax(
770   union tgsi_exec_channel *dst,
771   const union tgsi_exec_channel *src0,
772   const union tgsi_exec_channel *src1 )
773{
774   dst->i[0] = src0->i[0] > src1->i[0] ? src0->i[0] : src1->i[0];
775   dst->i[1] = src0->i[1] > src1->i[1] ? src0->i[1] : src1->i[1];
776   dst->i[2] = src0->i[2] > src1->i[2] ? src0->i[2] : src1->i[2];
777   dst->i[3] = src0->i[3] > src1->i[3] ? src0->i[3] : src1->i[3];
778}
779#endif
780
781#if 0
782static void
783micro_umax(
784   union tgsi_exec_channel *dst,
785   const union tgsi_exec_channel *src0,
786   const union tgsi_exec_channel *src1 )
787{
788   dst->u[0] = src0->u[0] > src1->u[0] ? src0->u[0] : src1->u[0];
789   dst->u[1] = src0->u[1] > src1->u[1] ? src0->u[1] : src1->u[1];
790   dst->u[2] = src0->u[2] > src1->u[2] ? src0->u[2] : src1->u[2];
791   dst->u[3] = src0->u[3] > src1->u[3] ? src0->u[3] : src1->u[3];
792}
793#endif
794
795static void
796micro_min(
797   union tgsi_exec_channel *dst,
798   const union tgsi_exec_channel *src0,
799   const union tgsi_exec_channel *src1 )
800{
801   dst->f[0] = src0->f[0] < src1->f[0] ? src0->f[0] : src1->f[0];
802   dst->f[1] = src0->f[1] < src1->f[1] ? src0->f[1] : src1->f[1];
803   dst->f[2] = src0->f[2] < src1->f[2] ? src0->f[2] : src1->f[2];
804   dst->f[3] = src0->f[3] < src1->f[3] ? src0->f[3] : src1->f[3];
805}
806
807#if 0
808static void
809micro_imin(
810   union tgsi_exec_channel *dst,
811   const union tgsi_exec_channel *src0,
812   const union tgsi_exec_channel *src1 )
813{
814   dst->i[0] = src0->i[0] < src1->i[0] ? src0->i[0] : src1->i[0];
815   dst->i[1] = src0->i[1] < src1->i[1] ? src0->i[1] : src1->i[1];
816   dst->i[2] = src0->i[2] < src1->i[2] ? src0->i[2] : src1->i[2];
817   dst->i[3] = src0->i[3] < src1->i[3] ? src0->i[3] : src1->i[3];
818}
819#endif
820
821#if 0
822static void
823micro_umin(
824   union tgsi_exec_channel *dst,
825   const union tgsi_exec_channel *src0,
826   const union tgsi_exec_channel *src1 )
827{
828   dst->u[0] = src0->u[0] < src1->u[0] ? src0->u[0] : src1->u[0];
829   dst->u[1] = src0->u[1] < src1->u[1] ? src0->u[1] : src1->u[1];
830   dst->u[2] = src0->u[2] < src1->u[2] ? src0->u[2] : src1->u[2];
831   dst->u[3] = src0->u[3] < src1->u[3] ? src0->u[3] : src1->u[3];
832}
833#endif
834
835#if 0
836static void
837micro_umod(
838   union tgsi_exec_channel *dst,
839   const union tgsi_exec_channel *src0,
840   const union tgsi_exec_channel *src1 )
841{
842   dst->u[0] = src0->u[0] % src1->u[0];
843   dst->u[1] = src0->u[1] % src1->u[1];
844   dst->u[2] = src0->u[2] % src1->u[2];
845   dst->u[3] = src0->u[3] % src1->u[3];
846}
847#endif
848
849static void
850micro_mul(
851   union tgsi_exec_channel *dst,
852   const union tgsi_exec_channel *src0,
853   const union tgsi_exec_channel *src1 )
854{
855   dst->f[0] = src0->f[0] * src1->f[0];
856   dst->f[1] = src0->f[1] * src1->f[1];
857   dst->f[2] = src0->f[2] * src1->f[2];
858   dst->f[3] = src0->f[3] * src1->f[3];
859}
860
861#if 0
862static void
863micro_imul(
864   union tgsi_exec_channel *dst,
865   const union tgsi_exec_channel *src0,
866   const union tgsi_exec_channel *src1 )
867{
868   dst->i[0] = src0->i[0] * src1->i[0];
869   dst->i[1] = src0->i[1] * src1->i[1];
870   dst->i[2] = src0->i[2] * src1->i[2];
871   dst->i[3] = src0->i[3] * src1->i[3];
872}
873#endif
874
875#if 0
876static void
877micro_imul64(
878   union tgsi_exec_channel *dst0,
879   union tgsi_exec_channel *dst1,
880   const union tgsi_exec_channel *src0,
881   const union tgsi_exec_channel *src1 )
882{
883   dst1->i[0] = src0->i[0] * src1->i[0];
884   dst1->i[1] = src0->i[1] * src1->i[1];
885   dst1->i[2] = src0->i[2] * src1->i[2];
886   dst1->i[3] = src0->i[3] * src1->i[3];
887   dst0->i[0] = 0;
888   dst0->i[1] = 0;
889   dst0->i[2] = 0;
890   dst0->i[3] = 0;
891}
892#endif
893
894#if 0
895static void
896micro_umul64(
897   union tgsi_exec_channel *dst0,
898   union tgsi_exec_channel *dst1,
899   const union tgsi_exec_channel *src0,
900   const union tgsi_exec_channel *src1 )
901{
902   dst1->u[0] = src0->u[0] * src1->u[0];
903   dst1->u[1] = src0->u[1] * src1->u[1];
904   dst1->u[2] = src0->u[2] * src1->u[2];
905   dst1->u[3] = src0->u[3] * src1->u[3];
906   dst0->u[0] = 0;
907   dst0->u[1] = 0;
908   dst0->u[2] = 0;
909   dst0->u[3] = 0;
910}
911#endif
912
913
914#if 0
915static void
916micro_movc(
917   union tgsi_exec_channel *dst,
918   const union tgsi_exec_channel *src0,
919   const union tgsi_exec_channel *src1,
920   const union tgsi_exec_channel *src2 )
921{
922   dst->u[0] = src0->u[0] ? src1->u[0] : src2->u[0];
923   dst->u[1] = src0->u[1] ? src1->u[1] : src2->u[1];
924   dst->u[2] = src0->u[2] ? src1->u[2] : src2->u[2];
925   dst->u[3] = src0->u[3] ? src1->u[3] : src2->u[3];
926}
927#endif
928
929static void
930micro_neg(
931   union tgsi_exec_channel *dst,
932   const union tgsi_exec_channel *src )
933{
934   dst->f[0] = -src->f[0];
935   dst->f[1] = -src->f[1];
936   dst->f[2] = -src->f[2];
937   dst->f[3] = -src->f[3];
938}
939
940#if 0
941static void
942micro_ineg(
943   union tgsi_exec_channel *dst,
944   const union tgsi_exec_channel *src )
945{
946   dst->i[0] = -src->i[0];
947   dst->i[1] = -src->i[1];
948   dst->i[2] = -src->i[2];
949   dst->i[3] = -src->i[3];
950}
951#endif
952
953static void
954micro_not(
955   union tgsi_exec_channel *dst,
956   const union tgsi_exec_channel *src )
957{
958   dst->u[0] = ~src->u[0];
959   dst->u[1] = ~src->u[1];
960   dst->u[2] = ~src->u[2];
961   dst->u[3] = ~src->u[3];
962}
963
964static void
965micro_or(
966   union tgsi_exec_channel *dst,
967   const union tgsi_exec_channel *src0,
968   const union tgsi_exec_channel *src1 )
969{
970   dst->u[0] = src0->u[0] | src1->u[0];
971   dst->u[1] = src0->u[1] | src1->u[1];
972   dst->u[2] = src0->u[2] | src1->u[2];
973   dst->u[3] = src0->u[3] | src1->u[3];
974}
975
976static void
977micro_pow(
978   union tgsi_exec_channel *dst,
979   const union tgsi_exec_channel *src0,
980   const union tgsi_exec_channel *src1 )
981{
982#if FAST_MATH
983   dst->f[0] = util_fast_pow( src0->f[0], src1->f[0] );
984   dst->f[1] = util_fast_pow( src0->f[1], src1->f[1] );
985   dst->f[2] = util_fast_pow( src0->f[2], src1->f[2] );
986   dst->f[3] = util_fast_pow( src0->f[3], src1->f[3] );
987#else
988   dst->f[0] = powf( src0->f[0], src1->f[0] );
989   dst->f[1] = powf( src0->f[1], src1->f[1] );
990   dst->f[2] = powf( src0->f[2], src1->f[2] );
991   dst->f[3] = powf( src0->f[3], src1->f[3] );
992#endif
993}
994
995static void
996micro_rnd(
997   union tgsi_exec_channel *dst,
998   const union tgsi_exec_channel *src )
999{
1000   dst->f[0] = floorf( src->f[0] + 0.5f );
1001   dst->f[1] = floorf( src->f[1] + 0.5f );
1002   dst->f[2] = floorf( src->f[2] + 0.5f );
1003   dst->f[3] = floorf( src->f[3] + 0.5f );
1004}
1005
1006static void
1007micro_sgn(
1008   union tgsi_exec_channel *dst,
1009   const union tgsi_exec_channel *src )
1010{
1011   dst->f[0] = src->f[0] < 0.0f ? -1.0f : src->f[0] > 0.0f ? 1.0f : 0.0f;
1012   dst->f[1] = src->f[1] < 0.0f ? -1.0f : src->f[1] > 0.0f ? 1.0f : 0.0f;
1013   dst->f[2] = src->f[2] < 0.0f ? -1.0f : src->f[2] > 0.0f ? 1.0f : 0.0f;
1014   dst->f[3] = src->f[3] < 0.0f ? -1.0f : src->f[3] > 0.0f ? 1.0f : 0.0f;
1015}
1016
1017static void
1018micro_shl(
1019   union tgsi_exec_channel *dst,
1020   const union tgsi_exec_channel *src0,
1021   const union tgsi_exec_channel *src1 )
1022{
1023   dst->i[0] = src0->i[0] << src1->i[0];
1024   dst->i[1] = src0->i[1] << src1->i[1];
1025   dst->i[2] = src0->i[2] << src1->i[2];
1026   dst->i[3] = src0->i[3] << src1->i[3];
1027}
1028
1029static void
1030micro_ishr(
1031   union tgsi_exec_channel *dst,
1032   const union tgsi_exec_channel *src0,
1033   const union tgsi_exec_channel *src1 )
1034{
1035   dst->i[0] = src0->i[0] >> src1->i[0];
1036   dst->i[1] = src0->i[1] >> src1->i[1];
1037   dst->i[2] = src0->i[2] >> src1->i[2];
1038   dst->i[3] = src0->i[3] >> src1->i[3];
1039}
1040
1041static void
1042micro_trunc(
1043   union tgsi_exec_channel *dst,
1044   const union tgsi_exec_channel *src0 )
1045{
1046   dst->f[0] = (float) (int) src0->f[0];
1047   dst->f[1] = (float) (int) src0->f[1];
1048   dst->f[2] = (float) (int) src0->f[2];
1049   dst->f[3] = (float) (int) src0->f[3];
1050}
1051
#if 0
/**
 * Per-lane right shift on the unsigned integer view of each of the four
 * lanes: dst = src0 >> src1.  Currently compiled out.
 */
static void
micro_ushr(
   union tgsi_exec_channel *dst,
   const union tgsi_exec_channel *src0,
   const union tgsi_exec_channel *src1 )
{
   unsigned lane;

   for (lane = 0; lane < 4; lane++) {
      dst->u[lane] = src0->u[lane] >> src1->u[lane];
   }
}
#endif
1065
1066static void
1067micro_sin(
1068   union tgsi_exec_channel *dst,
1069   const union tgsi_exec_channel *src )
1070{
1071   dst->f[0] = sinf( src->f[0] );
1072   dst->f[1] = sinf( src->f[1] );
1073   dst->f[2] = sinf( src->f[2] );
1074   dst->f[3] = sinf( src->f[3] );
1075}
1076
1077static void
1078micro_sqrt( union tgsi_exec_channel *dst,
1079            const union tgsi_exec_channel *src )
1080{
1081   dst->f[0] = sqrtf( src->f[0] );
1082   dst->f[1] = sqrtf( src->f[1] );
1083   dst->f[2] = sqrtf( src->f[2] );
1084   dst->f[3] = sqrtf( src->f[3] );
1085}
1086
1087static void
1088micro_sub(
1089   union tgsi_exec_channel *dst,
1090   const union tgsi_exec_channel *src0,
1091   const union tgsi_exec_channel *src1 )
1092{
1093   dst->f[0] = src0->f[0] - src1->f[0];
1094   dst->f[1] = src0->f[1] - src1->f[1];
1095   dst->f[2] = src0->f[2] - src1->f[2];
1096   dst->f[3] = src0->f[3] - src1->f[3];
1097}
1098
#if 0
/**
 * Per-lane conversion from the unsigned integer view to float for each
 * of the four lanes.  Currently compiled out.
 */
static void
micro_u2f(
   union tgsi_exec_channel *dst,
   const union tgsi_exec_channel *src )
{
   unsigned lane;

   for (lane = 0; lane < 4; lane++) {
      dst->f[lane] = (float) src->u[lane];
   }
}
#endif
1111
1112static void
1113micro_xor(
1114   union tgsi_exec_channel *dst,
1115   const union tgsi_exec_channel *src0,
1116   const union tgsi_exec_channel *src1 )
1117{
1118   dst->u[0] = src0->u[0] ^ src1->u[0];
1119   dst->u[1] = src0->u[1] ^ src1->u[1];
1120   dst->u[2] = src0->u[2] ^ src1->u[2];
1121   dst->u[3] = src0->u[3] ^ src1->u[3];
1122}
1123
1124static void
1125fetch_src_file_channel(
1126   const struct tgsi_exec_machine *mach,
1127   const uint file,
1128   const uint swizzle,
1129   const union tgsi_exec_channel *index,
1130   union tgsi_exec_channel *chan )
1131{
1132   switch( swizzle ) {
1133   case TGSI_EXTSWIZZLE_X:
1134   case TGSI_EXTSWIZZLE_Y:
1135   case TGSI_EXTSWIZZLE_Z:
1136   case TGSI_EXTSWIZZLE_W:
1137      switch( file ) {
1138      case TGSI_FILE_CONSTANT:
1139         assert(mach->Consts);
1140         if (index->i[0] < 0)
1141            chan->f[0] = 0.0f;
1142         else
1143            chan->f[0] = mach->Consts[index->i[0]][swizzle];
1144         if (index->i[1] < 0)
1145            chan->f[1] = 0.0f;
1146         else
1147            chan->f[1] = mach->Consts[index->i[1]][swizzle];
1148         if (index->i[2] < 0)
1149            chan->f[2] = 0.0f;
1150         else
1151            chan->f[2] = mach->Consts[index->i[2]][swizzle];
1152         if (index->i[3] < 0)
1153            chan->f[3] = 0.0f;
1154         else
1155            chan->f[3] = mach->Consts[index->i[3]][swizzle];
1156         break;
1157
1158      case TGSI_FILE_INPUT:
1159         chan->u[0] = mach->Inputs[index->i[0]].xyzw[swizzle].u[0];
1160         chan->u[1] = mach->Inputs[index->i[1]].xyzw[swizzle].u[1];
1161         chan->u[2] = mach->Inputs[index->i[2]].xyzw[swizzle].u[2];
1162         chan->u[3] = mach->Inputs[index->i[3]].xyzw[swizzle].u[3];
1163         break;
1164
1165      case TGSI_FILE_TEMPORARY:
1166         assert(index->i[0] < TGSI_EXEC_NUM_TEMPS);
1167         chan->u[0] = mach->Temps[index->i[0]].xyzw[swizzle].u[0];
1168         chan->u[1] = mach->Temps[index->i[1]].xyzw[swizzle].u[1];
1169         chan->u[2] = mach->Temps[index->i[2]].xyzw[swizzle].u[2];
1170         chan->u[3] = mach->Temps[index->i[3]].xyzw[swizzle].u[3];
1171         break;
1172
1173      case TGSI_FILE_IMMEDIATE:
1174         assert( index->i[0] < (int) mach->ImmLimit );
1175         chan->f[0] = mach->Imms[index->i[0]][swizzle];
1176         assert( index->i[1] < (int) mach->ImmLimit );
1177         chan->f[1] = mach->Imms[index->i[1]][swizzle];
1178         assert( index->i[2] < (int) mach->ImmLimit );
1179         chan->f[2] = mach->Imms[index->i[2]][swizzle];
1180         assert( index->i[3] < (int) mach->ImmLimit );
1181         chan->f[3] = mach->Imms[index->i[3]][swizzle];
1182         break;
1183
1184      case TGSI_FILE_ADDRESS:
1185         chan->u[0] = mach->Addrs[index->i[0]].xyzw[swizzle].u[0];
1186         chan->u[1] = mach->Addrs[index->i[1]].xyzw[swizzle].u[1];
1187         chan->u[2] = mach->Addrs[index->i[2]].xyzw[swizzle].u[2];
1188         chan->u[3] = mach->Addrs[index->i[3]].xyzw[swizzle].u[3];
1189         break;
1190
1191      case TGSI_FILE_OUTPUT:
1192         /* vertex/fragment output vars can be read too */
1193         chan->u[0] = mach->Outputs[index->i[0]].xyzw[swizzle].u[0];
1194         chan->u[1] = mach->Outputs[index->i[1]].xyzw[swizzle].u[1];
1195         chan->u[2] = mach->Outputs[index->i[2]].xyzw[swizzle].u[2];
1196         chan->u[3] = mach->Outputs[index->i[3]].xyzw[swizzle].u[3];
1197         break;
1198
1199      default:
1200         assert( 0 );
1201      }
1202      break;
1203
1204   case TGSI_EXTSWIZZLE_ZERO:
1205      *chan = mach->Temps[TEMP_0_I].xyzw[TEMP_0_C];
1206      break;
1207
1208   case TGSI_EXTSWIZZLE_ONE:
1209      *chan = mach->Temps[TEMP_1_I].xyzw[TEMP_1_C];
1210      break;
1211
1212   default:
1213      assert( 0 );
1214   }
1215}
1216
1217static void
1218fetch_source(
1219   const struct tgsi_exec_machine *mach,
1220   union tgsi_exec_channel *chan,
1221   const struct tgsi_full_src_register *reg,
1222   const uint chan_index )
1223{
1224   union tgsi_exec_channel index;
1225   uint swizzle;
1226
1227   /* We start with a direct index into a register file.
1228    *
1229    *    file[1],
1230    *    where:
1231    *       file = SrcRegister.File
1232    *       [1] = SrcRegister.Index
1233    */
1234   index.i[0] =
1235   index.i[1] =
1236   index.i[2] =
1237   index.i[3] = reg->SrcRegister.Index;
1238
1239   /* There is an extra source register that indirectly subscripts
1240    * a register file. The direct index now becomes an offset
1241    * that is being added to the indirect register.
1242    *
1243    *    file[ind[2].x+1],
1244    *    where:
1245    *       ind = SrcRegisterInd.File
1246    *       [2] = SrcRegisterInd.Index
1247    *       .x = SrcRegisterInd.SwizzleX
1248    */
1249   if (reg->SrcRegister.Indirect) {
1250      union tgsi_exec_channel index2;
1251      union tgsi_exec_channel indir_index;
1252      const uint execmask = mach->ExecMask;
1253      uint i;
1254
1255      /* which address register (always zero now) */
1256      index2.i[0] =
1257      index2.i[1] =
1258      index2.i[2] =
1259      index2.i[3] = reg->SrcRegisterInd.Index;
1260
1261      /* get current value of address register[swizzle] */
1262      swizzle = tgsi_util_get_src_register_swizzle( &reg->SrcRegisterInd, CHAN_X );
1263      fetch_src_file_channel(
1264         mach,
1265         reg->SrcRegisterInd.File,
1266         swizzle,
1267         &index2,
1268         &indir_index );
1269
1270      /* add value of address register to the offset */
1271      index.i[0] += (int) indir_index.f[0];
1272      index.i[1] += (int) indir_index.f[1];
1273      index.i[2] += (int) indir_index.f[2];
1274      index.i[3] += (int) indir_index.f[3];
1275
1276      /* for disabled execution channels, zero-out the index to
1277       * avoid using a potential garbage value.
1278       */
1279      for (i = 0; i < QUAD_SIZE; i++) {
1280         if ((execmask & (1 << i)) == 0)
1281            index.i[i] = 0;
1282      }
1283   }
1284
1285   /* There is an extra source register that is a second
1286    * subscript to a register file. Effectively it means that
1287    * the register file is actually a 2D array of registers.
1288    *
1289    *    file[1][3] == file[1*sizeof(file[1])+3],
1290    *    where:
1291    *       [3] = SrcRegisterDim.Index
1292    */
1293   if (reg->SrcRegister.Dimension) {
1294      /* The size of the first-order array depends on the register file type.
1295       * We need to multiply the index to the first array to get an effective,
1296       * "flat" index that points to the beginning of the second-order array.
1297       */
1298      switch (reg->SrcRegister.File) {
1299      case TGSI_FILE_INPUT:
1300         index.i[0] *= TGSI_EXEC_MAX_INPUT_ATTRIBS;
1301         index.i[1] *= TGSI_EXEC_MAX_INPUT_ATTRIBS;
1302         index.i[2] *= TGSI_EXEC_MAX_INPUT_ATTRIBS;
1303         index.i[3] *= TGSI_EXEC_MAX_INPUT_ATTRIBS;
1304         break;
1305      case TGSI_FILE_CONSTANT:
1306         index.i[0] *= TGSI_EXEC_MAX_CONST_BUFFER;
1307         index.i[1] *= TGSI_EXEC_MAX_CONST_BUFFER;
1308         index.i[2] *= TGSI_EXEC_MAX_CONST_BUFFER;
1309         index.i[3] *= TGSI_EXEC_MAX_CONST_BUFFER;
1310         break;
1311      default:
1312         assert( 0 );
1313      }
1314
1315      index.i[0] += reg->SrcRegisterDim.Index;
1316      index.i[1] += reg->SrcRegisterDim.Index;
1317      index.i[2] += reg->SrcRegisterDim.Index;
1318      index.i[3] += reg->SrcRegisterDim.Index;
1319
1320      /* Again, the second subscript index can be addressed indirectly
1321       * identically to the first one.
1322       * Nothing stops us from indirectly addressing the indirect register,
1323       * but there is no need for that, so we won't exercise it.
1324       *
1325       *    file[1][ind[4].y+3],
1326       *    where:
1327       *       ind = SrcRegisterDimInd.File
1328       *       [4] = SrcRegisterDimInd.Index
1329       *       .y = SrcRegisterDimInd.SwizzleX
1330       */
1331      if (reg->SrcRegisterDim.Indirect) {
1332         union tgsi_exec_channel index2;
1333         union tgsi_exec_channel indir_index;
1334         const uint execmask = mach->ExecMask;
1335         uint i;
1336
1337         index2.i[0] =
1338         index2.i[1] =
1339         index2.i[2] =
1340         index2.i[3] = reg->SrcRegisterDimInd.Index;
1341
1342         swizzle = tgsi_util_get_src_register_swizzle( &reg->SrcRegisterDimInd, CHAN_X );
1343         fetch_src_file_channel(
1344            mach,
1345            reg->SrcRegisterDimInd.File,
1346            swizzle,
1347            &index2,
1348            &indir_index );
1349
1350         index.i[0] += (int) indir_index.f[0];
1351         index.i[1] += (int) indir_index.f[1];
1352         index.i[2] += (int) indir_index.f[2];
1353         index.i[3] += (int) indir_index.f[3];
1354
1355         /* for disabled execution channels, zero-out the index to
1356          * avoid using a potential garbage value.
1357          */
1358         for (i = 0; i < QUAD_SIZE; i++) {
1359            if ((execmask & (1 << i)) == 0)
1360               index.i[i] = 0;
1361         }
1362      }
1363
1364      /* If by any chance there was a need for a 3D array of register
1365       * files, we would have to check whether SrcRegisterDim is followed
1366       * by a dimension register and continue the saga.
1367       */
1368   }
1369
1370   swizzle = tgsi_util_get_full_src_register_extswizzle( reg, chan_index );
1371   fetch_src_file_channel(
1372      mach,
1373      reg->SrcRegister.File,
1374      swizzle,
1375      &index,
1376      chan );
1377
1378   switch (tgsi_util_get_full_src_register_sign_mode( reg, chan_index )) {
1379   case TGSI_UTIL_SIGN_CLEAR:
1380      micro_abs( chan, chan );
1381      break;
1382
1383   case TGSI_UTIL_SIGN_SET:
1384      micro_abs( chan, chan );
1385      micro_neg( chan, chan );
1386      break;
1387
1388   case TGSI_UTIL_SIGN_TOGGLE:
1389      micro_neg( chan, chan );
1390      break;
1391
1392   case TGSI_UTIL_SIGN_KEEP:
1393      break;
1394   }
1395
1396   if (reg->SrcRegisterExtMod.Complement) {
1397      micro_sub( chan, &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], chan );
1398   }
1399}
1400
1401static void
1402store_dest(
1403   struct tgsi_exec_machine *mach,
1404   const union tgsi_exec_channel *chan,
1405   const struct tgsi_full_dst_register *reg,
1406   const struct tgsi_full_instruction *inst,
1407   uint chan_index )
1408{
1409   uint i;
1410   union tgsi_exec_channel null;
1411   union tgsi_exec_channel *dst;
1412   uint execmask = mach->ExecMask;
1413   int offset = 0;  /* indirection offset */
1414   int index;
1415
1416#ifdef DEBUG
1417   check_inf_or_nan(chan);
1418#endif
1419
1420   /* There is an extra source register that indirectly subscripts
1421    * a register file. The direct index now becomes an offset
1422    * that is being added to the indirect register.
1423    *
1424    *    file[ind[2].x+1],
1425    *    where:
1426    *       ind = DstRegisterInd.File
1427    *       [2] = DstRegisterInd.Index
1428    *       .x = DstRegisterInd.SwizzleX
1429    */
1430   if (reg->DstRegister.Indirect) {
1431      union tgsi_exec_channel index;
1432      union tgsi_exec_channel indir_index;
1433      uint swizzle;
1434
1435      /* which address register (always zero for now) */
1436      index.i[0] =
1437      index.i[1] =
1438      index.i[2] =
1439      index.i[3] = reg->DstRegisterInd.Index;
1440
1441      /* get current value of address register[swizzle] */
1442      swizzle = tgsi_util_get_src_register_swizzle( &reg->DstRegisterInd, CHAN_X );
1443
1444      /* fetch values from the address/indirection register */
1445      fetch_src_file_channel(
1446         mach,
1447         reg->DstRegisterInd.File,
1448         swizzle,
1449         &index,
1450         &indir_index );
1451
1452      /* save indirection offset */
1453      offset = (int) indir_index.f[0];
1454   }
1455
1456   switch (reg->DstRegister.File) {
1457   case TGSI_FILE_NULL:
1458      dst = &null;
1459      break;
1460
1461   case TGSI_FILE_OUTPUT:
1462      index = mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0]
1463         + reg->DstRegister.Index;
1464      dst = &mach->Outputs[offset + index].xyzw[chan_index];
1465      break;
1466
1467   case TGSI_FILE_TEMPORARY:
1468      index = reg->DstRegister.Index;
1469      assert( index < TGSI_EXEC_NUM_TEMPS );
1470      dst = &mach->Temps[offset + index].xyzw[chan_index];
1471      break;
1472
1473   case TGSI_FILE_ADDRESS:
1474      index = reg->DstRegister.Index;
1475      dst = &mach->Addrs[index].xyzw[chan_index];
1476      break;
1477
1478   default:
1479      assert( 0 );
1480      return;
1481   }
1482
1483   if (inst->InstructionExtNv.CondFlowEnable) {
1484      union tgsi_exec_channel *cc = &mach->Temps[TEMP_CC_I].xyzw[TEMP_CC_C];
1485      uint swizzle;
1486      uint shift;
1487      uint mask;
1488      uint test;
1489
1490      /* Only CC0 supported.
1491       */
1492      assert( inst->InstructionExtNv.CondFlowIndex < 1 );
1493
1494      switch (chan_index) {
1495      case CHAN_X:
1496         swizzle = inst->InstructionExtNv.CondSwizzleX;
1497         break;
1498      case CHAN_Y:
1499         swizzle = inst->InstructionExtNv.CondSwizzleY;
1500         break;
1501      case CHAN_Z:
1502         swizzle = inst->InstructionExtNv.CondSwizzleZ;
1503         break;
1504      case CHAN_W:
1505         swizzle = inst->InstructionExtNv.CondSwizzleW;
1506         break;
1507      default:
1508         assert( 0 );
1509         return;
1510      }
1511
1512      switch (swizzle) {
1513      case TGSI_SWIZZLE_X:
1514         shift = TGSI_EXEC_CC_X_SHIFT;
1515         mask = TGSI_EXEC_CC_X_MASK;
1516         break;
1517      case TGSI_SWIZZLE_Y:
1518         shift = TGSI_EXEC_CC_Y_SHIFT;
1519         mask = TGSI_EXEC_CC_Y_MASK;
1520         break;
1521      case TGSI_SWIZZLE_Z:
1522         shift = TGSI_EXEC_CC_Z_SHIFT;
1523         mask = TGSI_EXEC_CC_Z_MASK;
1524         break;
1525      case TGSI_SWIZZLE_W:
1526         shift = TGSI_EXEC_CC_W_SHIFT;
1527         mask = TGSI_EXEC_CC_W_MASK;
1528         break;
1529      default:
1530         assert( 0 );
1531         return;
1532      }
1533
1534      switch (inst->InstructionExtNv.CondMask) {
1535      case TGSI_CC_GT:
1536         test = ~(TGSI_EXEC_CC_GT << shift) & mask;
1537         for (i = 0; i < QUAD_SIZE; i++)
1538            if (cc->u[i] & test)
1539               execmask &= ~(1 << i);
1540         break;
1541
1542      case TGSI_CC_EQ:
1543         test = ~(TGSI_EXEC_CC_EQ << shift) & mask;
1544         for (i = 0; i < QUAD_SIZE; i++)
1545            if (cc->u[i] & test)
1546               execmask &= ~(1 << i);
1547         break;
1548
1549      case TGSI_CC_LT:
1550         test = ~(TGSI_EXEC_CC_LT << shift) & mask;
1551         for (i = 0; i < QUAD_SIZE; i++)
1552            if (cc->u[i] & test)
1553               execmask &= ~(1 << i);
1554         break;
1555
1556      case TGSI_CC_GE:
1557         test = ~((TGSI_EXEC_CC_GT | TGSI_EXEC_CC_EQ) << shift) & mask;
1558         for (i = 0; i < QUAD_SIZE; i++)
1559            if (cc->u[i] & test)
1560               execmask &= ~(1 << i);
1561         break;
1562
1563      case TGSI_CC_LE:
1564         test = ~((TGSI_EXEC_CC_LT | TGSI_EXEC_CC_EQ) << shift) & mask;
1565         for (i = 0; i < QUAD_SIZE; i++)
1566            if (cc->u[i] & test)
1567               execmask &= ~(1 << i);
1568         break;
1569
1570      case TGSI_CC_NE:
1571         test = ~((TGSI_EXEC_CC_GT | TGSI_EXEC_CC_LT | TGSI_EXEC_CC_UN) << shift) & mask;
1572         for (i = 0; i < QUAD_SIZE; i++)
1573            if (cc->u[i] & test)
1574               execmask &= ~(1 << i);
1575         break;
1576
1577      case TGSI_CC_TR:
1578         break;
1579
1580      case TGSI_CC_FL:
1581         for (i = 0; i < QUAD_SIZE; i++)
1582            execmask &= ~(1 << i);
1583         break;
1584
1585      default:
1586         assert( 0 );
1587         return;
1588      }
1589   }
1590
1591   switch (inst->Instruction.Saturate) {
1592   case TGSI_SAT_NONE:
1593      for (i = 0; i < QUAD_SIZE; i++)
1594         if (execmask & (1 << i))
1595            dst->i[i] = chan->i[i];
1596      break;
1597
1598   case TGSI_SAT_ZERO_ONE:
1599      for (i = 0; i < QUAD_SIZE; i++)
1600         if (execmask & (1 << i)) {
1601            if (chan->f[i] < 0.0f)
1602               dst->f[i] = 0.0f;
1603            else if (chan->f[i] > 1.0f)
1604               dst->f[i] = 1.0f;
1605            else
1606               dst->i[i] = chan->i[i];
1607         }
1608      break;
1609
1610   case TGSI_SAT_MINUS_PLUS_ONE:
1611      for (i = 0; i < QUAD_SIZE; i++)
1612         if (execmask & (1 << i)) {
1613            if (chan->f[i] < -1.0f)
1614               dst->f[i] = -1.0f;
1615            else if (chan->f[i] > 1.0f)
1616               dst->f[i] = 1.0f;
1617            else
1618               dst->i[i] = chan->i[i];
1619         }
1620      break;
1621
1622   default:
1623      assert( 0 );
1624   }
1625
1626   if (inst->InstructionExtNv.CondDstUpdate) {
1627      union tgsi_exec_channel *cc = &mach->Temps[TEMP_CC_I].xyzw[TEMP_CC_C];
1628      uint shift;
1629      uint mask;
1630
1631      /* Only CC0 supported.
1632       */
1633      assert( inst->InstructionExtNv.CondDstIndex < 1 );
1634
1635      switch (chan_index) {
1636      case CHAN_X:
1637         shift = TGSI_EXEC_CC_X_SHIFT;
1638         mask = ~TGSI_EXEC_CC_X_MASK;
1639         break;
1640      case CHAN_Y:
1641         shift = TGSI_EXEC_CC_Y_SHIFT;
1642         mask = ~TGSI_EXEC_CC_Y_MASK;
1643         break;
1644      case CHAN_Z:
1645         shift = TGSI_EXEC_CC_Z_SHIFT;
1646         mask = ~TGSI_EXEC_CC_Z_MASK;
1647         break;
1648      case CHAN_W:
1649         shift = TGSI_EXEC_CC_W_SHIFT;
1650         mask = ~TGSI_EXEC_CC_W_MASK;
1651         break;
1652      default:
1653         assert( 0 );
1654         return;
1655      }
1656
1657      for (i = 0; i < QUAD_SIZE; i++)
1658         if (execmask & (1 << i)) {
1659            cc->u[i] &= mask;
1660            if (dst->f[i] < 0.0f)
1661               cc->u[i] |= TGSI_EXEC_CC_LT << shift;
1662            else if (dst->f[i] > 0.0f)
1663               cc->u[i] |= TGSI_EXEC_CC_GT << shift;
1664            else if (dst->f[i] == 0.0f)
1665               cc->u[i] |= TGSI_EXEC_CC_EQ << shift;
1666            else
1667               cc->u[i] |= TGSI_EXEC_CC_UN << shift;
1668         }
1669   }
1670}
1671
/**
 * Shorthands for reading one channel of source operand INDEX and for
 * writing one channel of destination operand INDEX of the current
 * instruction.  Both expand to code that uses variables named `mach'
 * and `inst' from the enclosing scope.
 */
#define FETCH(VAL, INDEX, CHAN) \
   fetch_source( mach, (VAL), &inst->FullSrcRegisters[(INDEX)], (CHAN) )

#define STORE(VAL, INDEX, CHAN) \
   store_dest( mach, (VAL), &inst->FullDstRegisters[(INDEX)], inst, (CHAN) )
1677
1678
1679/**
1680 * Execute ARB-style KIL which is predicated by a src register.
1681 * Kill fragment if any of the four values is less than zero.
1682 */
1683static void
1684exec_kil(struct tgsi_exec_machine *mach,
1685         const struct tgsi_full_instruction *inst)
1686{
1687   uint uniquemask;
1688   uint chan_index;
1689   uint kilmask = 0; /* bit 0 = pixel 0, bit 1 = pixel 1, etc */
1690   union tgsi_exec_channel r[1];
1691
1692   /* This mask stores component bits that were already tested. Note that
1693    * we test if the value is less than zero, so 1.0 and 0.0 need not to be
1694    * tested. */
1695   uniquemask = (1 << TGSI_EXTSWIZZLE_ZERO) | (1 << TGSI_EXTSWIZZLE_ONE);
1696
1697   for (chan_index = 0; chan_index < 4; chan_index++)
1698   {
1699      uint swizzle;
1700      uint i;
1701
1702      /* unswizzle channel */
1703      swizzle = tgsi_util_get_full_src_register_extswizzle (
1704                        &inst->FullSrcRegisters[0],
1705                        chan_index);
1706
1707      /* check if the component has not been already tested */
1708      if (uniquemask & (1 << swizzle))
1709         continue;
1710      uniquemask |= 1 << swizzle;
1711
1712      FETCH(&r[0], 0, chan_index);
1713      for (i = 0; i < 4; i++)
1714         if (r[0].f[i] < 0.0f)
1715            kilmask |= 1 << i;
1716   }
1717
1718   mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] |= kilmask;
1719}
1720
1721/**
1722 * Execute NVIDIA-style KIL which is predicated by a condition code.
1723 * Kill fragment if the condition code is TRUE.
1724 */
1725static void
1726exec_kilp(struct tgsi_exec_machine *mach,
1727          const struct tgsi_full_instruction *inst)
1728{
1729   uint kilmask; /* bit 0 = pixel 0, bit 1 = pixel 1, etc */
1730
1731   if (inst->InstructionExtNv.CondFlowEnable) {
1732      uint swizzle[4];
1733      uint chan_index;
1734
1735      kilmask = 0x0;
1736
1737      swizzle[0] = inst->InstructionExtNv.CondSwizzleX;
1738      swizzle[1] = inst->InstructionExtNv.CondSwizzleY;
1739      swizzle[2] = inst->InstructionExtNv.CondSwizzleZ;
1740      swizzle[3] = inst->InstructionExtNv.CondSwizzleW;
1741
1742      for (chan_index = 0; chan_index < 4; chan_index++)
1743      {
1744         uint i;
1745
1746         for (i = 0; i < 4; i++) {
1747            /* TODO: evaluate the condition code */
1748            if (0)
1749               kilmask |= 1 << i;
1750         }
1751      }
1752   }
1753   else {
1754      /* "unconditional" kil */
1755      kilmask = mach->ExecMask;
1756   }
1757   mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] |= kilmask;
1758}
1759
1760
1761/*
1762 * Fetch a four texture samples using STR texture coordinates.
1763 */
1764static void
1765fetch_texel( struct tgsi_sampler *sampler,
1766             const union tgsi_exec_channel *s,
1767             const union tgsi_exec_channel *t,
1768             const union tgsi_exec_channel *p,
1769             float lodbias,  /* XXX should be float[4] */
1770             union tgsi_exec_channel *r,
1771             union tgsi_exec_channel *g,
1772             union tgsi_exec_channel *b,
1773             union tgsi_exec_channel *a )
1774{
1775   uint j;
1776   float rgba[NUM_CHANNELS][QUAD_SIZE];
1777
1778   sampler->get_samples(sampler, s->f, t->f, p->f, lodbias, rgba);
1779
1780   for (j = 0; j < 4; j++) {
1781      r->f[j] = rgba[0][j];
1782      g->f[j] = rgba[1][j];
1783      b->f[j] = rgba[2][j];
1784      a->f[j] = rgba[3][j];
1785   }
1786}
1787
1788
1789static void
1790exec_tex(struct tgsi_exec_machine *mach,
1791         const struct tgsi_full_instruction *inst,
1792         boolean biasLod,
1793         boolean projected)
1794{
1795   const uint unit = inst->FullSrcRegisters[1].SrcRegister.Index;
1796   union tgsi_exec_channel r[4];
1797   uint chan_index;
1798   float lodBias;
1799
1800   /*   debug_printf("Sampler %u unit %u\n", sampler, unit); */
1801
1802   switch (inst->InstructionExtTexture.Texture) {
1803   case TGSI_TEXTURE_1D:
1804   case TGSI_TEXTURE_SHADOW1D:
1805
1806      FETCH(&r[0], 0, CHAN_X);
1807
1808      if (projected) {
1809         FETCH(&r[1], 0, CHAN_W);
1810         micro_div( &r[0], &r[0], &r[1] );
1811      }
1812
1813      if (biasLod) {
1814         FETCH(&r[1], 0, CHAN_W);
1815         lodBias = r[2].f[0];
1816      }
1817      else
1818         lodBias = 0.0;
1819
1820      fetch_texel(mach->Samplers[unit],
1821                  &r[0], &ZeroVec, &ZeroVec, lodBias,  /* S, T, P, BIAS */
1822                  &r[0], &r[1], &r[2], &r[3]); /* R, G, B, A */
1823      break;
1824
1825   case TGSI_TEXTURE_2D:
1826   case TGSI_TEXTURE_RECT:
1827   case TGSI_TEXTURE_SHADOW2D:
1828   case TGSI_TEXTURE_SHADOWRECT:
1829
1830      FETCH(&r[0], 0, CHAN_X);
1831      FETCH(&r[1], 0, CHAN_Y);
1832      FETCH(&r[2], 0, CHAN_Z);
1833
1834      if (projected) {
1835         FETCH(&r[3], 0, CHAN_W);
1836         micro_div( &r[0], &r[0], &r[3] );
1837         micro_div( &r[1], &r[1], &r[3] );
1838         micro_div( &r[2], &r[2], &r[3] );
1839      }
1840
1841      if (biasLod) {
1842         FETCH(&r[3], 0, CHAN_W);
1843         lodBias = r[3].f[0];
1844      }
1845      else
1846         lodBias = 0.0;
1847
1848      fetch_texel(mach->Samplers[unit],
1849                  &r[0], &r[1], &r[2], lodBias,  /* inputs */
1850                  &r[0], &r[1], &r[2], &r[3]);  /* outputs */
1851      break;
1852
1853   case TGSI_TEXTURE_3D:
1854   case TGSI_TEXTURE_CUBE:
1855
1856      FETCH(&r[0], 0, CHAN_X);
1857      FETCH(&r[1], 0, CHAN_Y);
1858      FETCH(&r[2], 0, CHAN_Z);
1859
1860      if (projected) {
1861         FETCH(&r[3], 0, CHAN_W);
1862         micro_div( &r[0], &r[0], &r[3] );
1863         micro_div( &r[1], &r[1], &r[3] );
1864         micro_div( &r[2], &r[2], &r[3] );
1865      }
1866
1867      if (biasLod) {
1868         FETCH(&r[3], 0, CHAN_W);
1869         lodBias = r[3].f[0];
1870      }
1871      else
1872         lodBias = 0.0;
1873
1874      fetch_texel(mach->Samplers[unit],
1875                  &r[0], &r[1], &r[2], lodBias,
1876                  &r[0], &r[1], &r[2], &r[3]);
1877      break;
1878
1879   default:
1880      assert (0);
1881   }
1882
1883   FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
1884      STORE( &r[chan_index], 0, chan_index );
1885   }
1886}
1887
1888
1889/**
1890 * Evaluate a constant-valued coefficient at the position of the
1891 * current quad.
1892 */
1893static void
1894eval_constant_coef(
1895   struct tgsi_exec_machine *mach,
1896   unsigned attrib,
1897   unsigned chan )
1898{
1899   unsigned i;
1900
1901   for( i = 0; i < QUAD_SIZE; i++ ) {
1902      mach->Inputs[attrib].xyzw[chan].f[i] = mach->InterpCoefs[attrib].a0[chan];
1903   }
1904}
1905
1906/**
1907 * Evaluate a linear-valued coefficient at the position of the
1908 * current quad.
1909 */
1910static void
1911eval_linear_coef(
1912   struct tgsi_exec_machine *mach,
1913   unsigned attrib,
1914   unsigned chan )
1915{
1916   const float x = mach->QuadPos.xyzw[0].f[0];
1917   const float y = mach->QuadPos.xyzw[1].f[0];
1918   const float dadx = mach->InterpCoefs[attrib].dadx[chan];
1919   const float dady = mach->InterpCoefs[attrib].dady[chan];
1920   const float a0 = mach->InterpCoefs[attrib].a0[chan] + dadx * x + dady * y;
1921   mach->Inputs[attrib].xyzw[chan].f[0] = a0;
1922   mach->Inputs[attrib].xyzw[chan].f[1] = a0 + dadx;
1923   mach->Inputs[attrib].xyzw[chan].f[2] = a0 + dady;
1924   mach->Inputs[attrib].xyzw[chan].f[3] = a0 + dadx + dady;
1925}
1926
1927/**
1928 * Evaluate a perspective-valued coefficient at the position of the
1929 * current quad.
1930 */
1931static void
1932eval_perspective_coef(
1933   struct tgsi_exec_machine *mach,
1934   unsigned attrib,
1935   unsigned chan )
1936{
1937   const float x = mach->QuadPos.xyzw[0].f[0];
1938   const float y = mach->QuadPos.xyzw[1].f[0];
1939   const float dadx = mach->InterpCoefs[attrib].dadx[chan];
1940   const float dady = mach->InterpCoefs[attrib].dady[chan];
1941   const float a0 = mach->InterpCoefs[attrib].a0[chan] + dadx * x + dady * y;
1942   const float *w = mach->QuadPos.xyzw[3].f;
1943   /* divide by W here */
1944   mach->Inputs[attrib].xyzw[chan].f[0] = a0 / w[0];
1945   mach->Inputs[attrib].xyzw[chan].f[1] = (a0 + dadx) / w[1];
1946   mach->Inputs[attrib].xyzw[chan].f[2] = (a0 + dady) / w[2];
1947   mach->Inputs[attrib].xyzw[chan].f[3] = (a0 + dadx + dady) / w[3];
1948}
1949
1950
/**
 * Signature shared by the eval_*_coef() functions: evaluate channel `chan'
 * of input attribute `attrib' for the current quad.
 */
typedef void (*eval_coef_func)( struct tgsi_exec_machine *mach,
                                unsigned attrib,
                                unsigned chan );
1955
1956static void
1957exec_declaration(
1958   struct tgsi_exec_machine *mach,
1959   const struct tgsi_full_declaration *decl )
1960{
1961   if( mach->Processor == TGSI_PROCESSOR_FRAGMENT ) {
1962      if( decl->Declaration.File == TGSI_FILE_INPUT ) {
1963         unsigned first, last, mask;
1964         eval_coef_func eval;
1965
1966         first = decl->DeclarationRange.First;
1967         last = decl->DeclarationRange.Last;
1968         mask = decl->Declaration.UsageMask;
1969
1970         switch( decl->Declaration.Interpolate ) {
1971         case TGSI_INTERPOLATE_CONSTANT:
1972            eval = eval_constant_coef;
1973            break;
1974
1975         case TGSI_INTERPOLATE_LINEAR:
1976            eval = eval_linear_coef;
1977            break;
1978
1979         case TGSI_INTERPOLATE_PERSPECTIVE:
1980            eval = eval_perspective_coef;
1981            break;
1982
1983         default:
1984            eval = NULL;
1985            assert( 0 );
1986         }
1987
1988         if( mask == TGSI_WRITEMASK_XYZW ) {
1989            unsigned i, j;
1990
1991            for( i = first; i <= last; i++ ) {
1992               for( j = 0; j < NUM_CHANNELS; j++ ) {
1993                  eval( mach, i, j );
1994               }
1995            }
1996         }
1997         else {
1998            unsigned i, j;
1999
2000            for( j = 0; j < NUM_CHANNELS; j++ ) {
2001               if( mask & (1 << j) ) {
2002                  for( i = first; i <= last; i++ ) {
2003                     eval( mach, i, j );
2004                  }
2005               }
2006            }
2007         }
2008      }
2009   }
2010}
2011
2012static void
2013exec_instruction(
2014   struct tgsi_exec_machine *mach,
2015   const struct tgsi_full_instruction *inst,
2016   int *pc )
2017{
2018   uint chan_index;
2019   union tgsi_exec_channel r[10];
2020
2021   (*pc)++;
2022
2023   switch (inst->Instruction.Opcode) {
2024   case TGSI_OPCODE_ARL:
2025   case TGSI_OPCODE_FLR:
2026      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2027         FETCH( &r[0], 0, chan_index );
2028         micro_flr( &r[0], &r[0] );
2029         STORE( &r[0], 0, chan_index );
2030      }
2031      break;
2032
2033   case TGSI_OPCODE_MOV:
2034   case TGSI_OPCODE_SWZ:
2035      if (inst->Flags & SOA_DEPENDENCY_FLAG) {
2036         /* Do all fetches into temp regs, then do all stores to avoid
2037          * intermediate/accidental clobbering.  This could be done all the
2038          * time for MOV but for other instructions we'll need more temps...
2039          */
2040         FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2041            FETCH( &r[chan_index], 0, chan_index );
2042         }
2043         FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2044            STORE( &r[chan_index], 0, chan_index );
2045         }
2046      }
2047      else {
2048         FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2049            FETCH( &r[0], 0, chan_index );
2050            STORE( &r[0], 0, chan_index );
2051         }
2052      }
2053      break;
2054
2055   case TGSI_OPCODE_LIT:
2056      if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) {
2057         STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_X );
2058      }
2059
2060      if (IS_CHANNEL_ENABLED( *inst, CHAN_Y ) || IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
2061         FETCH( &r[0], 0, CHAN_X );
2062         if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
2063            micro_max( &r[0], &r[0], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] );
2064            STORE( &r[0], 0, CHAN_Y );
2065         }
2066
2067         if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
2068            FETCH( &r[1], 0, CHAN_Y );
2069            micro_max( &r[1], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] );
2070
2071            FETCH( &r[2], 0, CHAN_W );
2072            micro_min( &r[2], &r[2], &mach->Temps[TEMP_128_I].xyzw[TEMP_128_C] );
2073            micro_max( &r[2], &r[2], &mach->Temps[TEMP_M128_I].xyzw[TEMP_M128_C] );
2074            micro_pow( &r[1], &r[1], &r[2] );
2075            micro_lt( &r[0], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &r[0], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] );
2076            STORE( &r[0], 0, CHAN_Z );
2077         }
2078      }
2079
2080      if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
2081         STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
2082      }
2083      break;
2084
2085   case TGSI_OPCODE_RCP:
2086   /* TGSI_OPCODE_RECIP */
2087      FETCH( &r[0], 0, CHAN_X );
2088      micro_div( &r[0], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &r[0] );
2089      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2090         STORE( &r[0], 0, chan_index );
2091      }
2092      break;
2093
2094   case TGSI_OPCODE_RSQ:
2095   /* TGSI_OPCODE_RECIPSQRT */
2096      FETCH( &r[0], 0, CHAN_X );
2097      micro_abs( &r[0], &r[0] );
2098      micro_sqrt( &r[0], &r[0] );
2099      micro_div( &r[0], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &r[0] );
2100      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2101         STORE( &r[0], 0, chan_index );
2102      }
2103      break;
2104
2105   case TGSI_OPCODE_EXP:
2106      FETCH( &r[0], 0, CHAN_X );
2107      micro_flr( &r[1], &r[0] );  /* r1 = floor(r0) */
2108      if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) {
2109         micro_exp2( &r[2], &r[1] );       /* r2 = 2 ^ r1 */
2110         STORE( &r[2], 0, CHAN_X );        /* store r2 */
2111      }
2112      if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
2113         micro_sub( &r[2], &r[0], &r[1] ); /* r2 = r0 - r1 */
2114         STORE( &r[2], 0, CHAN_Y );        /* store r2 */
2115      }
2116      if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
2117         micro_exp2( &r[2], &r[0] );       /* r2 = 2 ^ r0 */
2118         STORE( &r[2], 0, CHAN_Z );        /* store r2 */
2119      }
2120      if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
2121         STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
2122      }
2123      break;
2124
2125   case TGSI_OPCODE_LOG:
2126      FETCH( &r[0], 0, CHAN_X );
2127      micro_abs( &r[2], &r[0] );  /* r2 = abs(r0) */
2128      micro_lg2( &r[1], &r[2] );  /* r1 = lg2(r2) */
2129      micro_flr( &r[0], &r[1] );  /* r0 = floor(r1) */
2130      if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) {
2131         STORE( &r[0], 0, CHAN_X );
2132      }
2133      if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
2134         micro_exp2( &r[0], &r[0] );       /* r0 = 2 ^ r0 */
2135         micro_div( &r[0], &r[2], &r[0] ); /* r0 = r2 / r0 */
2136         STORE( &r[0], 0, CHAN_Y );
2137      }
2138      if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
2139         STORE( &r[1], 0, CHAN_Z );
2140      }
2141      if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
2142         STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
2143      }
2144      break;
2145
2146   case TGSI_OPCODE_MUL:
2147      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index )
2148      {
2149         FETCH(&r[0], 0, chan_index);
2150         FETCH(&r[1], 1, chan_index);
2151
2152         micro_mul( &r[0], &r[0], &r[1] );
2153
2154         STORE(&r[0], 0, chan_index);
2155      }
2156      break;
2157
2158   case TGSI_OPCODE_ADD:
2159      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2160         FETCH( &r[0], 0, chan_index );
2161         FETCH( &r[1], 1, chan_index );
2162         micro_add( &r[0], &r[0], &r[1] );
2163         STORE( &r[0], 0, chan_index );
2164      }
2165      break;
2166
2167   case TGSI_OPCODE_DP3:
2168   /* TGSI_OPCODE_DOT3 */
2169      FETCH( &r[0], 0, CHAN_X );
2170      FETCH( &r[1], 1, CHAN_X );
2171      micro_mul( &r[0], &r[0], &r[1] );
2172
2173      FETCH( &r[1], 0, CHAN_Y );
2174      FETCH( &r[2], 1, CHAN_Y );
2175      micro_mul( &r[1], &r[1], &r[2] );
2176      micro_add( &r[0], &r[0], &r[1] );
2177
2178      FETCH( &r[1], 0, CHAN_Z );
2179      FETCH( &r[2], 1, CHAN_Z );
2180      micro_mul( &r[1], &r[1], &r[2] );
2181      micro_add( &r[0], &r[0], &r[1] );
2182
2183      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2184         STORE( &r[0], 0, chan_index );
2185      }
2186      break;
2187
2188    case TGSI_OPCODE_DP4:
2189    /* TGSI_OPCODE_DOT4 */
2190       FETCH(&r[0], 0, CHAN_X);
2191       FETCH(&r[1], 1, CHAN_X);
2192
2193       micro_mul( &r[0], &r[0], &r[1] );
2194
2195       FETCH(&r[1], 0, CHAN_Y);
2196       FETCH(&r[2], 1, CHAN_Y);
2197
2198       micro_mul( &r[1], &r[1], &r[2] );
2199       micro_add( &r[0], &r[0], &r[1] );
2200
2201       FETCH(&r[1], 0, CHAN_Z);
2202       FETCH(&r[2], 1, CHAN_Z);
2203
2204       micro_mul( &r[1], &r[1], &r[2] );
2205       micro_add( &r[0], &r[0], &r[1] );
2206
2207       FETCH(&r[1], 0, CHAN_W);
2208       FETCH(&r[2], 1, CHAN_W);
2209
2210       micro_mul( &r[1], &r[1], &r[2] );
2211       micro_add( &r[0], &r[0], &r[1] );
2212
2213      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2214         STORE( &r[0], 0, chan_index );
2215      }
2216      break;
2217
2218   case TGSI_OPCODE_DST:
2219      if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) {
2220         STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_X );
2221      }
2222
2223      if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
2224         FETCH( &r[0], 0, CHAN_Y );
2225         FETCH( &r[1], 1, CHAN_Y);
2226         micro_mul( &r[0], &r[0], &r[1] );
2227         STORE( &r[0], 0, CHAN_Y );
2228      }
2229
2230      if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
2231         FETCH( &r[0], 0, CHAN_Z );
2232         STORE( &r[0], 0, CHAN_Z );
2233      }
2234
2235      if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
2236         FETCH( &r[0], 1, CHAN_W );
2237         STORE( &r[0], 0, CHAN_W );
2238      }
2239      break;
2240
2241   case TGSI_OPCODE_MIN:
2242      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2243         FETCH(&r[0], 0, chan_index);
2244         FETCH(&r[1], 1, chan_index);
2245
2246         /* XXX use micro_min()?? */
2247         micro_lt( &r[0], &r[0], &r[1], &r[0], &r[1] );
2248
2249         STORE(&r[0], 0, chan_index);
2250      }
2251      break;
2252
2253   case TGSI_OPCODE_MAX:
2254      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2255         FETCH(&r[0], 0, chan_index);
2256         FETCH(&r[1], 1, chan_index);
2257
2258         /* XXX use micro_max()?? */
2259         micro_lt( &r[0], &r[0], &r[1], &r[1], &r[0] );
2260
2261         STORE(&r[0], 0, chan_index );
2262      }
2263      break;
2264
2265   case TGSI_OPCODE_SLT:
2266   /* TGSI_OPCODE_SETLT */
2267      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2268         FETCH( &r[0], 0, chan_index );
2269         FETCH( &r[1], 1, chan_index );
2270         micro_lt( &r[0], &r[0], &r[1], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] );
2271         STORE( &r[0], 0, chan_index );
2272      }
2273      break;
2274
2275   case TGSI_OPCODE_SGE:
2276   /* TGSI_OPCODE_SETGE */
2277      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2278         FETCH( &r[0], 0, chan_index );
2279         FETCH( &r[1], 1, chan_index );
2280         micro_le( &r[0], &r[1], &r[0], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] );
2281         STORE( &r[0], 0, chan_index );
2282      }
2283      break;
2284
2285   case TGSI_OPCODE_MAD:
2286   /* TGSI_OPCODE_MADD */
2287      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2288         FETCH( &r[0], 0, chan_index );
2289         FETCH( &r[1], 1, chan_index );
2290         micro_mul( &r[0], &r[0], &r[1] );
2291         FETCH( &r[1], 2, chan_index );
2292         micro_add( &r[0], &r[0], &r[1] );
2293         STORE( &r[0], 0, chan_index );
2294      }
2295      break;
2296
2297   case TGSI_OPCODE_SUB:
2298      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2299         FETCH(&r[0], 0, chan_index);
2300         FETCH(&r[1], 1, chan_index);
2301
2302         micro_sub( &r[0], &r[0], &r[1] );
2303
2304         STORE(&r[0], 0, chan_index);
2305      }
2306      break;
2307
2308   case TGSI_OPCODE_LRP:
2309      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2310         FETCH(&r[0], 0, chan_index);
2311         FETCH(&r[1], 1, chan_index);
2312         FETCH(&r[2], 2, chan_index);
2313
2314         micro_sub( &r[1], &r[1], &r[2] );
2315         micro_mul( &r[0], &r[0], &r[1] );
2316         micro_add( &r[0], &r[0], &r[2] );
2317
2318         STORE(&r[0], 0, chan_index);
2319      }
2320      break;
2321
2322   case TGSI_OPCODE_CND:
2323      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2324         FETCH(&r[0], 0, chan_index);
2325         FETCH(&r[1], 1, chan_index);
2326         FETCH(&r[2], 2, chan_index);
2327         micro_lt(&r[0], &mach->Temps[TEMP_HALF_I].xyzw[TEMP_HALF_C], &r[2], &r[0], &r[1]);
2328         STORE(&r[0], 0, chan_index);
2329      }
2330      break;
2331
2332   case TGSI_OPCODE_DP2A:
2333      FETCH( &r[0], 0, CHAN_X );
2334      FETCH( &r[1], 1, CHAN_X );
2335      micro_mul( &r[0], &r[0], &r[1] );
2336
2337      FETCH( &r[1], 0, CHAN_Y );
2338      FETCH( &r[2], 1, CHAN_Y );
2339      micro_mul( &r[1], &r[1], &r[2] );
2340      micro_add( &r[0], &r[0], &r[1] );
2341
2342      FETCH( &r[2], 2, CHAN_X );
2343      micro_add( &r[0], &r[0], &r[2] );
2344
2345      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2346         STORE( &r[0], 0, chan_index );
2347      }
2348      break;
2349
2350   case TGSI_OPCODE_FRC:
2351      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2352         FETCH( &r[0], 0, chan_index );
2353         micro_frc( &r[0], &r[0] );
2354         STORE( &r[0], 0, chan_index );
2355      }
2356      break;
2357
2358   case TGSI_OPCODE_CLAMP:
2359      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2360         FETCH(&r[0], 0, chan_index);
2361         FETCH(&r[1], 1, chan_index);
2362         micro_max(&r[0], &r[0], &r[1]);
2363         FETCH(&r[1], 2, chan_index);
2364         micro_min(&r[0], &r[0], &r[1]);
2365         STORE(&r[0], 0, chan_index);
2366      }
2367      break;
2368
2369   case TGSI_OPCODE_ROUND:
2370   case TGSI_OPCODE_ARR:
2371      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2372         FETCH( &r[0], 0, chan_index );
2373         micro_rnd( &r[0], &r[0] );
2374         STORE( &r[0], 0, chan_index );
2375      }
2376      break;
2377
2378   case TGSI_OPCODE_EX2:
2379      FETCH(&r[0], 0, CHAN_X);
2380
2381#if FAST_MATH
2382      micro_exp2( &r[0], &r[0] );
2383#else
2384      micro_pow( &r[0], &mach->Temps[TEMP_2_I].xyzw[TEMP_2_C], &r[0] );
2385#endif
2386
2387      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2388         STORE( &r[0], 0, chan_index );
2389      }
2390      break;
2391
2392   case TGSI_OPCODE_LG2:
2393      FETCH( &r[0], 0, CHAN_X );
2394      micro_lg2( &r[0], &r[0] );
2395      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2396         STORE( &r[0], 0, chan_index );
2397      }
2398      break;
2399
2400   case TGSI_OPCODE_POW:
2401      FETCH(&r[0], 0, CHAN_X);
2402      FETCH(&r[1], 1, CHAN_X);
2403
2404      micro_pow( &r[0], &r[0], &r[1] );
2405
2406      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2407         STORE( &r[0], 0, chan_index );
2408      }
2409      break;
2410
2411   case TGSI_OPCODE_XPD:
2412      FETCH(&r[0], 0, CHAN_Y);
2413      FETCH(&r[1], 1, CHAN_Z);
2414
2415      micro_mul( &r[2], &r[0], &r[1] );
2416
2417      FETCH(&r[3], 0, CHAN_Z);
2418      FETCH(&r[4], 1, CHAN_Y);
2419
2420      micro_mul( &r[5], &r[3], &r[4] );
2421      micro_sub( &r[2], &r[2], &r[5] );
2422
2423      if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) {
2424         STORE( &r[2], 0, CHAN_X );
2425      }
2426
2427      FETCH(&r[2], 1, CHAN_X);
2428
2429      micro_mul( &r[3], &r[3], &r[2] );
2430
2431      FETCH(&r[5], 0, CHAN_X);
2432
2433      micro_mul( &r[1], &r[1], &r[5] );
2434      micro_sub( &r[3], &r[3], &r[1] );
2435
2436      if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
2437         STORE( &r[3], 0, CHAN_Y );
2438      }
2439
2440      micro_mul( &r[5], &r[5], &r[4] );
2441      micro_mul( &r[0], &r[0], &r[2] );
2442      micro_sub( &r[5], &r[5], &r[0] );
2443
2444      if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
2445         STORE( &r[5], 0, CHAN_Z );
2446      }
2447
2448      if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
2449         STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
2450      }
2451      break;
2452
2453    case TGSI_OPCODE_ABS:
2454       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2455          FETCH(&r[0], 0, chan_index);
2456
2457          micro_abs( &r[0], &r[0] );
2458
2459          STORE(&r[0], 0, chan_index);
2460       }
2461       break;
2462
2463   case TGSI_OPCODE_RCC:
2464      FETCH(&r[0], 0, CHAN_X);
2465      micro_div(&r[0], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &r[0]);
2466      micro_float_clamp(&r[0], &r[0]);
2467      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2468         STORE(&r[0], 0, chan_index);
2469      }
2470      break;
2471
2472   case TGSI_OPCODE_DPH:
2473      FETCH(&r[0], 0, CHAN_X);
2474      FETCH(&r[1], 1, CHAN_X);
2475
2476      micro_mul( &r[0], &r[0], &r[1] );
2477
2478      FETCH(&r[1], 0, CHAN_Y);
2479      FETCH(&r[2], 1, CHAN_Y);
2480
2481      micro_mul( &r[1], &r[1], &r[2] );
2482      micro_add( &r[0], &r[0], &r[1] );
2483
2484      FETCH(&r[1], 0, CHAN_Z);
2485      FETCH(&r[2], 1, CHAN_Z);
2486
2487      micro_mul( &r[1], &r[1], &r[2] );
2488      micro_add( &r[0], &r[0], &r[1] );
2489
2490      FETCH(&r[1], 1, CHAN_W);
2491
2492      micro_add( &r[0], &r[0], &r[1] );
2493
2494      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2495         STORE( &r[0], 0, chan_index );
2496      }
2497      break;
2498
2499   case TGSI_OPCODE_COS:
2500      FETCH(&r[0], 0, CHAN_X);
2501
2502      micro_cos( &r[0], &r[0] );
2503
2504      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2505         STORE( &r[0], 0, chan_index );
2506      }
2507      break;
2508
2509   case TGSI_OPCODE_DDX:
2510      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2511         FETCH( &r[0], 0, chan_index );
2512         micro_ddx( &r[0], &r[0] );
2513         STORE( &r[0], 0, chan_index );
2514      }
2515      break;
2516
2517   case TGSI_OPCODE_DDY:
2518      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2519         FETCH( &r[0], 0, chan_index );
2520         micro_ddy( &r[0], &r[0] );
2521         STORE( &r[0], 0, chan_index );
2522      }
2523      break;
2524
2525   case TGSI_OPCODE_KILP:
2526      exec_kilp (mach, inst);
2527      break;
2528
2529   case TGSI_OPCODE_KIL:
2530      exec_kil (mach, inst);
2531      break;
2532
2533   case TGSI_OPCODE_PK2H:
2534      assert (0);
2535      break;
2536
2537   case TGSI_OPCODE_PK2US:
2538      assert (0);
2539      break;
2540
2541   case TGSI_OPCODE_PK4B:
2542      assert (0);
2543      break;
2544
2545   case TGSI_OPCODE_PK4UB:
2546      assert (0);
2547      break;
2548
2549   case TGSI_OPCODE_RFL:
2550      if (IS_CHANNEL_ENABLED(*inst, CHAN_X) ||
2551          IS_CHANNEL_ENABLED(*inst, CHAN_Y) ||
2552          IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2553         /* r0 = dp3(src0, src0) */
2554         FETCH(&r[2], 0, CHAN_X);
2555         micro_mul(&r[0], &r[2], &r[2]);
2556         FETCH(&r[4], 0, CHAN_Y);
2557         micro_mul(&r[8], &r[4], &r[4]);
2558         micro_add(&r[0], &r[0], &r[8]);
2559         FETCH(&r[6], 0, CHAN_Z);
2560         micro_mul(&r[8], &r[6], &r[6]);
2561         micro_add(&r[0], &r[0], &r[8]);
2562
2563         /* r1 = dp3(src0, src1) */
2564         FETCH(&r[3], 1, CHAN_X);
2565         micro_mul(&r[1], &r[2], &r[3]);
2566         FETCH(&r[5], 1, CHAN_Y);
2567         micro_mul(&r[8], &r[4], &r[5]);
2568         micro_add(&r[1], &r[1], &r[8]);
2569         FETCH(&r[7], 1, CHAN_Z);
2570         micro_mul(&r[8], &r[6], &r[7]);
2571         micro_add(&r[1], &r[1], &r[8]);
2572
2573         /* r1 = 2 * r1 / r0 */
2574         micro_add(&r[1], &r[1], &r[1]);
2575         micro_div(&r[1], &r[1], &r[0]);
2576
2577         if (IS_CHANNEL_ENABLED(*inst, CHAN_X)) {
2578            micro_mul(&r[2], &r[2], &r[1]);
2579            micro_sub(&r[2], &r[2], &r[3]);
2580            STORE(&r[2], 0, CHAN_X);
2581         }
2582         if (IS_CHANNEL_ENABLED(*inst, CHAN_Y)) {
2583            micro_mul(&r[4], &r[4], &r[1]);
2584            micro_sub(&r[4], &r[4], &r[5]);
2585            STORE(&r[4], 0, CHAN_Y);
2586         }
2587         if (IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2588            micro_mul(&r[6], &r[6], &r[1]);
2589            micro_sub(&r[6], &r[6], &r[7]);
2590            STORE(&r[6], 0, CHAN_Z);
2591         }
2592      }
2593      if (IS_CHANNEL_ENABLED(*inst, CHAN_W)) {
2594         STORE(&mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W);
2595      }
2596      break;
2597
2598   case TGSI_OPCODE_SEQ:
2599      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2600         FETCH( &r[0], 0, chan_index );
2601         FETCH( &r[1], 1, chan_index );
2602         micro_eq( &r[0], &r[0], &r[1],
2603                   &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C],
2604                   &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] );
2605         STORE( &r[0], 0, chan_index );
2606      }
2607      break;
2608
2609   case TGSI_OPCODE_SFL:
2610      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2611         STORE(&mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], 0, chan_index);
2612      }
2613      break;
2614
2615   case TGSI_OPCODE_SGT:
2616      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2617         FETCH( &r[0], 0, chan_index );
2618         FETCH( &r[1], 1, chan_index );
2619         micro_le( &r[0], &r[0], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C] );
2620         STORE( &r[0], 0, chan_index );
2621      }
2622      break;
2623
2624   case TGSI_OPCODE_SIN:
2625      FETCH( &r[0], 0, CHAN_X );
2626      micro_sin( &r[0], &r[0] );
2627      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2628         STORE( &r[0], 0, chan_index );
2629      }
2630      break;
2631
2632   case TGSI_OPCODE_SLE:
2633      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2634         FETCH( &r[0], 0, chan_index );
2635         FETCH( &r[1], 1, chan_index );
2636         micro_le( &r[0], &r[0], &r[1], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] );
2637         STORE( &r[0], 0, chan_index );
2638      }
2639      break;
2640
2641   case TGSI_OPCODE_SNE:
2642      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2643         FETCH( &r[0], 0, chan_index );
2644         FETCH( &r[1], 1, chan_index );
2645         micro_eq( &r[0], &r[0], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C] );
2646         STORE( &r[0], 0, chan_index );
2647      }
2648      break;
2649
2650   case TGSI_OPCODE_STR:
2651      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2652         STORE(&mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, chan_index);
2653      }
2654      break;
2655
2656   case TGSI_OPCODE_TEX:
2657      /* simple texture lookup */
2658      /* src[0] = texcoord */
2659      /* src[1] = sampler unit */
2660      exec_tex(mach, inst, FALSE, FALSE);
2661      break;
2662
2663   case TGSI_OPCODE_TXB:
2664      /* Texture lookup with lod bias */
2665      /* src[0] = texcoord (src[0].w = LOD bias) */
2666      /* src[1] = sampler unit */
2667      exec_tex(mach, inst, TRUE, FALSE);
2668      break;
2669
2670   case TGSI_OPCODE_TXD:
2671      /* Texture lookup with explicit partial derivatives */
2672      /* src[0] = texcoord */
2673      /* src[1] = d[strq]/dx */
2674      /* src[2] = d[strq]/dy */
2675      /* src[3] = sampler unit */
2676      assert (0);
2677      break;
2678
2679   case TGSI_OPCODE_TXL:
2680      /* Texture lookup with explicit LOD */
2681      /* src[0] = texcoord (src[0].w = LOD) */
2682      /* src[1] = sampler unit */
2683      exec_tex(mach, inst, TRUE, FALSE);
2684      break;
2685
2686   case TGSI_OPCODE_TXP:
2687      /* Texture lookup with projection */
2688      /* src[0] = texcoord (src[0].w = projection) */
2689      /* src[1] = sampler unit */
2690      exec_tex(mach, inst, FALSE, TRUE);
2691      break;
2692
2693   case TGSI_OPCODE_UP2H:
2694      assert (0);
2695      break;
2696
2697   case TGSI_OPCODE_UP2US:
2698      assert (0);
2699      break;
2700
2701   case TGSI_OPCODE_UP4B:
2702      assert (0);
2703      break;
2704
2705   case TGSI_OPCODE_UP4UB:
2706      assert (0);
2707      break;
2708
2709   case TGSI_OPCODE_X2D:
2710      FETCH(&r[0], 1, CHAN_X);
2711      FETCH(&r[1], 1, CHAN_Y);
2712      if (IS_CHANNEL_ENABLED(*inst, CHAN_X) ||
2713          IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2714         FETCH(&r[2], 2, CHAN_X);
2715         micro_mul(&r[2], &r[2], &r[0]);
2716         FETCH(&r[3], 2, CHAN_Y);
2717         micro_mul(&r[3], &r[3], &r[1]);
2718         micro_add(&r[2], &r[2], &r[3]);
2719         FETCH(&r[3], 0, CHAN_X);
2720         micro_add(&r[2], &r[2], &r[3]);
2721         if (IS_CHANNEL_ENABLED(*inst, CHAN_X)) {
2722            STORE(&r[2], 0, CHAN_X);
2723         }
2724         if (IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2725            STORE(&r[2], 0, CHAN_Z);
2726         }
2727      }
2728      if (IS_CHANNEL_ENABLED(*inst, CHAN_Y) ||
2729          IS_CHANNEL_ENABLED(*inst, CHAN_W)) {
2730         FETCH(&r[2], 2, CHAN_Z);
2731         micro_mul(&r[2], &r[2], &r[0]);
2732         FETCH(&r[3], 2, CHAN_W);
2733         micro_mul(&r[3], &r[3], &r[1]);
2734         micro_add(&r[2], &r[2], &r[3]);
2735         FETCH(&r[3], 0, CHAN_Y);
2736         micro_add(&r[2], &r[2], &r[3]);
2737         if (IS_CHANNEL_ENABLED(*inst, CHAN_Y)) {
2738            STORE(&r[2], 0, CHAN_Y);
2739         }
2740         if (IS_CHANNEL_ENABLED(*inst, CHAN_W)) {
2741            STORE(&r[2], 0, CHAN_W);
2742         }
2743      }
2744      break;
2745
2746   case TGSI_OPCODE_ARA:
2747      assert (0);
2748      break;
2749
2750   case TGSI_OPCODE_BRA:
2751      assert (0);
2752      break;
2753
2754   case TGSI_OPCODE_CAL:
2755      /* skip the call if no execution channels are enabled */
2756      if (mach->ExecMask) {
2757         /* do the call */
2758
2759         /* First, record the depths of the execution stacks.
2760          * This is important for deeply nested/looped return statements.
2761          * We have to unwind the stacks by the correct amount.  For a
2762          * real code generator, we could determine the number of entries
2763          * to pop off each stack with simple static analysis and avoid
2764          * implementing this data structure at run time.
2765          */
2766         mach->CallStack[mach->CallStackTop].CondStackTop = mach->CondStackTop;
2767         mach->CallStack[mach->CallStackTop].LoopStackTop = mach->LoopStackTop;
2768         mach->CallStack[mach->CallStackTop].ContStackTop = mach->ContStackTop;
2769         /* note that PC was already incremented above */
2770         mach->CallStack[mach->CallStackTop].ReturnAddr = *pc;
2771
2772         mach->CallStackTop++;
2773
2774         /* Second, push the Cond, Loop, Cont, Func stacks */
2775         assert(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING);
2776         mach->CondStack[mach->CondStackTop++] = mach->CondMask;
2777         assert(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
2778         mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask;
2779         assert(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
2780         mach->ContStack[mach->ContStackTop++] = mach->ContMask;
2781         assert(mach->FuncStackTop < TGSI_EXEC_MAX_CALL_NESTING);
2782         mach->FuncStack[mach->FuncStackTop++] = mach->FuncMask;
2783
2784         /* Finally, jump to the subroutine */
2785         *pc = inst->InstructionExtLabel.Label;
2786      }
2787      break;
2788
2789   case TGSI_OPCODE_RET:
2790      mach->FuncMask &= ~mach->ExecMask;
2791      UPDATE_EXEC_MASK(mach);
2792
2793      if (mach->FuncMask == 0x0) {
2794         /* really return now (otherwise, keep executing) */
2795
2796         if (mach->CallStackTop == 0) {
2797            /* returning from main() */
2798            *pc = -1;
2799            return;
2800         }
2801
2802         assert(mach->CallStackTop > 0);
2803         mach->CallStackTop--;
2804
2805         mach->CondStackTop = mach->CallStack[mach->CallStackTop].CondStackTop;
2806         mach->CondMask = mach->CondStack[mach->CondStackTop];
2807
2808         mach->LoopStackTop = mach->CallStack[mach->CallStackTop].LoopStackTop;
2809         mach->LoopMask = mach->LoopStack[mach->LoopStackTop];
2810
2811         mach->ContStackTop = mach->CallStack[mach->CallStackTop].ContStackTop;
2812         mach->ContMask = mach->ContStack[mach->ContStackTop];
2813
2814         assert(mach->FuncStackTop > 0);
2815         mach->FuncMask = mach->FuncStack[--mach->FuncStackTop];
2816
2817         *pc = mach->CallStack[mach->CallStackTop].ReturnAddr;
2818
2819         UPDATE_EXEC_MASK(mach);
2820      }
2821      break;
2822
2823   case TGSI_OPCODE_SSG:
2824   /* TGSI_OPCODE_SGN */
2825      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2826         FETCH( &r[0], 0, chan_index );
2827         micro_sgn( &r[0], &r[0] );
2828         STORE( &r[0], 0, chan_index );
2829      }
2830      break;
2831
2832   case TGSI_OPCODE_CMP:
2833      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2834         FETCH(&r[0], 0, chan_index);
2835         FETCH(&r[1], 1, chan_index);
2836         FETCH(&r[2], 2, chan_index);
2837
2838         micro_lt( &r[0], &r[0], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &r[1], &r[2] );
2839
2840         STORE(&r[0], 0, chan_index);
2841      }
2842      break;
2843
2844   case TGSI_OPCODE_SCS:
2845      if( IS_CHANNEL_ENABLED( *inst, CHAN_X ) || IS_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
2846         FETCH( &r[0], 0, CHAN_X );
2847         if (IS_CHANNEL_ENABLED(*inst, CHAN_X)) {
2848            micro_cos(&r[1], &r[0]);
2849            STORE(&r[1], 0, CHAN_X);
2850         }
2851         if (IS_CHANNEL_ENABLED(*inst, CHAN_Y)) {
2852            micro_sin(&r[1], &r[0]);
2853            STORE(&r[1], 0, CHAN_Y);
2854         }
2855      }
2856      if( IS_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
2857         STORE( &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], 0, CHAN_Z );
2858      }
2859      if( IS_CHANNEL_ENABLED( *inst, CHAN_W ) ) {
2860         STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
2861      }
2862      break;
2863
2864   case TGSI_OPCODE_NRM:
2865      /* 3-component vector normalize */
2866      if(IS_CHANNEL_ENABLED(*inst, CHAN_X) ||
2867         IS_CHANNEL_ENABLED(*inst, CHAN_Y) ||
2868         IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2869         /* r3 = sqrt(dp3(src0, src0)) */
2870         FETCH(&r[0], 0, CHAN_X);
2871         micro_mul(&r[3], &r[0], &r[0]);
2872         FETCH(&r[1], 0, CHAN_Y);
2873         micro_mul(&r[4], &r[1], &r[1]);
2874         micro_add(&r[3], &r[3], &r[4]);
2875         FETCH(&r[2], 0, CHAN_Z);
2876         micro_mul(&r[4], &r[2], &r[2]);
2877         micro_add(&r[3], &r[3], &r[4]);
2878         micro_sqrt(&r[3], &r[3]);
2879
2880         if (IS_CHANNEL_ENABLED(*inst, CHAN_X)) {
2881            micro_div(&r[0], &r[0], &r[3]);
2882            STORE(&r[0], 0, CHAN_X);
2883         }
2884         if (IS_CHANNEL_ENABLED(*inst, CHAN_Y)) {
2885            micro_div(&r[1], &r[1], &r[3]);
2886            STORE(&r[1], 0, CHAN_Y);
2887         }
2888         if (IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2889            micro_div(&r[2], &r[2], &r[3]);
2890            STORE(&r[2], 0, CHAN_Z);
2891         }
2892      }
2893      if (IS_CHANNEL_ENABLED(*inst, CHAN_W)) {
2894         STORE(&mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W);
2895      }
2896      break;
2897
2898   case TGSI_OPCODE_NRM4:
2899      /* 4-component vector normalize */
2900      {
2901         union tgsi_exec_channel tmp, dot;
2902
2903         /* tmp = dp4(src0, src0): */
2904         FETCH( &r[0], 0, CHAN_X );
2905         micro_mul( &tmp, &r[0], &r[0] );
2906
2907         FETCH( &r[1], 0, CHAN_Y );
2908         micro_mul( &dot, &r[1], &r[1] );
2909         micro_add( &tmp, &tmp, &dot );
2910
2911         FETCH( &r[2], 0, CHAN_Z );
2912         micro_mul( &dot, &r[2], &r[2] );
2913         micro_add( &tmp, &tmp, &dot );
2914
2915         FETCH( &r[3], 0, CHAN_W );
2916         micro_mul( &dot, &r[3], &r[3] );
2917         micro_add( &tmp, &tmp, &dot );
2918
2919         /* tmp = 1 / sqrt(tmp) */
2920         micro_sqrt( &tmp, &tmp );
2921         micro_div( &tmp, &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &tmp );
2922
2923         FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2924            /* chan = chan * tmp */
2925            micro_mul( &r[chan_index], &tmp, &r[chan_index] );
2926            STORE( &r[chan_index], 0, chan_index );
2927         }
2928      }
2929      break;
2930
2931   case TGSI_OPCODE_DIV:
2932      assert( 0 );
2933      break;
2934
2935   case TGSI_OPCODE_DP2:
2936      FETCH( &r[0], 0, CHAN_X );
2937      FETCH( &r[1], 1, CHAN_X );
2938      micro_mul( &r[0], &r[0], &r[1] );
2939
2940      FETCH( &r[1], 0, CHAN_Y );
2941      FETCH( &r[2], 1, CHAN_Y );
2942      micro_mul( &r[1], &r[1], &r[2] );
2943      micro_add( &r[0], &r[0], &r[1] );
2944
2945      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2946         STORE( &r[0], 0, chan_index );
2947      }
2948      break;
2949
2950   case TGSI_OPCODE_IF:
2951      /* push CondMask */
2952      assert(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING);
2953      mach->CondStack[mach->CondStackTop++] = mach->CondMask;
2954      FETCH( &r[0], 0, CHAN_X );
2955      /* update CondMask */
2956      if( ! r[0].u[0] ) {
2957         mach->CondMask &= ~0x1;
2958      }
2959      if( ! r[0].u[1] ) {
2960         mach->CondMask &= ~0x2;
2961      }
2962      if( ! r[0].u[2] ) {
2963         mach->CondMask &= ~0x4;
2964      }
2965      if( ! r[0].u[3] ) {
2966         mach->CondMask &= ~0x8;
2967      }
2968      UPDATE_EXEC_MASK(mach);
2969      /* Todo: If CondMask==0, jump to ELSE */
2970      break;
2971
2972   case TGSI_OPCODE_ELSE:
2973      /* invert CondMask wrt previous mask */
2974      {
2975         uint prevMask;
2976         assert(mach->CondStackTop > 0);
2977         prevMask = mach->CondStack[mach->CondStackTop - 1];
2978         mach->CondMask = ~mach->CondMask & prevMask;
2979         UPDATE_EXEC_MASK(mach);
2980         /* Todo: If CondMask==0, jump to ENDIF */
2981      }
2982      break;
2983
2984   case TGSI_OPCODE_ENDIF:
2985      /* pop CondMask */
2986      assert(mach->CondStackTop > 0);
2987      mach->CondMask = mach->CondStack[--mach->CondStackTop];
2988      UPDATE_EXEC_MASK(mach);
2989      break;
2990
2991   case TGSI_OPCODE_END:
2992      /* halt execution */
2993      *pc = -1;
2994      break;
2995
2996   case TGSI_OPCODE_REP:
2997      assert (0);
2998      break;
2999
3000   case TGSI_OPCODE_ENDREP:
3001       assert (0);
3002       break;
3003
3004   case TGSI_OPCODE_PUSHA:
3005      assert (0);
3006      break;
3007
3008   case TGSI_OPCODE_POPA:
3009      assert (0);
3010      break;
3011
3012   case TGSI_OPCODE_CEIL:
3013      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
3014         FETCH( &r[0], 0, chan_index );
3015         micro_ceil( &r[0], &r[0] );
3016         STORE( &r[0], 0, chan_index );
3017      }
3018      break;
3019
3020   case TGSI_OPCODE_I2F:
3021      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
3022         FETCH( &r[0], 0, chan_index );
3023         micro_i2f( &r[0], &r[0] );
3024         STORE( &r[0], 0, chan_index );
3025      }
3026      break;
3027
3028   case TGSI_OPCODE_NOT:
3029      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
3030         FETCH( &r[0], 0, chan_index );
3031         micro_not( &r[0], &r[0] );
3032         STORE( &r[0], 0, chan_index );
3033      }
3034      break;
3035
3036   case TGSI_OPCODE_TRUNC:
3037      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
3038         FETCH( &r[0], 0, chan_index );
3039         micro_trunc( &r[0], &r[0] );
3040         STORE( &r[0], 0, chan_index );
3041      }
3042      break;
3043
3044   case TGSI_OPCODE_SHL:
3045      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
3046         FETCH( &r[0], 0, chan_index );
3047         FETCH( &r[1], 1, chan_index );
3048         micro_shl( &r[0], &r[0], &r[1] );
3049         STORE( &r[0], 0, chan_index );
3050      }
3051      break;
3052
3053   case TGSI_OPCODE_SHR:
3054      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
3055         FETCH( &r[0], 0, chan_index );
3056         FETCH( &r[1], 1, chan_index );
3057         micro_ishr( &r[0], &r[0], &r[1] );
3058         STORE( &r[0], 0, chan_index );
3059      }
3060      break;
3061
3062   case TGSI_OPCODE_AND:
3063      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
3064         FETCH( &r[0], 0, chan_index );
3065         FETCH( &r[1], 1, chan_index );
3066         micro_and( &r[0], &r[0], &r[1] );
3067         STORE( &r[0], 0, chan_index );
3068      }
3069      break;
3070
3071   case TGSI_OPCODE_OR:
3072      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
3073         FETCH( &r[0], 0, chan_index );
3074         FETCH( &r[1], 1, chan_index );
3075         micro_or( &r[0], &r[0], &r[1] );
3076         STORE( &r[0], 0, chan_index );
3077      }
3078      break;
3079
3080   case TGSI_OPCODE_MOD:
3081      assert (0);
3082      break;
3083
3084   case TGSI_OPCODE_XOR:
3085      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
3086         FETCH( &r[0], 0, chan_index );
3087         FETCH( &r[1], 1, chan_index );
3088         micro_xor( &r[0], &r[0], &r[1] );
3089         STORE( &r[0], 0, chan_index );
3090      }
3091      break;
3092
3093   case TGSI_OPCODE_SAD:
3094      assert (0);
3095      break;
3096
3097   case TGSI_OPCODE_TXF:
3098      assert (0);
3099      break;
3100
3101   case TGSI_OPCODE_TXQ:
3102      assert (0);
3103      break;
3104
3105   case TGSI_OPCODE_EMIT:
3106      mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] += 16;
3107      mach->Primitives[mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]]++;
3108      break;
3109
3110   case TGSI_OPCODE_ENDPRIM:
3111      mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]++;
3112      mach->Primitives[mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]] = 0;
3113      break;
3114
3115   case TGSI_OPCODE_BGNFOR:
3116      assert(mach->LoopCounterStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
3117      for (chan_index = 0; chan_index < 3; chan_index++) {
3118         FETCH( &mach->LoopCounterStack[mach->LoopCounterStackTop].xyzw[chan_index], 0, chan_index );
3119      }
3120      STORE( &mach->LoopCounterStack[mach->LoopCounterStackTop].xyzw[CHAN_Y], 0, CHAN_X );
3121      ++mach->LoopCounterStackTop;
3122      /* fall-through (for now) */
3123   case TGSI_OPCODE_BGNLOOP:
3124      /* push LoopMask and ContMasks */
3125      assert(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
3126      mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask;
3127      assert(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
3128      mach->ContStack[mach->ContStackTop++] = mach->ContMask;
3129      assert(mach->LoopLabelStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
3130      mach->LoopLabelStack[mach->LoopLabelStackTop++] = *pc - 1;
3131      break;
3132
3133   case TGSI_OPCODE_ENDFOR:
3134      assert(mach->LoopCounterStackTop > 0);
3135      micro_sub( &mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_X],
3136                 &mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_X],
3137                 &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C] );
3138      /* update LoopMask */
3139      if( mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_X].f[0] <= 0) {
3140         mach->LoopMask &= ~0x1;
3141      }
3142      if( mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_X].f[1] <= 0 ) {
3143         mach->LoopMask &= ~0x2;
3144      }
3145      if( mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_X].f[2] <= 0 ) {
3146         mach->LoopMask &= ~0x4;
3147      }
3148      if( mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_X].f[3] <= 0 ) {
3149         mach->LoopMask &= ~0x8;
3150      }
3151      micro_add( &mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y],
3152                 &mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y],
3153                 &mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Z]);
3154      assert(mach->LoopLabelStackTop > 0);
3155      inst = mach->Instructions + mach->LoopLabelStack[mach->LoopLabelStackTop - 1];
3156      STORE( &mach->LoopCounterStack[mach->LoopCounterStackTop].xyzw[CHAN_Y], 0, CHAN_X );
3157      /* Restore ContMask, but don't pop */
3158      assert(mach->ContStackTop > 0);
3159      mach->ContMask = mach->ContStack[mach->ContStackTop - 1];
3160      UPDATE_EXEC_MASK(mach);
3161      if (mach->ExecMask) {
3162         /* repeat loop: jump to instruction just past BGNLOOP */
3163         assert(mach->LoopLabelStackTop > 0);
3164         *pc = mach->LoopLabelStack[mach->LoopLabelStackTop - 1] + 1;
3165      }
3166      else {
3167         /* exit loop: pop LoopMask */
3168         assert(mach->LoopStackTop > 0);
3169         mach->LoopMask = mach->LoopStack[--mach->LoopStackTop];
3170         /* pop ContMask */
3171         assert(mach->ContStackTop > 0);
3172         mach->ContMask = mach->ContStack[--mach->ContStackTop];
3173         assert(mach->LoopLabelStackTop > 0);
3174         --mach->LoopLabelStackTop;
3175         assert(mach->LoopCounterStackTop > 0);
3176         --mach->LoopCounterStackTop;
3177      }
3178      UPDATE_EXEC_MASK(mach);
3179      break;
3180
3181   case TGSI_OPCODE_ENDLOOP:
3182      /* Restore ContMask, but don't pop */
3183      assert(mach->ContStackTop > 0);
3184      mach->ContMask = mach->ContStack[mach->ContStackTop - 1];
3185      UPDATE_EXEC_MASK(mach);
3186      if (mach->ExecMask) {
3187         /* repeat loop: jump to instruction just past BGNLOOP */
3188         assert(mach->LoopLabelStackTop > 0);
3189         *pc = mach->LoopLabelStack[mach->LoopLabelStackTop - 1] + 1;
3190      }
3191      else {
3192         /* exit loop: pop LoopMask */
3193         assert(mach->LoopStackTop > 0);
3194         mach->LoopMask = mach->LoopStack[--mach->LoopStackTop];
3195         /* pop ContMask */
3196         assert(mach->ContStackTop > 0);
3197         mach->ContMask = mach->ContStack[--mach->ContStackTop];
3198         assert(mach->LoopLabelStackTop > 0);
3199         --mach->LoopLabelStackTop;
3200      }
3201      UPDATE_EXEC_MASK(mach);
3202      break;
3203
3204   case TGSI_OPCODE_BRK:
3205      /* turn off loop channels for each enabled exec channel */
3206      mach->LoopMask &= ~mach->ExecMask;
3207      /* Todo: if mach->LoopMask == 0, jump to end of loop */
3208      UPDATE_EXEC_MASK(mach);
3209      break;
3210
3211   case TGSI_OPCODE_CONT:
3212      /* turn off cont channels for each enabled exec channel */
3213      mach->ContMask &= ~mach->ExecMask;
3214      /* Todo: if mach->LoopMask == 0, jump to end of loop */
3215      UPDATE_EXEC_MASK(mach);
3216      break;
3217
3218   case TGSI_OPCODE_BGNSUB:
3219      /* no-op */
3220      break;
3221
3222   case TGSI_OPCODE_ENDSUB:
3223      /* no-op */
3224      break;
3225
3226   case TGSI_OPCODE_NOP:
3227      break;
3228
3229   default:
3230      assert( 0 );
3231   }
3232}
3233
3234
3235/**
3236 * Run TGSI interpreter.
3237 * \return bitmask of "alive" quad components
3238 */
3239uint
3240tgsi_exec_machine_run( struct tgsi_exec_machine *mach )
3241{
3242   uint i;
3243   int pc = 0;
3244
3245   mach->CondMask = 0xf;
3246   mach->LoopMask = 0xf;
3247   mach->ContMask = 0xf;
3248   mach->FuncMask = 0xf;
3249   mach->ExecMask = 0xf;
3250
3251   assert(mach->CondStackTop == 0);
3252   assert(mach->LoopStackTop == 0);
3253   assert(mach->ContStackTop == 0);
3254   assert(mach->CallStackTop == 0);
3255
3256   mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] = 0;
3257   mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] = 0;
3258
3259   if( mach->Processor == TGSI_PROCESSOR_GEOMETRY ) {
3260      mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0] = 0;
3261      mach->Primitives[0] = 0;
3262   }
3263
3264   for (i = 0; i < QUAD_SIZE; i++) {
3265      mach->Temps[TEMP_CC_I].xyzw[TEMP_CC_C].u[i] =
3266         (TGSI_EXEC_CC_EQ << TGSI_EXEC_CC_X_SHIFT) |
3267         (TGSI_EXEC_CC_EQ << TGSI_EXEC_CC_Y_SHIFT) |
3268         (TGSI_EXEC_CC_EQ << TGSI_EXEC_CC_Z_SHIFT) |
3269         (TGSI_EXEC_CC_EQ << TGSI_EXEC_CC_W_SHIFT);
3270   }
3271
3272   /* execute declarations (interpolants) */
3273   for (i = 0; i < mach->NumDeclarations; i++) {
3274      exec_declaration( mach, mach->Declarations+i );
3275   }
3276
3277   /* execute instructions, until pc is set to -1 */
3278   while (pc != -1) {
3279      assert(pc < (int) mach->NumInstructions);
3280      exec_instruction( mach, mach->Instructions + pc, &pc );
3281   }
3282
3283#if 0
3284   /* we scale from floats in [0,1] to Zbuffer ints in sp_quad_depth_test.c */
3285   if (mach->Processor == TGSI_PROCESSOR_FRAGMENT) {
3286      /*
3287       * Scale back depth component.
3288       */
3289      for (i = 0; i < 4; i++)
3290         mach->Outputs[0].xyzw[2].f[i] *= ctx->DrawBuffer->_DepthMaxF;
3291   }
3292#endif
3293
3294   return ~mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
3295}
3296