tgsi_exec.c revision ba1ca28cc62fed71c77902b95ae4ed36c6bf25f8
1/**************************************************************************
2 *
3 * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28/**
29 * TGSI interpreter/executor.
30 *
31 * Flow control information:
32 *
33 * Since we operate on 'quads' (4 pixels or 4 vertices in parallel)
34 * flow control statements (IF/ELSE/ENDIF, LOOP/ENDLOOP) require special
35 * care since a condition may be true for some quad components but false
36 * for other components.
37 *
38 * We basically execute all statements (even if they're in the part of
39 * an IF/ELSE clause that's "not taken") and use a special mask to
40 * control writing to destination registers.  This is the ExecMask.
41 * See store_dest().
42 *
 * The ExecMask is computed from four other masks (CondMask, LoopMask,
 * ContMask and FuncMask; see UPDATE_EXEC_MASK).  The first three are
 * controlled by the flow control instructions (namely: IF/ELSE/ENDIF,
 * LOOP/ENDLOOP and CONT).
46 *
47 *
48 * Authors:
49 *   Michal Krol
50 *   Brian Paul
51 */
52
53#include "pipe/p_compiler.h"
54#include "pipe/p_state.h"
55#include "pipe/p_shader_tokens.h"
56#include "tgsi/tgsi_dump.h"
57#include "tgsi/tgsi_parse.h"
58#include "tgsi/tgsi_util.h"
59#include "tgsi_exec.h"
60#include "util/u_memory.h"
61#include "util/u_math.h"
62
/*
 * When non-zero, micro_exp2(), micro_lg2() and micro_pow() call the
 * util_fast_*() routines instead of the powf()/logf() based code paths.
 */
#define FAST_MATH 1
64
/*
 * Lane indices into a tgsi_exec_channel, named after a position within the
 * quad.  Used by micro_ddx() and micro_ddy() to pick the lanes to subtract.
 */
#define TILE_TOP_LEFT     0
#define TILE_TOP_RIGHT    1
#define TILE_BOTTOM_LEFT  2
#define TILE_BOTTOM_RIGHT 3
69
/* Indices of the X, Y, Z and W channels of a register. */
#define CHAN_X  0
#define CHAN_Y  1
#define CHAN_Z  2
#define CHAN_W  3
74
/*
 * Shorthand locations of various utility registers (_I = Index, _C = Channel)
 *
 * Each alias simply forwards to the corresponding TGSI_EXEC_TEMP_* name.
 * tgsi_exec_machine_create() fills several of these slots with constants
 * (0, 0x7FFFFFFF, 0x80000000, 0xFFFFFFFF, 1, 2, 128, -128, 3 and 0.5).
 */
#define TEMP_0_I           TGSI_EXEC_TEMP_00000000_I
#define TEMP_0_C           TGSI_EXEC_TEMP_00000000_C
#define TEMP_7F_I          TGSI_EXEC_TEMP_7FFFFFFF_I
#define TEMP_7F_C          TGSI_EXEC_TEMP_7FFFFFFF_C
#define TEMP_80_I          TGSI_EXEC_TEMP_80000000_I
#define TEMP_80_C          TGSI_EXEC_TEMP_80000000_C
#define TEMP_FF_I          TGSI_EXEC_TEMP_FFFFFFFF_I
#define TEMP_FF_C          TGSI_EXEC_TEMP_FFFFFFFF_C
#define TEMP_1_I           TGSI_EXEC_TEMP_ONE_I
#define TEMP_1_C           TGSI_EXEC_TEMP_ONE_C
#define TEMP_2_I           TGSI_EXEC_TEMP_TWO_I
#define TEMP_2_C           TGSI_EXEC_TEMP_TWO_C
#define TEMP_128_I         TGSI_EXEC_TEMP_128_I
#define TEMP_128_C         TGSI_EXEC_TEMP_128_C
#define TEMP_M128_I        TGSI_EXEC_TEMP_MINUS_128_I
#define TEMP_M128_C        TGSI_EXEC_TEMP_MINUS_128_C
#define TEMP_KILMASK_I     TGSI_EXEC_TEMP_KILMASK_I
#define TEMP_KILMASK_C     TGSI_EXEC_TEMP_KILMASK_C
#define TEMP_OUTPUT_I      TGSI_EXEC_TEMP_OUTPUT_I
#define TEMP_OUTPUT_C      TGSI_EXEC_TEMP_OUTPUT_C
#define TEMP_PRIMITIVE_I   TGSI_EXEC_TEMP_PRIMITIVE_I
#define TEMP_PRIMITIVE_C   TGSI_EXEC_TEMP_PRIMITIVE_C
#define TEMP_CC_I          TGSI_EXEC_TEMP_CC_I
#define TEMP_CC_C          TGSI_EXEC_TEMP_CC_C
#define TEMP_3_I           TGSI_EXEC_TEMP_THREE_I
#define TEMP_3_C           TGSI_EXEC_TEMP_THREE_C
#define TEMP_HALF_I        TGSI_EXEC_TEMP_HALF_I
#define TEMP_HALF_C        TGSI_EXEC_TEMP_HALF_C
#define TEMP_R0            TGSI_EXEC_TEMP_R0
#define TEMP_P0            TGSI_EXEC_TEMP_P0
108
/*
 * Non-zero if channel CHAN is set in the write mask of INST's first
 * destination register (FullDstRegisters[0]).  INST is an instruction
 * object (not a pointer).
 */
#define IS_CHANNEL_ENABLED(INST, CHAN)\
   ((INST).FullDstRegisters[0].DstRegister.WriteMask & (1 << (CHAN)))

/* Same test as IS_CHANNEL_ENABLED, for the second destination register. */
#define IS_CHANNEL_ENABLED2(INST, CHAN)\
   ((INST).FullDstRegisters[1].DstRegister.WriteMask & (1 << (CHAN)))

/*
 * Loop header: runs the following statement once for every channel index
 * 0..NUM_CHANNELS-1 that is enabled in the first destination's write mask.
 * CHAN must be an assignable variable.  The expansion ends in a bare 'if',
 * so an 'else' placed directly after the loop body would bind to it.
 */
#define FOR_EACH_ENABLED_CHANNEL(INST, CHAN)\
   for (CHAN = 0; CHAN < NUM_CHANNELS; CHAN++)\
      if (IS_CHANNEL_ENABLED( INST, CHAN ))

/* As FOR_EACH_ENABLED_CHANNEL, but tests the second destination register. */
#define FOR_EACH_ENABLED_CHANNEL2(INST, CHAN)\
   for (CHAN = 0; CHAN < NUM_CHANNELS; CHAN++)\
      if (IS_CHANNEL_ENABLED2( INST, CHAN ))
122
123
/**
 * Recompute the execution mask of the machine pointed to by MACH as the
 * bitwise AND of its CondMask, LoopMask, ContMask and FuncMask.
 *
 * MACH may be any pointer-valued expression (e.g. "&machine"); it is
 * parenthesized in the expansion and evaluated several times, so it must
 * not have side effects.
 */
#define UPDATE_EXEC_MASK(MACH) \
      ((MACH)->ExecMask = (MACH)->CondMask & (MACH)->LoopMask & \
                          (MACH)->ContMask & (MACH)->FuncMask)
127
128
129static const union tgsi_exec_channel ZeroVec =
130   { { 0.0, 0.0, 0.0, 0.0 } };
131
132
#ifdef DEBUG
/**
 * Debug-only sanity check: assert that util_is_inf_or_nan() reports false
 * for each of the four float lanes of the given channel.
 */
static void
check_inf_or_nan(const union tgsi_exec_channel *chan)
{
   assert(!util_is_inf_or_nan(chan->f[0]));
   assert(!util_is_inf_or_nan(chan->f[1]));
   assert(!util_is_inf_or_nan(chan->f[2]));
   assert(!util_is_inf_or_nan(chan->f[3]));
}
#endif
143
144
#ifdef DEBUG
/**
 * Debug helper: print the four float lanes of a channel, prefixed by msg.
 */
static void
print_chan(const char *msg, const union tgsi_exec_channel *chan)
{
   const float *lanes = chan->f;

   debug_printf("%s = {%f, %f, %f, %f}\n",
                msg, lanes[0], lanes[1], lanes[2], lanes[3]);
}
#endif
153
154
#ifdef DEBUG
/**
 * Debug helper: print temporary register number 'index' of the machine,
 * one line per X/Y/Z/W channel with its four float lanes.
 */
static void
print_temp(const struct tgsi_exec_machine *mach, uint index)
{
   int comp = 0;

   debug_printf("Temp[%u] =\n", index);
   while (comp < 4) {
      const float *lanes = mach->Temps[index].xyzw[comp].f;

      debug_printf("  %c: { %f, %f, %f, %f }\n",
                   "XYZW"[comp],
                   lanes[0], lanes[1], lanes[2], lanes[3]);
      comp++;
   }
}
#endif
172
173
174/**
175 * Check if there's a potential src/dst register data dependency when
176 * using SOA execution.
177 * Example:
178 *   MOV T, T.yxwz;
179 * This would expand into:
180 *   MOV t0, t1;
181 *   MOV t1, t0;
182 *   MOV t2, t3;
183 *   MOV t3, t2;
184 * The second instruction will have the wrong value for t0 if executed as-is.
185 */
186boolean
187tgsi_check_soa_dependencies(const struct tgsi_full_instruction *inst)
188{
189   uint i, chan;
190
191   uint writemask = inst->FullDstRegisters[0].DstRegister.WriteMask;
192   if (writemask == TGSI_WRITEMASK_X ||
193       writemask == TGSI_WRITEMASK_Y ||
194       writemask == TGSI_WRITEMASK_Z ||
195       writemask == TGSI_WRITEMASK_W ||
196       writemask == TGSI_WRITEMASK_NONE) {
197      /* no chance of data dependency */
198      return FALSE;
199   }
200
201   /* loop over src regs */
202   for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
203      if ((inst->FullSrcRegisters[i].SrcRegister.File ==
204           inst->FullDstRegisters[0].DstRegister.File) &&
205          (inst->FullSrcRegisters[i].SrcRegister.Index ==
206           inst->FullDstRegisters[0].DstRegister.Index)) {
207         /* loop over dest channels */
208         uint channelsWritten = 0x0;
209         FOR_EACH_ENABLED_CHANNEL(*inst, chan) {
210            /* check if we're reading a channel that's been written */
211            uint swizzle = tgsi_util_get_full_src_register_swizzle(&inst->FullSrcRegisters[i], chan);
212            if (channelsWritten & (1 << swizzle)) {
213               return TRUE;
214            }
215
216            channelsWritten |= (1 << chan);
217         }
218      }
219   }
220   return FALSE;
221}
222
223
224/**
225 * Initialize machine state by expanding tokens to full instructions,
226 * allocating temporary storage, setting up constants, etc.
227 * After this, we can call tgsi_exec_machine_run() many times.
228 */
229void
230tgsi_exec_machine_bind_shader(
231   struct tgsi_exec_machine *mach,
232   const struct tgsi_token *tokens,
233   uint numSamplers,
234   struct tgsi_sampler **samplers)
235{
236   uint k;
237   struct tgsi_parse_context parse;
238   struct tgsi_exec_labels *labels = &mach->Labels;
239   struct tgsi_full_instruction *instructions;
240   struct tgsi_full_declaration *declarations;
241   uint maxInstructions = 10, numInstructions = 0;
242   uint maxDeclarations = 10, numDeclarations = 0;
243   uint instno = 0;
244
245#if 0
246   tgsi_dump(tokens, 0);
247#endif
248
249   util_init_math();
250
251   mach->Tokens = tokens;
252   mach->Samplers = samplers;
253
254   k = tgsi_parse_init (&parse, mach->Tokens);
255   if (k != TGSI_PARSE_OK) {
256      debug_printf( "Problem parsing!\n" );
257      return;
258   }
259
260   mach->Processor = parse.FullHeader.Processor.Processor;
261   mach->ImmLimit = 0;
262   labels->count = 0;
263
264   declarations = (struct tgsi_full_declaration *)
265      MALLOC( maxDeclarations * sizeof(struct tgsi_full_declaration) );
266
267   if (!declarations) {
268      return;
269   }
270
271   instructions = (struct tgsi_full_instruction *)
272      MALLOC( maxInstructions * sizeof(struct tgsi_full_instruction) );
273
274   if (!instructions) {
275      FREE( declarations );
276      return;
277   }
278
279   while( !tgsi_parse_end_of_tokens( &parse ) ) {
280      uint pointer = parse.Position;
281      uint i;
282
283      tgsi_parse_token( &parse );
284      switch( parse.FullToken.Token.Type ) {
285      case TGSI_TOKEN_TYPE_DECLARATION:
286         /* save expanded declaration */
287         if (numDeclarations == maxDeclarations) {
288            declarations = REALLOC(declarations,
289                                   maxDeclarations
290                                   * sizeof(struct tgsi_full_declaration),
291                                   (maxDeclarations + 10)
292                                   * sizeof(struct tgsi_full_declaration));
293            maxDeclarations += 10;
294         }
295         memcpy(declarations + numDeclarations,
296                &parse.FullToken.FullDeclaration,
297                sizeof(declarations[0]));
298         numDeclarations++;
299         break;
300
301      case TGSI_TOKEN_TYPE_IMMEDIATE:
302         {
303            uint size = parse.FullToken.FullImmediate.Immediate.NrTokens - 1;
304            assert( size <= 4 );
305            assert( mach->ImmLimit + 1 <= TGSI_EXEC_NUM_IMMEDIATES );
306
307            for( i = 0; i < size; i++ ) {
308               mach->Imms[mach->ImmLimit][i] =
309		  parse.FullToken.FullImmediate.u[i].Float;
310            }
311            mach->ImmLimit += 1;
312         }
313         break;
314
315      case TGSI_TOKEN_TYPE_INSTRUCTION:
316         assert( labels->count < MAX_LABELS );
317
318         labels->labels[labels->count][0] = instno;
319         labels->labels[labels->count][1] = pointer;
320         labels->count++;
321
322         /* save expanded instruction */
323         if (numInstructions == maxInstructions) {
324            instructions = REALLOC(instructions,
325                                   maxInstructions
326                                   * sizeof(struct tgsi_full_instruction),
327                                   (maxInstructions + 10)
328                                   * sizeof(struct tgsi_full_instruction));
329            maxInstructions += 10;
330         }
331
332         memcpy(instructions + numInstructions,
333                &parse.FullToken.FullInstruction,
334                sizeof(instructions[0]));
335
336         numInstructions++;
337         break;
338
339      default:
340         assert( 0 );
341      }
342   }
343   tgsi_parse_free (&parse);
344
345   if (mach->Declarations) {
346      FREE( mach->Declarations );
347   }
348   mach->Declarations = declarations;
349   mach->NumDeclarations = numDeclarations;
350
351   if (mach->Instructions) {
352      FREE( mach->Instructions );
353   }
354   mach->Instructions = instructions;
355   mach->NumInstructions = numInstructions;
356}
357
358
359struct tgsi_exec_machine *
360tgsi_exec_machine_create( void )
361{
362   struct tgsi_exec_machine *mach;
363   uint i;
364
365   mach = align_malloc( sizeof *mach, 16 );
366   if (!mach)
367      goto fail;
368
369   memset(mach, 0, sizeof(*mach));
370
371   mach->Addrs = &mach->Temps[TGSI_EXEC_TEMP_ADDR];
372   mach->Predicates = &mach->Temps[TGSI_EXEC_TEMP_P0];
373
374   /* Setup constants. */
375   for( i = 0; i < 4; i++ ) {
376      mach->Temps[TEMP_0_I].xyzw[TEMP_0_C].u[i] = 0x00000000;
377      mach->Temps[TEMP_7F_I].xyzw[TEMP_7F_C].u[i] = 0x7FFFFFFF;
378      mach->Temps[TEMP_80_I].xyzw[TEMP_80_C].u[i] = 0x80000000;
379      mach->Temps[TEMP_FF_I].xyzw[TEMP_FF_C].u[i] = 0xFFFFFFFF;
380      mach->Temps[TEMP_1_I].xyzw[TEMP_1_C].f[i] = 1.0f;
381      mach->Temps[TEMP_2_I].xyzw[TEMP_2_C].f[i] = 2.0f;
382      mach->Temps[TEMP_128_I].xyzw[TEMP_128_C].f[i] = 128.0f;
383      mach->Temps[TEMP_M128_I].xyzw[TEMP_M128_C].f[i] = -128.0f;
384      mach->Temps[TEMP_3_I].xyzw[TEMP_3_C].f[i] = 3.0f;
385      mach->Temps[TEMP_HALF_I].xyzw[TEMP_HALF_C].f[i] = 0.5f;
386   }
387
388#ifdef DEBUG
389   /* silence warnings */
390   (void) print_chan;
391   (void) print_temp;
392#endif
393
394   return mach;
395
396fail:
397   align_free(mach);
398   return NULL;
399}
400
401
402void
403tgsi_exec_machine_destroy(struct tgsi_exec_machine *mach)
404{
405   if (mach) {
406      FREE(mach->Instructions);
407      FREE(mach->Declarations);
408   }
409
410   align_free(mach);
411}
412
413
414static void
415micro_abs(
416   union tgsi_exec_channel *dst,
417   const union tgsi_exec_channel *src )
418{
419   dst->f[0] = fabsf( src->f[0] );
420   dst->f[1] = fabsf( src->f[1] );
421   dst->f[2] = fabsf( src->f[2] );
422   dst->f[3] = fabsf( src->f[3] );
423}
424
425static void
426micro_add(
427   union tgsi_exec_channel *dst,
428   const union tgsi_exec_channel *src0,
429   const union tgsi_exec_channel *src1 )
430{
431   dst->f[0] = src0->f[0] + src1->f[0];
432   dst->f[1] = src0->f[1] + src1->f[1];
433   dst->f[2] = src0->f[2] + src1->f[2];
434   dst->f[3] = src0->f[3] + src1->f[3];
435}
436
#if 0
/** Signed integer add per lane.  Currently compiled out. */
static void
micro_iadd(
   union tgsi_exec_channel *dst,
   const union tgsi_exec_channel *src0,
   const union tgsi_exec_channel *src1 )
{
   int lane;

   for (lane = 0; lane < 4; lane++) {
      dst->i[lane] = src0->i[lane] + src1->i[lane];
   }
}
#endif
450
451static void
452micro_and(
453   union tgsi_exec_channel *dst,
454   const union tgsi_exec_channel *src0,
455   const union tgsi_exec_channel *src1 )
456{
457   dst->u[0] = src0->u[0] & src1->u[0];
458   dst->u[1] = src0->u[1] & src1->u[1];
459   dst->u[2] = src0->u[2] & src1->u[2];
460   dst->u[3] = src0->u[3] & src1->u[3];
461}
462
463static void
464micro_ceil(
465   union tgsi_exec_channel *dst,
466   const union tgsi_exec_channel *src )
467{
468   dst->f[0] = ceilf( src->f[0] );
469   dst->f[1] = ceilf( src->f[1] );
470   dst->f[2] = ceilf( src->f[2] );
471   dst->f[3] = ceilf( src->f[3] );
472}
473
474static void
475micro_cos(
476   union tgsi_exec_channel *dst,
477   const union tgsi_exec_channel *src )
478{
479   dst->f[0] = cosf( src->f[0] );
480   dst->f[1] = cosf( src->f[1] );
481   dst->f[2] = cosf( src->f[2] );
482   dst->f[3] = cosf( src->f[3] );
483}
484
485static void
486micro_ddx(
487   union tgsi_exec_channel *dst,
488   const union tgsi_exec_channel *src )
489{
490   dst->f[0] =
491   dst->f[1] =
492   dst->f[2] =
493   dst->f[3] = src->f[TILE_BOTTOM_RIGHT] - src->f[TILE_BOTTOM_LEFT];
494}
495
496static void
497micro_ddy(
498   union tgsi_exec_channel *dst,
499   const union tgsi_exec_channel *src )
500{
501   dst->f[0] =
502   dst->f[1] =
503   dst->f[2] =
504   dst->f[3] = src->f[TILE_TOP_LEFT] - src->f[TILE_BOTTOM_LEFT];
505}
506
507static void
508micro_div(
509   union tgsi_exec_channel *dst,
510   const union tgsi_exec_channel *src0,
511   const union tgsi_exec_channel *src1 )
512{
513   if (src1->f[0] != 0) {
514      dst->f[0] = src0->f[0] / src1->f[0];
515   }
516   if (src1->f[1] != 0) {
517      dst->f[1] = src0->f[1] / src1->f[1];
518   }
519   if (src1->f[2] != 0) {
520      dst->f[2] = src0->f[2] / src1->f[2];
521   }
522   if (src1->f[3] != 0) {
523      dst->f[3] = src0->f[3] / src1->f[3];
524   }
525}
526
#if 0
/** Unsigned integer divide per lane (no zero check).  Currently compiled out. */
static void
micro_udiv(
   union tgsi_exec_channel *dst,
   const union tgsi_exec_channel *src0,
   const union tgsi_exec_channel *src1 )
{
   int lane;

   for (lane = 0; lane < 4; lane++) {
      dst->u[lane] = src0->u[lane] / src1->u[lane];
   }
}
#endif
540
541static void
542micro_eq(
543   union tgsi_exec_channel *dst,
544   const union tgsi_exec_channel *src0,
545   const union tgsi_exec_channel *src1,
546   const union tgsi_exec_channel *src2,
547   const union tgsi_exec_channel *src3 )
548{
549   dst->f[0] = src0->f[0] == src1->f[0] ? src2->f[0] : src3->f[0];
550   dst->f[1] = src0->f[1] == src1->f[1] ? src2->f[1] : src3->f[1];
551   dst->f[2] = src0->f[2] == src1->f[2] ? src2->f[2] : src3->f[2];
552   dst->f[3] = src0->f[3] == src1->f[3] ? src2->f[3] : src3->f[3];
553}
554
#if 0
/** Per signed-int lane: dst = (src0 == src1) ? src2 : src3.  Compiled out. */
static void
micro_ieq(
   union tgsi_exec_channel *dst,
   const union tgsi_exec_channel *src0,
   const union tgsi_exec_channel *src1,
   const union tgsi_exec_channel *src2,
   const union tgsi_exec_channel *src3 )
{
   int lane;

   for (lane = 0; lane < 4; lane++) {
      if (src0->i[lane] == src1->i[lane]) {
         dst->i[lane] = src2->i[lane];
      }
      else {
         dst->i[lane] = src3->i[lane];
      }
   }
}
#endif
570
571static void
572micro_exp2(
573   union tgsi_exec_channel *dst,
574   const union tgsi_exec_channel *src)
575{
576#if FAST_MATH
577   dst->f[0] = util_fast_exp2( src->f[0] );
578   dst->f[1] = util_fast_exp2( src->f[1] );
579   dst->f[2] = util_fast_exp2( src->f[2] );
580   dst->f[3] = util_fast_exp2( src->f[3] );
581#else
582
583#if DEBUG
584   /* Inf is okay for this instruction, so clamp it to silence assertions. */
585   uint i;
586   union tgsi_exec_channel clamped;
587
588   for (i = 0; i < 4; i++) {
589      if (src->f[i] > 127.99999f) {
590         clamped.f[i] = 127.99999f;
591      } else if (src->f[i] < -126.99999f) {
592         clamped.f[i] = -126.99999f;
593      } else {
594         clamped.f[i] = src->f[i];
595      }
596   }
597   src = &clamped;
598#endif
599
600   dst->f[0] = powf( 2.0f, src->f[0] );
601   dst->f[1] = powf( 2.0f, src->f[1] );
602   dst->f[2] = powf( 2.0f, src->f[2] );
603   dst->f[3] = powf( 2.0f, src->f[3] );
604#endif
605}
606
#if 0
/** Convert each float lane to unsigned with a C cast.  Currently compiled out. */
static void
micro_f2ut(
   union tgsi_exec_channel *dst,
   const union tgsi_exec_channel *src )
{
   int lane;

   for (lane = 0; lane < 4; lane++) {
      dst->u[lane] = (uint) src->f[lane];
   }
}
#endif
619
620static void
621micro_float_clamp(union tgsi_exec_channel *dst,
622                  const union tgsi_exec_channel *src)
623{
624   uint i;
625
626   for (i = 0; i < 4; i++) {
627      if (src->f[i] > 0.0f) {
628         if (src->f[i] > 1.884467e+019f)
629            dst->f[i] = 1.884467e+019f;
630         else if (src->f[i] < 5.42101e-020f)
631            dst->f[i] = 5.42101e-020f;
632         else
633            dst->f[i] = src->f[i];
634      }
635      else {
636         if (src->f[i] < -1.884467e+019f)
637            dst->f[i] = -1.884467e+019f;
638         else if (src->f[i] > -5.42101e-020f)
639            dst->f[i] = -5.42101e-020f;
640         else
641            dst->f[i] = src->f[i];
642      }
643   }
644}
645
646static void
647micro_flr(
648   union tgsi_exec_channel *dst,
649   const union tgsi_exec_channel *src )
650{
651   dst->f[0] = floorf( src->f[0] );
652   dst->f[1] = floorf( src->f[1] );
653   dst->f[2] = floorf( src->f[2] );
654   dst->f[3] = floorf( src->f[3] );
655}
656
657static void
658micro_frc(
659   union tgsi_exec_channel *dst,
660   const union tgsi_exec_channel *src )
661{
662   dst->f[0] = src->f[0] - floorf( src->f[0] );
663   dst->f[1] = src->f[1] - floorf( src->f[1] );
664   dst->f[2] = src->f[2] - floorf( src->f[2] );
665   dst->f[3] = src->f[3] - floorf( src->f[3] );
666}
667
668static void
669micro_i2f(
670   union tgsi_exec_channel *dst,
671   const union tgsi_exec_channel *src )
672{
673   dst->f[0] = (float) src->i[0];
674   dst->f[1] = (float) src->i[1];
675   dst->f[2] = (float) src->i[2];
676   dst->f[3] = (float) src->i[3];
677}
678
679static void
680micro_lg2(
681   union tgsi_exec_channel *dst,
682   const union tgsi_exec_channel *src )
683{
684#if FAST_MATH
685   dst->f[0] = util_fast_log2( src->f[0] );
686   dst->f[1] = util_fast_log2( src->f[1] );
687   dst->f[2] = util_fast_log2( src->f[2] );
688   dst->f[3] = util_fast_log2( src->f[3] );
689#else
690   dst->f[0] = logf( src->f[0] ) * 1.442695f;
691   dst->f[1] = logf( src->f[1] ) * 1.442695f;
692   dst->f[2] = logf( src->f[2] ) * 1.442695f;
693   dst->f[3] = logf( src->f[3] ) * 1.442695f;
694#endif
695}
696
697static void
698micro_le(
699   union tgsi_exec_channel *dst,
700   const union tgsi_exec_channel *src0,
701   const union tgsi_exec_channel *src1,
702   const union tgsi_exec_channel *src2,
703   const union tgsi_exec_channel *src3 )
704{
705   dst->f[0] = src0->f[0] <= src1->f[0] ? src2->f[0] : src3->f[0];
706   dst->f[1] = src0->f[1] <= src1->f[1] ? src2->f[1] : src3->f[1];
707   dst->f[2] = src0->f[2] <= src1->f[2] ? src2->f[2] : src3->f[2];
708   dst->f[3] = src0->f[3] <= src1->f[3] ? src2->f[3] : src3->f[3];
709}
710
711static void
712micro_lt(
713   union tgsi_exec_channel *dst,
714   const union tgsi_exec_channel *src0,
715   const union tgsi_exec_channel *src1,
716   const union tgsi_exec_channel *src2,
717   const union tgsi_exec_channel *src3 )
718{
719   dst->f[0] = src0->f[0] < src1->f[0] ? src2->f[0] : src3->f[0];
720   dst->f[1] = src0->f[1] < src1->f[1] ? src2->f[1] : src3->f[1];
721   dst->f[2] = src0->f[2] < src1->f[2] ? src2->f[2] : src3->f[2];
722   dst->f[3] = src0->f[3] < src1->f[3] ? src2->f[3] : src3->f[3];
723}
724
#if 0
/** Per signed-int lane: dst = (src0 < src1) ? src2 : src3.  Compiled out. */
static void
micro_ilt(
   union tgsi_exec_channel *dst,
   const union tgsi_exec_channel *src0,
   const union tgsi_exec_channel *src1,
   const union tgsi_exec_channel *src2,
   const union tgsi_exec_channel *src3 )
{
   int lane;

   for (lane = 0; lane < 4; lane++) {
      if (src0->i[lane] < src1->i[lane]) {
         dst->i[lane] = src2->i[lane];
      }
      else {
         dst->i[lane] = src3->i[lane];
      }
   }
}
#endif
740
#if 0
/** Per unsigned lane: dst = (src0 < src1) ? src2 : src3.  Compiled out. */
static void
micro_ult(
   union tgsi_exec_channel *dst,
   const union tgsi_exec_channel *src0,
   const union tgsi_exec_channel *src1,
   const union tgsi_exec_channel *src2,
   const union tgsi_exec_channel *src3 )
{
   int lane;

   for (lane = 0; lane < 4; lane++) {
      if (src0->u[lane] < src1->u[lane]) {
         dst->u[lane] = src2->u[lane];
      }
      else {
         dst->u[lane] = src3->u[lane];
      }
   }
}
#endif
756
757static void
758micro_max(
759   union tgsi_exec_channel *dst,
760   const union tgsi_exec_channel *src0,
761   const union tgsi_exec_channel *src1 )
762{
763   dst->f[0] = src0->f[0] > src1->f[0] ? src0->f[0] : src1->f[0];
764   dst->f[1] = src0->f[1] > src1->f[1] ? src0->f[1] : src1->f[1];
765   dst->f[2] = src0->f[2] > src1->f[2] ? src0->f[2] : src1->f[2];
766   dst->f[3] = src0->f[3] > src1->f[3] ? src0->f[3] : src1->f[3];
767}
768
#if 0
/** Signed integer maximum per lane.  Currently compiled out. */
static void
micro_imax(
   union tgsi_exec_channel *dst,
   const union tgsi_exec_channel *src0,
   const union tgsi_exec_channel *src1 )
{
   int lane;

   for (lane = 0; lane < 4; lane++) {
      const int a = src0->i[lane];
      const int b = src1->i[lane];

      if (a > b) {
         dst->i[lane] = a;
      }
      else {
         dst->i[lane] = b;
      }
   }
}
#endif
782
#if 0
/** Unsigned integer maximum per lane.  Currently compiled out. */
static void
micro_umax(
   union tgsi_exec_channel *dst,
   const union tgsi_exec_channel *src0,
   const union tgsi_exec_channel *src1 )
{
   int lane;

   for (lane = 0; lane < 4; lane++) {
      if (src0->u[lane] > src1->u[lane]) {
         dst->u[lane] = src0->u[lane];
      }
      else {
         dst->u[lane] = src1->u[lane];
      }
   }
}
#endif
796
797static void
798micro_min(
799   union tgsi_exec_channel *dst,
800   const union tgsi_exec_channel *src0,
801   const union tgsi_exec_channel *src1 )
802{
803   dst->f[0] = src0->f[0] < src1->f[0] ? src0->f[0] : src1->f[0];
804   dst->f[1] = src0->f[1] < src1->f[1] ? src0->f[1] : src1->f[1];
805   dst->f[2] = src0->f[2] < src1->f[2] ? src0->f[2] : src1->f[2];
806   dst->f[3] = src0->f[3] < src1->f[3] ? src0->f[3] : src1->f[3];
807}
808
#if 0
/** Signed integer minimum per lane.  Currently compiled out. */
static void
micro_imin(
   union tgsi_exec_channel *dst,
   const union tgsi_exec_channel *src0,
   const union tgsi_exec_channel *src1 )
{
   int lane;

   for (lane = 0; lane < 4; lane++) {
      const int a = src0->i[lane];
      const int b = src1->i[lane];

      if (a < b) {
         dst->i[lane] = a;
      }
      else {
         dst->i[lane] = b;
      }
   }
}
#endif
822
#if 0
/** Unsigned integer minimum per lane.  Currently compiled out. */
static void
micro_umin(
   union tgsi_exec_channel *dst,
   const union tgsi_exec_channel *src0,
   const union tgsi_exec_channel *src1 )
{
   int lane;

   for (lane = 0; lane < 4; lane++) {
      if (src0->u[lane] < src1->u[lane]) {
         dst->u[lane] = src0->u[lane];
      }
      else {
         dst->u[lane] = src1->u[lane];
      }
   }
}
#endif
836
#if 0
/** Unsigned integer remainder per lane (no zero check).  Currently compiled out. */
static void
micro_umod(
   union tgsi_exec_channel *dst,
   const union tgsi_exec_channel *src0,
   const union tgsi_exec_channel *src1 )
{
   int lane;

   for (lane = 0; lane < 4; lane++) {
      dst->u[lane] = src0->u[lane] % src1->u[lane];
   }
}
#endif
850
851static void
852micro_mul(
853   union tgsi_exec_channel *dst,
854   const union tgsi_exec_channel *src0,
855   const union tgsi_exec_channel *src1 )
856{
857   dst->f[0] = src0->f[0] * src1->f[0];
858   dst->f[1] = src0->f[1] * src1->f[1];
859   dst->f[2] = src0->f[2] * src1->f[2];
860   dst->f[3] = src0->f[3] * src1->f[3];
861}
862
#if 0
/** Signed integer multiply per lane.  Currently compiled out. */
static void
micro_imul(
   union tgsi_exec_channel *dst,
   const union tgsi_exec_channel *src0,
   const union tgsi_exec_channel *src1 )
{
   int lane;

   for (lane = 0; lane < 4; lane++) {
      dst->i[lane] = src0->i[lane] * src1->i[lane];
   }
}
#endif
876
#if 0
/**
 * Signed multiply with two destinations: the int product of each lane is
 * stored in dst1 and every lane of dst0 is set to 0.  Currently compiled out.
 */
static void
micro_imul64(
   union tgsi_exec_channel *dst0,
   union tgsi_exec_channel *dst1,
   const union tgsi_exec_channel *src0,
   const union tgsi_exec_channel *src1 )
{
   int lane;

   for (lane = 0; lane < 4; lane++) {
      dst1->i[lane] = src0->i[lane] * src1->i[lane];
   }
   for (lane = 0; lane < 4; lane++) {
      dst0->i[lane] = 0;
   }
}
#endif
895
#if 0
/**
 * Unsigned multiply with two destinations: the unsigned product of each
 * lane is stored in dst1 and every lane of dst0 is set to 0.
 * Currently compiled out.
 */
static void
micro_umul64(
   union tgsi_exec_channel *dst0,
   union tgsi_exec_channel *dst1,
   const union tgsi_exec_channel *src0,
   const union tgsi_exec_channel *src1 )
{
   int lane;

   for (lane = 0; lane < 4; lane++) {
      dst1->u[lane] = src0->u[lane] * src1->u[lane];
   }
   for (lane = 0; lane < 4; lane++) {
      dst0->u[lane] = 0;
   }
}
#endif
914
915
#if 0
/**
 * Conditional move per lane: dst = src1 where src0 is non-zero (unsigned
 * view), otherwise src2.  Currently compiled out.
 */
static void
micro_movc(
   union tgsi_exec_channel *dst,
   const union tgsi_exec_channel *src0,
   const union tgsi_exec_channel *src1,
   const union tgsi_exec_channel *src2 )
{
   int lane;

   for (lane = 0; lane < 4; lane++) {
      if (src0->u[lane]) {
         dst->u[lane] = src1->u[lane];
      }
      else {
         dst->u[lane] = src2->u[lane];
      }
   }
}
#endif
930
931static void
932micro_neg(
933   union tgsi_exec_channel *dst,
934   const union tgsi_exec_channel *src )
935{
936   dst->f[0] = -src->f[0];
937   dst->f[1] = -src->f[1];
938   dst->f[2] = -src->f[2];
939   dst->f[3] = -src->f[3];
940}
941
#if 0
/** Signed integer negate per lane.  Currently compiled out. */
static void
micro_ineg(
   union tgsi_exec_channel *dst,
   const union tgsi_exec_channel *src )
{
   int lane;

   for (lane = 0; lane < 4; lane++) {
      dst->i[lane] = -src->i[lane];
   }
}
#endif
954
955static void
956micro_not(
957   union tgsi_exec_channel *dst,
958   const union tgsi_exec_channel *src )
959{
960   dst->u[0] = ~src->u[0];
961   dst->u[1] = ~src->u[1];
962   dst->u[2] = ~src->u[2];
963   dst->u[3] = ~src->u[3];
964}
965
966static void
967micro_or(
968   union tgsi_exec_channel *dst,
969   const union tgsi_exec_channel *src0,
970   const union tgsi_exec_channel *src1 )
971{
972   dst->u[0] = src0->u[0] | src1->u[0];
973   dst->u[1] = src0->u[1] | src1->u[1];
974   dst->u[2] = src0->u[2] | src1->u[2];
975   dst->u[3] = src0->u[3] | src1->u[3];
976}
977
978static void
979micro_pow(
980   union tgsi_exec_channel *dst,
981   const union tgsi_exec_channel *src0,
982   const union tgsi_exec_channel *src1 )
983{
984#if FAST_MATH
985   dst->f[0] = util_fast_pow( src0->f[0], src1->f[0] );
986   dst->f[1] = util_fast_pow( src0->f[1], src1->f[1] );
987   dst->f[2] = util_fast_pow( src0->f[2], src1->f[2] );
988   dst->f[3] = util_fast_pow( src0->f[3], src1->f[3] );
989#else
990   dst->f[0] = powf( src0->f[0], src1->f[0] );
991   dst->f[1] = powf( src0->f[1], src1->f[1] );
992   dst->f[2] = powf( src0->f[2], src1->f[2] );
993   dst->f[3] = powf( src0->f[3], src1->f[3] );
994#endif
995}
996
997static void
998micro_rnd(
999   union tgsi_exec_channel *dst,
1000   const union tgsi_exec_channel *src )
1001{
1002   dst->f[0] = floorf( src->f[0] + 0.5f );
1003   dst->f[1] = floorf( src->f[1] + 0.5f );
1004   dst->f[2] = floorf( src->f[2] + 0.5f );
1005   dst->f[3] = floorf( src->f[3] + 0.5f );
1006}
1007
1008static void
1009micro_sgn(
1010   union tgsi_exec_channel *dst,
1011   const union tgsi_exec_channel *src )
1012{
1013   dst->f[0] = src->f[0] < 0.0f ? -1.0f : src->f[0] > 0.0f ? 1.0f : 0.0f;
1014   dst->f[1] = src->f[1] < 0.0f ? -1.0f : src->f[1] > 0.0f ? 1.0f : 0.0f;
1015   dst->f[2] = src->f[2] < 0.0f ? -1.0f : src->f[2] > 0.0f ? 1.0f : 0.0f;
1016   dst->f[3] = src->f[3] < 0.0f ? -1.0f : src->f[3] > 0.0f ? 1.0f : 0.0f;
1017}
1018
1019static void
1020micro_shl(
1021   union tgsi_exec_channel *dst,
1022   const union tgsi_exec_channel *src0,
1023   const union tgsi_exec_channel *src1 )
1024{
1025   dst->i[0] = src0->i[0] << src1->i[0];
1026   dst->i[1] = src0->i[1] << src1->i[1];
1027   dst->i[2] = src0->i[2] << src1->i[2];
1028   dst->i[3] = src0->i[3] << src1->i[3];
1029}
1030
1031static void
1032micro_ishr(
1033   union tgsi_exec_channel *dst,
1034   const union tgsi_exec_channel *src0,
1035   const union tgsi_exec_channel *src1 )
1036{
1037   dst->i[0] = src0->i[0] >> src1->i[0];
1038   dst->i[1] = src0->i[1] >> src1->i[1];
1039   dst->i[2] = src0->i[2] >> src1->i[2];
1040   dst->i[3] = src0->i[3] >> src1->i[3];
1041}
1042
1043static void
1044micro_trunc(
1045   union tgsi_exec_channel *dst,
1046   const union tgsi_exec_channel *src0 )
1047{
1048   dst->f[0] = (float) (int) src0->f[0];
1049   dst->f[1] = (float) (int) src0->f[1];
1050   dst->f[2] = (float) (int) src0->f[2];
1051   dst->f[3] = (float) (int) src0->f[3];
1052}
1053
#if 0
/**
 * Per-lane unsigned right shift: dst = src0 >> src1.
 * Currently compiled out.
 */
static void
micro_ushr(
   union tgsi_exec_channel *dst,
   const union tgsi_exec_channel *src0,
   const union tgsi_exec_channel *src1 )
{
   unsigned lane;

   for (lane = 0; lane < 4; lane++) {
      dst->u[lane] = src0->u[lane] >> src1->u[lane];
   }
}
#endif
1067
1068static void
1069micro_sin(
1070   union tgsi_exec_channel *dst,
1071   const union tgsi_exec_channel *src )
1072{
1073   dst->f[0] = sinf( src->f[0] );
1074   dst->f[1] = sinf( src->f[1] );
1075   dst->f[2] = sinf( src->f[2] );
1076   dst->f[3] = sinf( src->f[3] );
1077}
1078
1079static void
1080micro_sqrt( union tgsi_exec_channel *dst,
1081            const union tgsi_exec_channel *src )
1082{
1083   dst->f[0] = sqrtf( src->f[0] );
1084   dst->f[1] = sqrtf( src->f[1] );
1085   dst->f[2] = sqrtf( src->f[2] );
1086   dst->f[3] = sqrtf( src->f[3] );
1087}
1088
1089static void
1090micro_sub(
1091   union tgsi_exec_channel *dst,
1092   const union tgsi_exec_channel *src0,
1093   const union tgsi_exec_channel *src1 )
1094{
1095   dst->f[0] = src0->f[0] - src1->f[0];
1096   dst->f[1] = src0->f[1] - src1->f[1];
1097   dst->f[2] = src0->f[2] - src1->f[2];
1098   dst->f[3] = src0->f[3] - src1->f[3];
1099}
1100
#if 0
/**
 * Per-lane conversion of an unsigned integer to float.
 * Currently compiled out.
 */
static void
micro_u2f(
   union tgsi_exec_channel *dst,
   const union tgsi_exec_channel *src )
{
   unsigned lane;

   for (lane = 0; lane < 4; lane++) {
      dst->f[lane] = (float) src->u[lane];
   }
}
#endif
1113
1114static void
1115micro_xor(
1116   union tgsi_exec_channel *dst,
1117   const union tgsi_exec_channel *src0,
1118   const union tgsi_exec_channel *src1 )
1119{
1120   dst->u[0] = src0->u[0] ^ src1->u[0];
1121   dst->u[1] = src0->u[1] ^ src1->u[1];
1122   dst->u[2] = src0->u[2] ^ src1->u[2];
1123   dst->u[3] = src0->u[3] ^ src1->u[3];
1124}
1125
1126static void
1127fetch_src_file_channel(
1128   const struct tgsi_exec_machine *mach,
1129   const uint file,
1130   const uint swizzle,
1131   const union tgsi_exec_channel *index,
1132   union tgsi_exec_channel *chan )
1133{
1134   switch( swizzle ) {
1135   case TGSI_SWIZZLE_X:
1136   case TGSI_SWIZZLE_Y:
1137   case TGSI_SWIZZLE_Z:
1138   case TGSI_SWIZZLE_W:
1139      switch( file ) {
1140      case TGSI_FILE_CONSTANT:
1141         assert(mach->Consts);
1142         if (index->i[0] < 0)
1143            chan->f[0] = 0.0f;
1144         else
1145            chan->f[0] = mach->Consts[index->i[0]][swizzle];
1146         if (index->i[1] < 0)
1147            chan->f[1] = 0.0f;
1148         else
1149            chan->f[1] = mach->Consts[index->i[1]][swizzle];
1150         if (index->i[2] < 0)
1151            chan->f[2] = 0.0f;
1152         else
1153            chan->f[2] = mach->Consts[index->i[2]][swizzle];
1154         if (index->i[3] < 0)
1155            chan->f[3] = 0.0f;
1156         else
1157            chan->f[3] = mach->Consts[index->i[3]][swizzle];
1158         break;
1159
1160      case TGSI_FILE_INPUT:
1161         chan->u[0] = mach->Inputs[index->i[0]].xyzw[swizzle].u[0];
1162         chan->u[1] = mach->Inputs[index->i[1]].xyzw[swizzle].u[1];
1163         chan->u[2] = mach->Inputs[index->i[2]].xyzw[swizzle].u[2];
1164         chan->u[3] = mach->Inputs[index->i[3]].xyzw[swizzle].u[3];
1165         break;
1166
1167      case TGSI_FILE_TEMPORARY:
1168         assert(index->i[0] < TGSI_EXEC_NUM_TEMPS);
1169         chan->u[0] = mach->Temps[index->i[0]].xyzw[swizzle].u[0];
1170         chan->u[1] = mach->Temps[index->i[1]].xyzw[swizzle].u[1];
1171         chan->u[2] = mach->Temps[index->i[2]].xyzw[swizzle].u[2];
1172         chan->u[3] = mach->Temps[index->i[3]].xyzw[swizzle].u[3];
1173         break;
1174
1175      case TGSI_FILE_IMMEDIATE:
1176         assert( index->i[0] < (int) mach->ImmLimit );
1177         chan->f[0] = mach->Imms[index->i[0]][swizzle];
1178         assert( index->i[1] < (int) mach->ImmLimit );
1179         chan->f[1] = mach->Imms[index->i[1]][swizzle];
1180         assert( index->i[2] < (int) mach->ImmLimit );
1181         chan->f[2] = mach->Imms[index->i[2]][swizzle];
1182         assert( index->i[3] < (int) mach->ImmLimit );
1183         chan->f[3] = mach->Imms[index->i[3]][swizzle];
1184         break;
1185
1186      case TGSI_FILE_ADDRESS:
1187         chan->u[0] = mach->Addrs[index->i[0]].xyzw[swizzle].u[0];
1188         chan->u[1] = mach->Addrs[index->i[1]].xyzw[swizzle].u[1];
1189         chan->u[2] = mach->Addrs[index->i[2]].xyzw[swizzle].u[2];
1190         chan->u[3] = mach->Addrs[index->i[3]].xyzw[swizzle].u[3];
1191         break;
1192
1193      case TGSI_FILE_PREDICATE:
1194         assert(index->i[0] < TGSI_EXEC_NUM_PREDS);
1195         assert(index->i[1] < TGSI_EXEC_NUM_PREDS);
1196         assert(index->i[2] < TGSI_EXEC_NUM_PREDS);
1197         assert(index->i[3] < TGSI_EXEC_NUM_PREDS);
1198         chan->u[0] = mach->Predicates[0].xyzw[swizzle].u[0];
1199         chan->u[1] = mach->Predicates[0].xyzw[swizzle].u[1];
1200         chan->u[2] = mach->Predicates[0].xyzw[swizzle].u[2];
1201         chan->u[3] = mach->Predicates[0].xyzw[swizzle].u[3];
1202         break;
1203
1204      case TGSI_FILE_OUTPUT:
1205         /* vertex/fragment output vars can be read too */
1206         chan->u[0] = mach->Outputs[index->i[0]].xyzw[swizzle].u[0];
1207         chan->u[1] = mach->Outputs[index->i[1]].xyzw[swizzle].u[1];
1208         chan->u[2] = mach->Outputs[index->i[2]].xyzw[swizzle].u[2];
1209         chan->u[3] = mach->Outputs[index->i[3]].xyzw[swizzle].u[3];
1210         break;
1211
1212      default:
1213         assert( 0 );
1214      }
1215      break;
1216
1217   default:
1218      assert( 0 );
1219   }
1220}
1221
1222static void
1223fetch_source(
1224   const struct tgsi_exec_machine *mach,
1225   union tgsi_exec_channel *chan,
1226   const struct tgsi_full_src_register *reg,
1227   const uint chan_index )
1228{
1229   union tgsi_exec_channel index;
1230   uint swizzle;
1231
1232   /* We start with a direct index into a register file.
1233    *
1234    *    file[1],
1235    *    where:
1236    *       file = SrcRegister.File
1237    *       [1] = SrcRegister.Index
1238    */
1239   index.i[0] =
1240   index.i[1] =
1241   index.i[2] =
1242   index.i[3] = reg->SrcRegister.Index;
1243
1244   /* There is an extra source register that indirectly subscripts
1245    * a register file. The direct index now becomes an offset
1246    * that is being added to the indirect register.
1247    *
1248    *    file[ind[2].x+1],
1249    *    where:
1250    *       ind = SrcRegisterInd.File
1251    *       [2] = SrcRegisterInd.Index
1252    *       .x = SrcRegisterInd.SwizzleX
1253    */
1254   if (reg->SrcRegister.Indirect) {
1255      union tgsi_exec_channel index2;
1256      union tgsi_exec_channel indir_index;
1257      const uint execmask = mach->ExecMask;
1258      uint i;
1259
1260      /* which address register (always zero now) */
1261      index2.i[0] =
1262      index2.i[1] =
1263      index2.i[2] =
1264      index2.i[3] = reg->SrcRegisterInd.Index;
1265
1266      /* get current value of address register[swizzle] */
1267      swizzle = tgsi_util_get_src_register_swizzle( &reg->SrcRegisterInd, CHAN_X );
1268      fetch_src_file_channel(
1269         mach,
1270         reg->SrcRegisterInd.File,
1271         swizzle,
1272         &index2,
1273         &indir_index );
1274
1275      /* add value of address register to the offset */
1276      index.i[0] += (int) indir_index.f[0];
1277      index.i[1] += (int) indir_index.f[1];
1278      index.i[2] += (int) indir_index.f[2];
1279      index.i[3] += (int) indir_index.f[3];
1280
1281      /* for disabled execution channels, zero-out the index to
1282       * avoid using a potential garbage value.
1283       */
1284      for (i = 0; i < QUAD_SIZE; i++) {
1285         if ((execmask & (1 << i)) == 0)
1286            index.i[i] = 0;
1287      }
1288   }
1289
1290   /* There is an extra source register that is a second
1291    * subscript to a register file. Effectively it means that
1292    * the register file is actually a 2D array of registers.
1293    *
1294    *    file[1][3] == file[1*sizeof(file[1])+3],
1295    *    where:
1296    *       [3] = SrcRegisterDim.Index
1297    */
1298   if (reg->SrcRegister.Dimension) {
1299      /* The size of the first-order array depends on the register file type.
1300       * We need to multiply the index to the first array to get an effective,
1301       * "flat" index that points to the beginning of the second-order array.
1302       */
1303      switch (reg->SrcRegister.File) {
1304      case TGSI_FILE_INPUT:
1305         index.i[0] *= TGSI_EXEC_MAX_INPUT_ATTRIBS;
1306         index.i[1] *= TGSI_EXEC_MAX_INPUT_ATTRIBS;
1307         index.i[2] *= TGSI_EXEC_MAX_INPUT_ATTRIBS;
1308         index.i[3] *= TGSI_EXEC_MAX_INPUT_ATTRIBS;
1309         break;
1310      case TGSI_FILE_CONSTANT:
1311         index.i[0] *= TGSI_EXEC_MAX_CONST_BUFFER;
1312         index.i[1] *= TGSI_EXEC_MAX_CONST_BUFFER;
1313         index.i[2] *= TGSI_EXEC_MAX_CONST_BUFFER;
1314         index.i[3] *= TGSI_EXEC_MAX_CONST_BUFFER;
1315         break;
1316      default:
1317         assert( 0 );
1318      }
1319
1320      index.i[0] += reg->SrcRegisterDim.Index;
1321      index.i[1] += reg->SrcRegisterDim.Index;
1322      index.i[2] += reg->SrcRegisterDim.Index;
1323      index.i[3] += reg->SrcRegisterDim.Index;
1324
1325      /* Again, the second subscript index can be addressed indirectly
1326       * identically to the first one.
1327       * Nothing stops us from indirectly addressing the indirect register,
1328       * but there is no need for that, so we won't exercise it.
1329       *
1330       *    file[1][ind[4].y+3],
1331       *    where:
1332       *       ind = SrcRegisterDimInd.File
1333       *       [4] = SrcRegisterDimInd.Index
1334       *       .y = SrcRegisterDimInd.SwizzleX
1335       */
1336      if (reg->SrcRegisterDim.Indirect) {
1337         union tgsi_exec_channel index2;
1338         union tgsi_exec_channel indir_index;
1339         const uint execmask = mach->ExecMask;
1340         uint i;
1341
1342         index2.i[0] =
1343         index2.i[1] =
1344         index2.i[2] =
1345         index2.i[3] = reg->SrcRegisterDimInd.Index;
1346
1347         swizzle = tgsi_util_get_src_register_swizzle( &reg->SrcRegisterDimInd, CHAN_X );
1348         fetch_src_file_channel(
1349            mach,
1350            reg->SrcRegisterDimInd.File,
1351            swizzle,
1352            &index2,
1353            &indir_index );
1354
1355         index.i[0] += (int) indir_index.f[0];
1356         index.i[1] += (int) indir_index.f[1];
1357         index.i[2] += (int) indir_index.f[2];
1358         index.i[3] += (int) indir_index.f[3];
1359
1360         /* for disabled execution channels, zero-out the index to
1361          * avoid using a potential garbage value.
1362          */
1363         for (i = 0; i < QUAD_SIZE; i++) {
1364            if ((execmask & (1 << i)) == 0)
1365               index.i[i] = 0;
1366         }
1367      }
1368
1369      /* If by any chance there was a need for a 3D array of register
1370       * files, we would have to check whether SrcRegisterDim is followed
1371       * by a dimension register and continue the saga.
1372       */
1373   }
1374
1375   swizzle = tgsi_util_get_full_src_register_swizzle( reg, chan_index );
1376   fetch_src_file_channel(
1377      mach,
1378      reg->SrcRegister.File,
1379      swizzle,
1380      &index,
1381      chan );
1382
1383   switch (tgsi_util_get_full_src_register_sign_mode( reg, chan_index )) {
1384   case TGSI_UTIL_SIGN_CLEAR:
1385      micro_abs( chan, chan );
1386      break;
1387
1388   case TGSI_UTIL_SIGN_SET:
1389      micro_abs( chan, chan );
1390      micro_neg( chan, chan );
1391      break;
1392
1393   case TGSI_UTIL_SIGN_TOGGLE:
1394      micro_neg( chan, chan );
1395      break;
1396
1397   case TGSI_UTIL_SIGN_KEEP:
1398      break;
1399   }
1400}
1401
1402static void
1403store_dest(
1404   struct tgsi_exec_machine *mach,
1405   const union tgsi_exec_channel *chan,
1406   const struct tgsi_full_dst_register *reg,
1407   const struct tgsi_full_instruction *inst,
1408   uint chan_index )
1409{
1410   uint i;
1411   union tgsi_exec_channel null;
1412   union tgsi_exec_channel *dst;
1413   uint execmask = mach->ExecMask;
1414   int offset = 0;  /* indirection offset */
1415   int index;
1416
1417#ifdef DEBUG
1418   check_inf_or_nan(chan);
1419#endif
1420
1421   /* There is an extra source register that indirectly subscripts
1422    * a register file. The direct index now becomes an offset
1423    * that is being added to the indirect register.
1424    *
1425    *    file[ind[2].x+1],
1426    *    where:
1427    *       ind = DstRegisterInd.File
1428    *       [2] = DstRegisterInd.Index
1429    *       .x = DstRegisterInd.SwizzleX
1430    */
1431   if (reg->DstRegister.Indirect) {
1432      union tgsi_exec_channel index;
1433      union tgsi_exec_channel indir_index;
1434      uint swizzle;
1435
1436      /* which address register (always zero for now) */
1437      index.i[0] =
1438      index.i[1] =
1439      index.i[2] =
1440      index.i[3] = reg->DstRegisterInd.Index;
1441
1442      /* get current value of address register[swizzle] */
1443      swizzle = tgsi_util_get_src_register_swizzle( &reg->DstRegisterInd, CHAN_X );
1444
1445      /* fetch values from the address/indirection register */
1446      fetch_src_file_channel(
1447         mach,
1448         reg->DstRegisterInd.File,
1449         swizzle,
1450         &index,
1451         &indir_index );
1452
1453      /* save indirection offset */
1454      offset = (int) indir_index.f[0];
1455   }
1456
1457   switch (reg->DstRegister.File) {
1458   case TGSI_FILE_NULL:
1459      dst = &null;
1460      break;
1461
1462   case TGSI_FILE_OUTPUT:
1463      index = mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0]
1464         + reg->DstRegister.Index;
1465      dst = &mach->Outputs[offset + index].xyzw[chan_index];
1466      break;
1467
1468   case TGSI_FILE_TEMPORARY:
1469      index = reg->DstRegister.Index;
1470      assert( index < TGSI_EXEC_NUM_TEMPS );
1471      dst = &mach->Temps[offset + index].xyzw[chan_index];
1472      break;
1473
1474   case TGSI_FILE_ADDRESS:
1475      index = reg->DstRegister.Index;
1476      dst = &mach->Addrs[index].xyzw[chan_index];
1477      break;
1478
1479   case TGSI_FILE_LOOP:
1480      assert(reg->DstRegister.Index == 0);
1481      assert(mach->LoopCounterStackTop > 0);
1482      assert(chan_index == CHAN_X);
1483      dst = &mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[chan_index];
1484      break;
1485
1486   case TGSI_FILE_PREDICATE:
1487      index = reg->DstRegister.Index;
1488      assert(index < TGSI_EXEC_NUM_PREDS);
1489      dst = &mach->Predicates[index].xyzw[chan_index];
1490      break;
1491
1492   default:
1493      assert( 0 );
1494      return;
1495   }
1496
1497   if (inst->Instruction.Predicate) {
1498      uint swizzle;
1499      union tgsi_exec_channel *pred;
1500
1501      switch (chan_index) {
1502      case CHAN_X:
1503         swizzle = inst->InstructionPredicate.SwizzleX;
1504         break;
1505      case CHAN_Y:
1506         swizzle = inst->InstructionPredicate.SwizzleY;
1507         break;
1508      case CHAN_Z:
1509         swizzle = inst->InstructionPredicate.SwizzleZ;
1510         break;
1511      case CHAN_W:
1512         swizzle = inst->InstructionPredicate.SwizzleW;
1513         break;
1514      default:
1515         assert(0);
1516         return;
1517      }
1518
1519      assert(inst->InstructionPredicate.Index == 0);
1520
1521      pred = &mach->Predicates[inst->InstructionPredicate.Index].xyzw[swizzle];
1522
1523      if (inst->InstructionPredicate.Negate) {
1524         for (i = 0; i < QUAD_SIZE; i++) {
1525            if (pred->u[i]) {
1526               execmask &= ~(1 << i);
1527            }
1528         }
1529      } else {
1530         for (i = 0; i < QUAD_SIZE; i++) {
1531            if (!pred->u[i]) {
1532               execmask &= ~(1 << i);
1533            }
1534         }
1535      }
1536   }
1537
1538   switch (inst->Instruction.Saturate) {
1539   case TGSI_SAT_NONE:
1540      for (i = 0; i < QUAD_SIZE; i++)
1541         if (execmask & (1 << i))
1542            dst->i[i] = chan->i[i];
1543      break;
1544
1545   case TGSI_SAT_ZERO_ONE:
1546      for (i = 0; i < QUAD_SIZE; i++)
1547         if (execmask & (1 << i)) {
1548            if (chan->f[i] < 0.0f)
1549               dst->f[i] = 0.0f;
1550            else if (chan->f[i] > 1.0f)
1551               dst->f[i] = 1.0f;
1552            else
1553               dst->i[i] = chan->i[i];
1554         }
1555      break;
1556
1557   case TGSI_SAT_MINUS_PLUS_ONE:
1558      for (i = 0; i < QUAD_SIZE; i++)
1559         if (execmask & (1 << i)) {
1560            if (chan->f[i] < -1.0f)
1561               dst->f[i] = -1.0f;
1562            else if (chan->f[i] > 1.0f)
1563               dst->f[i] = 1.0f;
1564            else
1565               dst->i[i] = chan->i[i];
1566         }
1567      break;
1568
1569   default:
1570      assert( 0 );
1571   }
1572}
1573
/* Fetch channel CHAN of source operand INDEX of the current instruction
 * into VAL.  Expects `mach` and `inst` to be in scope where it is used.
 */
#define FETCH(VAL, INDEX, CHAN) \
   fetch_source(mach, (VAL), &inst->FullSrcRegisters[(INDEX)], (CHAN))

/* Store VAL into channel CHAN of destination operand INDEX of the current
 * instruction.  Expects `mach` and `inst` to be in scope where it is used.
 */
#define STORE(VAL, INDEX, CHAN) \
   store_dest(mach, (VAL), &inst->FullDstRegisters[(INDEX)], inst, (CHAN))
1579
1580
1581/**
1582 * Execute ARB-style KIL which is predicated by a src register.
1583 * Kill fragment if any of the four values is less than zero.
1584 */
1585static void
1586exec_kil(struct tgsi_exec_machine *mach,
1587         const struct tgsi_full_instruction *inst)
1588{
1589   uint uniquemask;
1590   uint chan_index;
1591   uint kilmask = 0; /* bit 0 = pixel 0, bit 1 = pixel 1, etc */
1592   union tgsi_exec_channel r[1];
1593
1594   /* This mask stores component bits that were already tested. */
1595   uniquemask = 0;
1596
1597   for (chan_index = 0; chan_index < 4; chan_index++)
1598   {
1599      uint swizzle;
1600      uint i;
1601
1602      /* unswizzle channel */
1603      swizzle = tgsi_util_get_full_src_register_swizzle (
1604                        &inst->FullSrcRegisters[0],
1605                        chan_index);
1606
1607      /* check if the component has not been already tested */
1608      if (uniquemask & (1 << swizzle))
1609         continue;
1610      uniquemask |= 1 << swizzle;
1611
1612      FETCH(&r[0], 0, chan_index);
1613      for (i = 0; i < 4; i++)
1614         if (r[0].f[i] < 0.0f)
1615            kilmask |= 1 << i;
1616   }
1617
1618   mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] |= kilmask;
1619}
1620
1621/**
1622 * Execute NVIDIA-style KIL which is predicated by a condition code.
1623 * Kill fragment if the condition code is TRUE.
1624 */
1625static void
1626exec_kilp(struct tgsi_exec_machine *mach,
1627          const struct tgsi_full_instruction *inst)
1628{
1629   uint kilmask; /* bit 0 = pixel 0, bit 1 = pixel 1, etc */
1630
1631   /* "unconditional" kil */
1632   kilmask = mach->ExecMask;
1633   mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] |= kilmask;
1634}
1635
1636
1637/*
1638 * Fetch a four texture samples using STR texture coordinates.
1639 */
1640static void
1641fetch_texel( struct tgsi_sampler *sampler,
1642             const union tgsi_exec_channel *s,
1643             const union tgsi_exec_channel *t,
1644             const union tgsi_exec_channel *p,
1645             float lodbias,  /* XXX should be float[4] */
1646             union tgsi_exec_channel *r,
1647             union tgsi_exec_channel *g,
1648             union tgsi_exec_channel *b,
1649             union tgsi_exec_channel *a )
1650{
1651   uint j;
1652   float rgba[NUM_CHANNELS][QUAD_SIZE];
1653
1654   sampler->get_samples(sampler, s->f, t->f, p->f, lodbias, rgba);
1655
1656   for (j = 0; j < 4; j++) {
1657      r->f[j] = rgba[0][j];
1658      g->f[j] = rgba[1][j];
1659      b->f[j] = rgba[2][j];
1660      a->f[j] = rgba[3][j];
1661   }
1662}
1663
1664
1665static void
1666exec_tex(struct tgsi_exec_machine *mach,
1667         const struct tgsi_full_instruction *inst,
1668         boolean biasLod,
1669         boolean projected)
1670{
1671   const uint unit = inst->FullSrcRegisters[1].SrcRegister.Index;
1672   union tgsi_exec_channel r[4];
1673   uint chan_index;
1674   float lodBias;
1675
1676   /*   debug_printf("Sampler %u unit %u\n", sampler, unit); */
1677
1678   switch (inst->InstructionTexture.Texture) {
1679   case TGSI_TEXTURE_1D:
1680   case TGSI_TEXTURE_SHADOW1D:
1681
1682      FETCH(&r[0], 0, CHAN_X);
1683
1684      if (projected) {
1685         FETCH(&r[1], 0, CHAN_W);
1686         micro_div( &r[0], &r[0], &r[1] );
1687      }
1688
1689      if (biasLod) {
1690         FETCH(&r[1], 0, CHAN_W);
1691         lodBias = r[2].f[0];
1692      }
1693      else
1694         lodBias = 0.0;
1695
1696      fetch_texel(mach->Samplers[unit],
1697                  &r[0], &ZeroVec, &ZeroVec, lodBias,  /* S, T, P, BIAS */
1698                  &r[0], &r[1], &r[2], &r[3]); /* R, G, B, A */
1699      break;
1700
1701   case TGSI_TEXTURE_2D:
1702   case TGSI_TEXTURE_RECT:
1703   case TGSI_TEXTURE_SHADOW2D:
1704   case TGSI_TEXTURE_SHADOWRECT:
1705
1706      FETCH(&r[0], 0, CHAN_X);
1707      FETCH(&r[1], 0, CHAN_Y);
1708      FETCH(&r[2], 0, CHAN_Z);
1709
1710      if (projected) {
1711         FETCH(&r[3], 0, CHAN_W);
1712         micro_div( &r[0], &r[0], &r[3] );
1713         micro_div( &r[1], &r[1], &r[3] );
1714         micro_div( &r[2], &r[2], &r[3] );
1715      }
1716
1717      if (biasLod) {
1718         FETCH(&r[3], 0, CHAN_W);
1719         lodBias = r[3].f[0];
1720      }
1721      else
1722         lodBias = 0.0;
1723
1724      fetch_texel(mach->Samplers[unit],
1725                  &r[0], &r[1], &r[2], lodBias,  /* inputs */
1726                  &r[0], &r[1], &r[2], &r[3]);  /* outputs */
1727      break;
1728
1729   case TGSI_TEXTURE_3D:
1730   case TGSI_TEXTURE_CUBE:
1731
1732      FETCH(&r[0], 0, CHAN_X);
1733      FETCH(&r[1], 0, CHAN_Y);
1734      FETCH(&r[2], 0, CHAN_Z);
1735
1736      if (projected) {
1737         FETCH(&r[3], 0, CHAN_W);
1738         micro_div( &r[0], &r[0], &r[3] );
1739         micro_div( &r[1], &r[1], &r[3] );
1740         micro_div( &r[2], &r[2], &r[3] );
1741      }
1742
1743      if (biasLod) {
1744         FETCH(&r[3], 0, CHAN_W);
1745         lodBias = r[3].f[0];
1746      }
1747      else
1748         lodBias = 0.0;
1749
1750      fetch_texel(mach->Samplers[unit],
1751                  &r[0], &r[1], &r[2], lodBias,
1752                  &r[0], &r[1], &r[2], &r[3]);
1753      break;
1754
1755   default:
1756      assert (0);
1757   }
1758
1759   FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
1760      STORE( &r[chan_index], 0, chan_index );
1761   }
1762}
1763
1764static void
1765exec_txd(struct tgsi_exec_machine *mach,
1766         const struct tgsi_full_instruction *inst)
1767{
1768   const uint unit = inst->FullSrcRegisters[3].SrcRegister.Index;
1769   union tgsi_exec_channel r[4];
1770   uint chan_index;
1771
1772   /*
1773    * XXX: This is fake TXD -- the derivatives are not taken into account, yet.
1774    */
1775
1776   switch (inst->InstructionTexture.Texture) {
1777   case TGSI_TEXTURE_1D:
1778   case TGSI_TEXTURE_SHADOW1D:
1779
1780      FETCH(&r[0], 0, CHAN_X);
1781
1782      fetch_texel(mach->Samplers[unit],
1783                  &r[0], &ZeroVec, &ZeroVec, 0.0f,  /* S, T, P, BIAS */
1784                  &r[0], &r[1], &r[2], &r[3]);      /* R, G, B, A */
1785      break;
1786
1787   case TGSI_TEXTURE_2D:
1788   case TGSI_TEXTURE_RECT:
1789   case TGSI_TEXTURE_SHADOW2D:
1790   case TGSI_TEXTURE_SHADOWRECT:
1791
1792      FETCH(&r[0], 0, CHAN_X);
1793      FETCH(&r[1], 0, CHAN_Y);
1794      FETCH(&r[2], 0, CHAN_Z);
1795
1796      fetch_texel(mach->Samplers[unit],
1797                  &r[0], &r[1], &r[2], 0.0f,    /* inputs */
1798                  &r[0], &r[1], &r[2], &r[3]);  /* outputs */
1799      break;
1800
1801   case TGSI_TEXTURE_3D:
1802   case TGSI_TEXTURE_CUBE:
1803
1804      FETCH(&r[0], 0, CHAN_X);
1805      FETCH(&r[1], 0, CHAN_Y);
1806      FETCH(&r[2], 0, CHAN_Z);
1807
1808      fetch_texel(mach->Samplers[unit],
1809                  &r[0], &r[1], &r[2], 0.0f,
1810                  &r[0], &r[1], &r[2], &r[3]);
1811      break;
1812
1813   default:
1814      assert(0);
1815   }
1816
1817   FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
1818      STORE(&r[chan_index], 0, chan_index);
1819   }
1820}
1821
1822
1823/**
1824 * Evaluate a constant-valued coefficient at the position of the
1825 * current quad.
1826 */
1827static void
1828eval_constant_coef(
1829   struct tgsi_exec_machine *mach,
1830   unsigned attrib,
1831   unsigned chan )
1832{
1833   unsigned i;
1834
1835   for( i = 0; i < QUAD_SIZE; i++ ) {
1836      mach->Inputs[attrib].xyzw[chan].f[i] = mach->InterpCoefs[attrib].a0[chan];
1837   }
1838}
1839
1840/**
1841 * Evaluate a linear-valued coefficient at the position of the
1842 * current quad.
1843 */
1844static void
1845eval_linear_coef(
1846   struct tgsi_exec_machine *mach,
1847   unsigned attrib,
1848   unsigned chan )
1849{
1850   const float x = mach->QuadPos.xyzw[0].f[0];
1851   const float y = mach->QuadPos.xyzw[1].f[0];
1852   const float dadx = mach->InterpCoefs[attrib].dadx[chan];
1853   const float dady = mach->InterpCoefs[attrib].dady[chan];
1854   const float a0 = mach->InterpCoefs[attrib].a0[chan] + dadx * x + dady * y;
1855   mach->Inputs[attrib].xyzw[chan].f[0] = a0;
1856   mach->Inputs[attrib].xyzw[chan].f[1] = a0 + dadx;
1857   mach->Inputs[attrib].xyzw[chan].f[2] = a0 + dady;
1858   mach->Inputs[attrib].xyzw[chan].f[3] = a0 + dadx + dady;
1859}
1860
1861/**
1862 * Evaluate a perspective-valued coefficient at the position of the
1863 * current quad.
1864 */
1865static void
1866eval_perspective_coef(
1867   struct tgsi_exec_machine *mach,
1868   unsigned attrib,
1869   unsigned chan )
1870{
1871   const float x = mach->QuadPos.xyzw[0].f[0];
1872   const float y = mach->QuadPos.xyzw[1].f[0];
1873   const float dadx = mach->InterpCoefs[attrib].dadx[chan];
1874   const float dady = mach->InterpCoefs[attrib].dady[chan];
1875   const float a0 = mach->InterpCoefs[attrib].a0[chan] + dadx * x + dady * y;
1876   const float *w = mach->QuadPos.xyzw[3].f;
1877   /* divide by W here */
1878   mach->Inputs[attrib].xyzw[chan].f[0] = a0 / w[0];
1879   mach->Inputs[attrib].xyzw[chan].f[1] = (a0 + dadx) / w[1];
1880   mach->Inputs[attrib].xyzw[chan].f[2] = (a0 + dady) / w[2];
1881   mach->Inputs[attrib].xyzw[chan].f[3] = (a0 + dadx + dady) / w[3];
1882}
1883
1884
1885typedef void (* eval_coef_func)(
1886   struct tgsi_exec_machine *mach,
1887   unsigned attrib,
1888   unsigned chan );
1889
1890static void
1891exec_declaration(struct tgsi_exec_machine *mach,
1892                 const struct tgsi_full_declaration *decl)
1893{
1894   if (mach->Processor == TGSI_PROCESSOR_FRAGMENT) {
1895      if (decl->Declaration.File == TGSI_FILE_INPUT) {
1896         uint first, last, mask;
1897
1898         first = decl->DeclarationRange.First;
1899         last = decl->DeclarationRange.Last;
1900         mask = decl->Declaration.UsageMask;
1901
1902         if (decl->Semantic.SemanticName == TGSI_SEMANTIC_POSITION) {
1903            assert(decl->Semantic.SemanticIndex == 0);
1904            assert(first == last);
1905            assert(mask = TGSI_WRITEMASK_XYZW);
1906
1907            mach->Inputs[first] = mach->QuadPos;
1908         } else if (decl->Semantic.SemanticName == TGSI_SEMANTIC_FACE) {
1909            uint i;
1910
1911            assert(decl->Semantic.SemanticIndex == 0);
1912            assert(first == last);
1913
1914            for (i = 0; i < QUAD_SIZE; i++) {
1915               mach->Inputs[first].xyzw[0].f[i] = mach->Face;
1916            }
1917         } else {
1918            eval_coef_func eval;
1919            uint i, j;
1920
1921            switch (decl->Declaration.Interpolate) {
1922            case TGSI_INTERPOLATE_CONSTANT:
1923               eval = eval_constant_coef;
1924               break;
1925
1926            case TGSI_INTERPOLATE_LINEAR:
1927               eval = eval_linear_coef;
1928               break;
1929
1930            case TGSI_INTERPOLATE_PERSPECTIVE:
1931               eval = eval_perspective_coef;
1932               break;
1933
1934            default:
1935               assert(0);
1936               return;
1937            }
1938
1939            for (j = 0; j < NUM_CHANNELS; j++) {
1940               if (mask & (1 << j)) {
1941                  for (i = first; i <= last; i++) {
1942                     eval(mach, i, j);
1943                  }
1944               }
1945            }
1946         }
1947      }
1948   }
1949}
1950
1951static void
1952exec_instruction(
1953   struct tgsi_exec_machine *mach,
1954   const struct tgsi_full_instruction *inst,
1955   int *pc )
1956{
1957   uint chan_index;
1958   union tgsi_exec_channel r[10];
1959   union tgsi_exec_channel d[8];
1960
1961   (*pc)++;
1962
1963   switch (inst->Instruction.Opcode) {
1964   case TGSI_OPCODE_ARL:
1965   case TGSI_OPCODE_FLR:
1966      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
1967         FETCH( &r[0], 0, chan_index );
1968         micro_flr(&d[chan_index], &r[0]);
1969      }
1970      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
1971         STORE(&d[chan_index], 0, chan_index);
1972      }
1973      break;
1974
1975   case TGSI_OPCODE_MOV:
1976      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
1977         FETCH(&d[chan_index], 0, chan_index);
1978      }
1979      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
1980         STORE(&d[chan_index], 0, chan_index);
1981      }
1982      break;
1983
1984   case TGSI_OPCODE_LIT:
1985      if (IS_CHANNEL_ENABLED( *inst, CHAN_Y ) || IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1986         FETCH( &r[0], 0, CHAN_X );
1987         if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1988            micro_max(&d[CHAN_Y], &r[0], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C]);
1989         }
1990
1991         if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1992            FETCH( &r[1], 0, CHAN_Y );
1993            micro_max( &r[1], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] );
1994
1995            FETCH( &r[2], 0, CHAN_W );
1996            micro_min( &r[2], &r[2], &mach->Temps[TEMP_128_I].xyzw[TEMP_128_C] );
1997            micro_max( &r[2], &r[2], &mach->Temps[TEMP_M128_I].xyzw[TEMP_M128_C] );
1998            micro_pow( &r[1], &r[1], &r[2] );
1999            micro_lt(&d[CHAN_Z], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &r[0], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C]);
2000         }
2001
2002         if (IS_CHANNEL_ENABLED(*inst, CHAN_Y)) {
2003            STORE(&d[CHAN_Y], 0, CHAN_Y);
2004         }
2005         if (IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2006            STORE(&d[CHAN_Z], 0, CHAN_Z);
2007         }
2008      }
2009      if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) {
2010         STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_X );
2011      }
2012      if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
2013         STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
2014      }
2015      break;
2016
2017   case TGSI_OPCODE_RCP:
2018   /* TGSI_OPCODE_RECIP */
2019      FETCH( &r[0], 0, CHAN_X );
2020      micro_div( &r[0], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &r[0] );
2021      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2022         STORE( &r[0], 0, chan_index );
2023      }
2024      break;
2025
2026   case TGSI_OPCODE_RSQ:
2027   /* TGSI_OPCODE_RECIPSQRT */
2028      FETCH( &r[0], 0, CHAN_X );
2029      micro_abs( &r[0], &r[0] );
2030      micro_sqrt( &r[0], &r[0] );
2031      micro_div( &r[0], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &r[0] );
2032      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2033         STORE( &r[0], 0, chan_index );
2034      }
2035      break;
2036
2037   case TGSI_OPCODE_EXP:
2038      FETCH( &r[0], 0, CHAN_X );
2039      micro_flr( &r[1], &r[0] );  /* r1 = floor(r0) */
2040      if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) {
2041         micro_exp2( &r[2], &r[1] );       /* r2 = 2 ^ r1 */
2042         STORE( &r[2], 0, CHAN_X );        /* store r2 */
2043      }
2044      if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
2045         micro_sub( &r[2], &r[0], &r[1] ); /* r2 = r0 - r1 */
2046         STORE( &r[2], 0, CHAN_Y );        /* store r2 */
2047      }
2048      if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
2049         micro_exp2( &r[2], &r[0] );       /* r2 = 2 ^ r0 */
2050         STORE( &r[2], 0, CHAN_Z );        /* store r2 */
2051      }
2052      if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
2053         STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
2054      }
2055      break;
2056
2057   case TGSI_OPCODE_LOG:
2058      FETCH( &r[0], 0, CHAN_X );
2059      micro_abs( &r[2], &r[0] );  /* r2 = abs(r0) */
2060      micro_lg2( &r[1], &r[2] );  /* r1 = lg2(r2) */
2061      micro_flr( &r[0], &r[1] );  /* r0 = floor(r1) */
2062      if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) {
2063         STORE( &r[0], 0, CHAN_X );
2064      }
2065      if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
2066         micro_exp2( &r[0], &r[0] );       /* r0 = 2 ^ r0 */
2067         micro_div( &r[0], &r[2], &r[0] ); /* r0 = r2 / r0 */
2068         STORE( &r[0], 0, CHAN_Y );
2069      }
2070      if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
2071         STORE( &r[1], 0, CHAN_Z );
2072      }
2073      if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
2074         STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
2075      }
2076      break;
2077
2078   case TGSI_OPCODE_MUL:
2079      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2080         FETCH(&r[0], 0, chan_index);
2081         FETCH(&r[1], 1, chan_index);
2082         micro_mul(&d[chan_index], &r[0], &r[1]);
2083      }
2084      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2085         STORE(&d[chan_index], 0, chan_index);
2086      }
2087      break;
2088
2089   case TGSI_OPCODE_ADD:
2090      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2091         FETCH( &r[0], 0, chan_index );
2092         FETCH( &r[1], 1, chan_index );
2093         micro_add(&d[chan_index], &r[0], &r[1]);
2094      }
2095      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2096         STORE(&d[chan_index], 0, chan_index);
2097      }
2098      break;
2099
2100   case TGSI_OPCODE_DP3:
2101   /* TGSI_OPCODE_DOT3 */
2102      FETCH( &r[0], 0, CHAN_X );
2103      FETCH( &r[1], 1, CHAN_X );
2104      micro_mul( &r[0], &r[0], &r[1] );
2105
2106      FETCH( &r[1], 0, CHAN_Y );
2107      FETCH( &r[2], 1, CHAN_Y );
2108      micro_mul( &r[1], &r[1], &r[2] );
2109      micro_add( &r[0], &r[0], &r[1] );
2110
2111      FETCH( &r[1], 0, CHAN_Z );
2112      FETCH( &r[2], 1, CHAN_Z );
2113      micro_mul( &r[1], &r[1], &r[2] );
2114      micro_add( &r[0], &r[0], &r[1] );
2115
2116      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2117         STORE( &r[0], 0, chan_index );
2118      }
2119      break;
2120
2121    case TGSI_OPCODE_DP4:
2122    /* TGSI_OPCODE_DOT4 */
2123       FETCH(&r[0], 0, CHAN_X);
2124       FETCH(&r[1], 1, CHAN_X);
2125
2126       micro_mul( &r[0], &r[0], &r[1] );
2127
2128       FETCH(&r[1], 0, CHAN_Y);
2129       FETCH(&r[2], 1, CHAN_Y);
2130
2131       micro_mul( &r[1], &r[1], &r[2] );
2132       micro_add( &r[0], &r[0], &r[1] );
2133
2134       FETCH(&r[1], 0, CHAN_Z);
2135       FETCH(&r[2], 1, CHAN_Z);
2136
2137       micro_mul( &r[1], &r[1], &r[2] );
2138       micro_add( &r[0], &r[0], &r[1] );
2139
2140       FETCH(&r[1], 0, CHAN_W);
2141       FETCH(&r[2], 1, CHAN_W);
2142
2143       micro_mul( &r[1], &r[1], &r[2] );
2144       micro_add( &r[0], &r[0], &r[1] );
2145
2146      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2147         STORE( &r[0], 0, chan_index );
2148      }
2149      break;
2150
2151   case TGSI_OPCODE_DST:
2152      if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
2153         FETCH( &r[0], 0, CHAN_Y );
2154         FETCH( &r[1], 1, CHAN_Y);
2155         micro_mul(&d[CHAN_Y], &r[0], &r[1]);
2156      }
2157      if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
2158         FETCH(&d[CHAN_Z], 0, CHAN_Z);
2159      }
2160      if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
2161         FETCH(&d[CHAN_W], 1, CHAN_W);
2162      }
2163
2164      if (IS_CHANNEL_ENABLED(*inst, CHAN_X)) {
2165         STORE(&mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_X);
2166      }
2167      if (IS_CHANNEL_ENABLED(*inst, CHAN_Y)) {
2168         STORE(&d[CHAN_Y], 0, CHAN_Y);
2169      }
2170      if (IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2171         STORE(&d[CHAN_Z], 0, CHAN_Z);
2172      }
2173      if (IS_CHANNEL_ENABLED(*inst, CHAN_W)) {
2174         STORE(&d[CHAN_W], 0, CHAN_W);
2175      }
2176      break;
2177
2178   case TGSI_OPCODE_MIN:
2179      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2180         FETCH(&r[0], 0, chan_index);
2181         FETCH(&r[1], 1, chan_index);
2182
2183         /* XXX use micro_min()?? */
2184         micro_lt(&d[chan_index], &r[0], &r[1], &r[0], &r[1]);
2185      }
2186      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2187         STORE(&d[chan_index], 0, chan_index);
2188      }
2189      break;
2190
2191   case TGSI_OPCODE_MAX:
2192      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2193         FETCH(&r[0], 0, chan_index);
2194         FETCH(&r[1], 1, chan_index);
2195
2196         /* XXX use micro_max()?? */
2197         micro_lt(&d[chan_index], &r[0], &r[1], &r[1], &r[0] );
2198      }
2199      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2200         STORE(&d[chan_index], 0, chan_index);
2201      }
2202      break;
2203
2204   case TGSI_OPCODE_SLT:
2205   /* TGSI_OPCODE_SETLT */
2206      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2207         FETCH( &r[0], 0, chan_index );
2208         FETCH( &r[1], 1, chan_index );
2209         micro_lt(&d[chan_index], &r[0], &r[1], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C]);
2210      }
2211      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2212         STORE(&d[chan_index], 0, chan_index);
2213      }
2214      break;
2215
2216   case TGSI_OPCODE_SGE:
2217   /* TGSI_OPCODE_SETGE */
2218      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2219         FETCH( &r[0], 0, chan_index );
2220         FETCH( &r[1], 1, chan_index );
2221         micro_le(&d[chan_index], &r[1], &r[0], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C]);
2222      }
2223      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2224         STORE(&d[chan_index], 0, chan_index);
2225      }
2226      break;
2227
2228   case TGSI_OPCODE_MAD:
2229   /* TGSI_OPCODE_MADD */
2230      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2231         FETCH( &r[0], 0, chan_index );
2232         FETCH( &r[1], 1, chan_index );
2233         micro_mul( &r[0], &r[0], &r[1] );
2234         FETCH( &r[1], 2, chan_index );
2235         micro_add(&d[chan_index], &r[0], &r[1]);
2236      }
2237      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2238         STORE(&d[chan_index], 0, chan_index);
2239      }
2240      break;
2241
2242   case TGSI_OPCODE_SUB:
2243      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2244         FETCH(&r[0], 0, chan_index);
2245         FETCH(&r[1], 1, chan_index);
2246         micro_sub(&d[chan_index], &r[0], &r[1]);
2247      }
2248      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2249         STORE(&d[chan_index], 0, chan_index);
2250      }
2251      break;
2252
2253   case TGSI_OPCODE_LRP:
2254      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2255         FETCH(&r[0], 0, chan_index);
2256         FETCH(&r[1], 1, chan_index);
2257         FETCH(&r[2], 2, chan_index);
2258         micro_sub( &r[1], &r[1], &r[2] );
2259         micro_mul( &r[0], &r[0], &r[1] );
2260         micro_add(&d[chan_index], &r[0], &r[2]);
2261      }
2262      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2263         STORE(&d[chan_index], 0, chan_index);
2264      }
2265      break;
2266
2267   case TGSI_OPCODE_CND:
2268      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2269         FETCH(&r[0], 0, chan_index);
2270         FETCH(&r[1], 1, chan_index);
2271         FETCH(&r[2], 2, chan_index);
2272         micro_lt(&d[chan_index], &mach->Temps[TEMP_HALF_I].xyzw[TEMP_HALF_C], &r[2], &r[0], &r[1]);
2273      }
2274      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2275         STORE(&d[chan_index], 0, chan_index);
2276      }
2277      break;
2278
2279   case TGSI_OPCODE_DP2A:
2280      FETCH( &r[0], 0, CHAN_X );
2281      FETCH( &r[1], 1, CHAN_X );
2282      micro_mul( &r[0], &r[0], &r[1] );
2283
2284      FETCH( &r[1], 0, CHAN_Y );
2285      FETCH( &r[2], 1, CHAN_Y );
2286      micro_mul( &r[1], &r[1], &r[2] );
2287      micro_add( &r[0], &r[0], &r[1] );
2288
2289      FETCH( &r[2], 2, CHAN_X );
2290      micro_add( &r[0], &r[0], &r[2] );
2291
2292      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2293         STORE( &r[0], 0, chan_index );
2294      }
2295      break;
2296
2297   case TGSI_OPCODE_FRC:
2298      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2299         FETCH( &r[0], 0, chan_index );
2300         micro_frc(&d[chan_index], &r[0]);
2301      }
2302      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2303         STORE(&d[chan_index], 0, chan_index);
2304      }
2305      break;
2306
2307   case TGSI_OPCODE_CLAMP:
2308      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2309         FETCH(&r[0], 0, chan_index);
2310         FETCH(&r[1], 1, chan_index);
2311         micro_max(&r[0], &r[0], &r[1]);
2312         FETCH(&r[1], 2, chan_index);
2313         micro_min(&d[chan_index], &r[0], &r[1]);
2314      }
2315      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2316         STORE(&d[chan_index], 0, chan_index);
2317      }
2318      break;
2319
2320   case TGSI_OPCODE_ROUND:
2321   case TGSI_OPCODE_ARR:
2322      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2323         FETCH( &r[0], 0, chan_index );
2324         micro_rnd(&d[chan_index], &r[0]);
2325      }
2326      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2327         STORE(&d[chan_index], 0, chan_index);
2328      }
2329      break;
2330
2331   case TGSI_OPCODE_EX2:
2332      FETCH(&r[0], 0, CHAN_X);
2333
2334      micro_exp2( &r[0], &r[0] );
2335
2336      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2337         STORE( &r[0], 0, chan_index );
2338      }
2339      break;
2340
2341   case TGSI_OPCODE_LG2:
2342      FETCH( &r[0], 0, CHAN_X );
2343      micro_lg2( &r[0], &r[0] );
2344      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2345         STORE( &r[0], 0, chan_index );
2346      }
2347      break;
2348
2349   case TGSI_OPCODE_POW:
2350      FETCH(&r[0], 0, CHAN_X);
2351      FETCH(&r[1], 1, CHAN_X);
2352
2353      micro_pow( &r[0], &r[0], &r[1] );
2354
2355      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2356         STORE( &r[0], 0, chan_index );
2357      }
2358      break;
2359
2360   case TGSI_OPCODE_XPD:
2361      FETCH(&r[0], 0, CHAN_Y);
2362      FETCH(&r[1], 1, CHAN_Z);
2363
2364      micro_mul( &r[2], &r[0], &r[1] );
2365
2366      FETCH(&r[3], 0, CHAN_Z);
2367      FETCH(&r[4], 1, CHAN_Y);
2368
2369      micro_mul( &r[5], &r[3], &r[4] );
2370      micro_sub(&d[CHAN_X], &r[2], &r[5]);
2371
2372      FETCH(&r[2], 1, CHAN_X);
2373
2374      micro_mul( &r[3], &r[3], &r[2] );
2375
2376      FETCH(&r[5], 0, CHAN_X);
2377
2378      micro_mul( &r[1], &r[1], &r[5] );
2379      micro_sub(&d[CHAN_Y], &r[3], &r[1]);
2380
2381      micro_mul( &r[5], &r[5], &r[4] );
2382      micro_mul( &r[0], &r[0], &r[2] );
2383      micro_sub(&d[CHAN_Z], &r[5], &r[0]);
2384
2385      if (IS_CHANNEL_ENABLED(*inst, CHAN_X)) {
2386         STORE(&d[CHAN_X], 0, CHAN_X);
2387      }
2388      if (IS_CHANNEL_ENABLED(*inst, CHAN_Y)) {
2389         STORE(&d[CHAN_Y], 0, CHAN_Y);
2390      }
2391      if (IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2392         STORE(&d[CHAN_Z], 0, CHAN_Z);
2393      }
2394      if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
2395         STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
2396      }
2397      break;
2398
2399    case TGSI_OPCODE_ABS:
2400       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2401          FETCH(&r[0], 0, chan_index);
2402          micro_abs(&d[chan_index], &r[0]);
2403       }
2404       FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2405         STORE(&d[chan_index], 0, chan_index);
2406      }
2407       break;
2408
2409   case TGSI_OPCODE_RCC:
2410      FETCH(&r[0], 0, CHAN_X);
2411      micro_div(&r[0], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &r[0]);
2412      micro_float_clamp(&r[0], &r[0]);
2413      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2414         STORE(&r[0], 0, chan_index);
2415      }
2416      break;
2417
2418   case TGSI_OPCODE_DPH:
2419      FETCH(&r[0], 0, CHAN_X);
2420      FETCH(&r[1], 1, CHAN_X);
2421
2422      micro_mul( &r[0], &r[0], &r[1] );
2423
2424      FETCH(&r[1], 0, CHAN_Y);
2425      FETCH(&r[2], 1, CHAN_Y);
2426
2427      micro_mul( &r[1], &r[1], &r[2] );
2428      micro_add( &r[0], &r[0], &r[1] );
2429
2430      FETCH(&r[1], 0, CHAN_Z);
2431      FETCH(&r[2], 1, CHAN_Z);
2432
2433      micro_mul( &r[1], &r[1], &r[2] );
2434      micro_add( &r[0], &r[0], &r[1] );
2435
2436      FETCH(&r[1], 1, CHAN_W);
2437
2438      micro_add( &r[0], &r[0], &r[1] );
2439
2440      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2441         STORE( &r[0], 0, chan_index );
2442      }
2443      break;
2444
2445   case TGSI_OPCODE_COS:
2446      FETCH(&r[0], 0, CHAN_X);
2447
2448      micro_cos( &r[0], &r[0] );
2449
2450      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2451         STORE( &r[0], 0, chan_index );
2452      }
2453      break;
2454
2455   case TGSI_OPCODE_DDX:
2456      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2457         FETCH( &r[0], 0, chan_index );
2458         micro_ddx(&d[chan_index], &r[0]);
2459      }
2460      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2461         STORE(&d[chan_index], 0, chan_index);
2462      }
2463      break;
2464
2465   case TGSI_OPCODE_DDY:
2466      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2467         FETCH( &r[0], 0, chan_index );
2468         micro_ddy(&d[chan_index], &r[0]);
2469      }
2470      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2471         STORE(&d[chan_index], 0, chan_index);
2472      }
2473      break;
2474
2475   case TGSI_OPCODE_KILP:
2476      exec_kilp (mach, inst);
2477      break;
2478
2479   case TGSI_OPCODE_KIL:
2480      exec_kil (mach, inst);
2481      break;
2482
2483   case TGSI_OPCODE_PK2H:
2484      assert (0);
2485      break;
2486
2487   case TGSI_OPCODE_PK2US:
2488      assert (0);
2489      break;
2490
2491   case TGSI_OPCODE_PK4B:
2492      assert (0);
2493      break;
2494
2495   case TGSI_OPCODE_PK4UB:
2496      assert (0);
2497      break;
2498
2499   case TGSI_OPCODE_RFL:
2500      if (IS_CHANNEL_ENABLED(*inst, CHAN_X) ||
2501          IS_CHANNEL_ENABLED(*inst, CHAN_Y) ||
2502          IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2503         /* r0 = dp3(src0, src0) */
2504         FETCH(&r[2], 0, CHAN_X);
2505         micro_mul(&r[0], &r[2], &r[2]);
2506         FETCH(&r[4], 0, CHAN_Y);
2507         micro_mul(&r[8], &r[4], &r[4]);
2508         micro_add(&r[0], &r[0], &r[8]);
2509         FETCH(&r[6], 0, CHAN_Z);
2510         micro_mul(&r[8], &r[6], &r[6]);
2511         micro_add(&r[0], &r[0], &r[8]);
2512
2513         /* r1 = dp3(src0, src1) */
2514         FETCH(&r[3], 1, CHAN_X);
2515         micro_mul(&r[1], &r[2], &r[3]);
2516         FETCH(&r[5], 1, CHAN_Y);
2517         micro_mul(&r[8], &r[4], &r[5]);
2518         micro_add(&r[1], &r[1], &r[8]);
2519         FETCH(&r[7], 1, CHAN_Z);
2520         micro_mul(&r[8], &r[6], &r[7]);
2521         micro_add(&r[1], &r[1], &r[8]);
2522
2523         /* r1 = 2 * r1 / r0 */
2524         micro_add(&r[1], &r[1], &r[1]);
2525         micro_div(&r[1], &r[1], &r[0]);
2526
2527         if (IS_CHANNEL_ENABLED(*inst, CHAN_X)) {
2528            micro_mul(&r[2], &r[2], &r[1]);
2529            micro_sub(&r[2], &r[2], &r[3]);
2530            STORE(&r[2], 0, CHAN_X);
2531         }
2532         if (IS_CHANNEL_ENABLED(*inst, CHAN_Y)) {
2533            micro_mul(&r[4], &r[4], &r[1]);
2534            micro_sub(&r[4], &r[4], &r[5]);
2535            STORE(&r[4], 0, CHAN_Y);
2536         }
2537         if (IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2538            micro_mul(&r[6], &r[6], &r[1]);
2539            micro_sub(&r[6], &r[6], &r[7]);
2540            STORE(&r[6], 0, CHAN_Z);
2541         }
2542      }
2543      if (IS_CHANNEL_ENABLED(*inst, CHAN_W)) {
2544         STORE(&mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W);
2545      }
2546      break;
2547
2548   case TGSI_OPCODE_SEQ:
2549      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2550         FETCH( &r[0], 0, chan_index );
2551         FETCH( &r[1], 1, chan_index );
2552         micro_eq(&d[chan_index], &r[0], &r[1], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C]);
2553      }
2554      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2555         STORE(&d[chan_index], 0, chan_index);
2556      }
2557      break;
2558
2559   case TGSI_OPCODE_SFL:
2560      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2561         STORE(&mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], 0, chan_index);
2562      }
2563      break;
2564
2565   case TGSI_OPCODE_SGT:
2566      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2567         FETCH( &r[0], 0, chan_index );
2568         FETCH( &r[1], 1, chan_index );
2569         micro_le(&d[chan_index], &r[0], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C]);
2570      }
2571      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2572         STORE(&d[chan_index], 0, chan_index);
2573      }
2574      break;
2575
2576   case TGSI_OPCODE_SIN:
2577      FETCH( &r[0], 0, CHAN_X );
2578      micro_sin( &r[0], &r[0] );
2579      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2580         STORE( &r[0], 0, chan_index );
2581      }
2582      break;
2583
2584   case TGSI_OPCODE_SLE:
2585      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2586         FETCH( &r[0], 0, chan_index );
2587         FETCH( &r[1], 1, chan_index );
2588         micro_le(&d[chan_index], &r[0], &r[1], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C]);
2589      }
2590      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2591         STORE(&d[chan_index], 0, chan_index);
2592      }
2593      break;
2594
2595   case TGSI_OPCODE_SNE:
2596      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2597         FETCH( &r[0], 0, chan_index );
2598         FETCH( &r[1], 1, chan_index );
2599         micro_eq(&d[chan_index], &r[0], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C]);
2600      }
2601      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2602         STORE(&d[chan_index], 0, chan_index);
2603      }
2604      break;
2605
2606   case TGSI_OPCODE_STR:
2607      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2608         STORE(&mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, chan_index);
2609      }
2610      break;
2611
2612   case TGSI_OPCODE_TEX:
2613      /* simple texture lookup */
2614      /* src[0] = texcoord */
2615      /* src[1] = sampler unit */
2616      exec_tex(mach, inst, FALSE, FALSE);
2617      break;
2618
2619   case TGSI_OPCODE_TXB:
2620      /* Texture lookup with lod bias */
2621      /* src[0] = texcoord (src[0].w = LOD bias) */
2622      /* src[1] = sampler unit */
2623      exec_tex(mach, inst, TRUE, FALSE);
2624      break;
2625
2626   case TGSI_OPCODE_TXD:
2627      /* Texture lookup with explicit partial derivatives */
2628      /* src[0] = texcoord */
2629      /* src[1] = d[strq]/dx */
2630      /* src[2] = d[strq]/dy */
2631      /* src[3] = sampler unit */
2632      exec_txd(mach, inst);
2633      break;
2634
2635   case TGSI_OPCODE_TXL:
2636      /* Texture lookup with explicit LOD */
2637      /* src[0] = texcoord (src[0].w = LOD) */
2638      /* src[1] = sampler unit */
2639      exec_tex(mach, inst, TRUE, FALSE);
2640      break;
2641
2642   case TGSI_OPCODE_TXP:
2643      /* Texture lookup with projection */
2644      /* src[0] = texcoord (src[0].w = projection) */
2645      /* src[1] = sampler unit */
2646      exec_tex(mach, inst, FALSE, TRUE);
2647      break;
2648
2649   case TGSI_OPCODE_UP2H:
2650      assert (0);
2651      break;
2652
2653   case TGSI_OPCODE_UP2US:
2654      assert (0);
2655      break;
2656
2657   case TGSI_OPCODE_UP4B:
2658      assert (0);
2659      break;
2660
2661   case TGSI_OPCODE_UP4UB:
2662      assert (0);
2663      break;
2664
2665   case TGSI_OPCODE_X2D:
2666      FETCH(&r[0], 1, CHAN_X);
2667      FETCH(&r[1], 1, CHAN_Y);
2668      if (IS_CHANNEL_ENABLED(*inst, CHAN_X) ||
2669          IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2670         FETCH(&r[2], 2, CHAN_X);
2671         micro_mul(&r[2], &r[2], &r[0]);
2672         FETCH(&r[3], 2, CHAN_Y);
2673         micro_mul(&r[3], &r[3], &r[1]);
2674         micro_add(&r[2], &r[2], &r[3]);
2675         FETCH(&r[3], 0, CHAN_X);
2676         micro_add(&d[CHAN_X], &r[2], &r[3]);
2677
2678      }
2679      if (IS_CHANNEL_ENABLED(*inst, CHAN_Y) ||
2680          IS_CHANNEL_ENABLED(*inst, CHAN_W)) {
2681         FETCH(&r[2], 2, CHAN_Z);
2682         micro_mul(&r[2], &r[2], &r[0]);
2683         FETCH(&r[3], 2, CHAN_W);
2684         micro_mul(&r[3], &r[3], &r[1]);
2685         micro_add(&r[2], &r[2], &r[3]);
2686         FETCH(&r[3], 0, CHAN_Y);
2687         micro_add(&d[CHAN_Y], &r[2], &r[3]);
2688
2689      }
2690      if (IS_CHANNEL_ENABLED(*inst, CHAN_X)) {
2691         STORE(&d[CHAN_X], 0, CHAN_X);
2692      }
2693      if (IS_CHANNEL_ENABLED(*inst, CHAN_Y)) {
2694         STORE(&d[CHAN_Y], 0, CHAN_Y);
2695      }
2696      if (IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2697         STORE(&d[CHAN_X], 0, CHAN_Z);
2698      }
2699      if (IS_CHANNEL_ENABLED(*inst, CHAN_W)) {
2700         STORE(&d[CHAN_Y], 0, CHAN_W);
2701      }
2702      break;
2703
2704   case TGSI_OPCODE_ARA:
2705      assert (0);
2706      break;
2707
2708   case TGSI_OPCODE_BRA:
2709      assert (0);
2710      break;
2711
2712   case TGSI_OPCODE_CAL:
2713      /* skip the call if no execution channels are enabled */
2714      if (mach->ExecMask) {
2715         /* do the call */
2716
2717         /* First, record the depths of the execution stacks.
2718          * This is important for deeply nested/looped return statements.
2719          * We have to unwind the stacks by the correct amount.  For a
2720          * real code generator, we could determine the number of entries
2721          * to pop off each stack with simple static analysis and avoid
2722          * implementing this data structure at run time.
2723          */
2724         mach->CallStack[mach->CallStackTop].CondStackTop = mach->CondStackTop;
2725         mach->CallStack[mach->CallStackTop].LoopStackTop = mach->LoopStackTop;
2726         mach->CallStack[mach->CallStackTop].ContStackTop = mach->ContStackTop;
2727         /* note that PC was already incremented above */
2728         mach->CallStack[mach->CallStackTop].ReturnAddr = *pc;
2729
2730         mach->CallStackTop++;
2731
2732         /* Second, push the Cond, Loop, Cont, Func stacks */
2733         assert(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING);
2734         mach->CondStack[mach->CondStackTop++] = mach->CondMask;
2735         assert(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
2736         mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask;
2737         assert(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
2738         mach->ContStack[mach->ContStackTop++] = mach->ContMask;
2739         assert(mach->FuncStackTop < TGSI_EXEC_MAX_CALL_NESTING);
2740         mach->FuncStack[mach->FuncStackTop++] = mach->FuncMask;
2741
2742         /* Finally, jump to the subroutine */
2743         *pc = inst->InstructionLabel.Label;
2744      }
2745      break;
2746
2747   case TGSI_OPCODE_RET:
2748      mach->FuncMask &= ~mach->ExecMask;
2749      UPDATE_EXEC_MASK(mach);
2750
2751      if (mach->FuncMask == 0x0) {
2752         /* really return now (otherwise, keep executing) */
2753
2754         if (mach->CallStackTop == 0) {
2755            /* returning from main() */
2756            *pc = -1;
2757            return;
2758         }
2759
2760         assert(mach->CallStackTop > 0);
2761         mach->CallStackTop--;
2762
2763         mach->CondStackTop = mach->CallStack[mach->CallStackTop].CondStackTop;
2764         mach->CondMask = mach->CondStack[mach->CondStackTop];
2765
2766         mach->LoopStackTop = mach->CallStack[mach->CallStackTop].LoopStackTop;
2767         mach->LoopMask = mach->LoopStack[mach->LoopStackTop];
2768
2769         mach->ContStackTop = mach->CallStack[mach->CallStackTop].ContStackTop;
2770         mach->ContMask = mach->ContStack[mach->ContStackTop];
2771
2772         assert(mach->FuncStackTop > 0);
2773         mach->FuncMask = mach->FuncStack[--mach->FuncStackTop];
2774
2775         *pc = mach->CallStack[mach->CallStackTop].ReturnAddr;
2776
2777         UPDATE_EXEC_MASK(mach);
2778      }
2779      break;
2780
2781   case TGSI_OPCODE_SSG:
2782   /* TGSI_OPCODE_SGN */
2783      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2784         FETCH( &r[0], 0, chan_index );
2785         micro_sgn(&d[chan_index], &r[0]);
2786      }
2787      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2788         STORE(&d[chan_index], 0, chan_index);
2789      }
2790      break;
2791
2792   case TGSI_OPCODE_CMP:
2793      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2794         FETCH(&r[0], 0, chan_index);
2795         FETCH(&r[1], 1, chan_index);
2796         FETCH(&r[2], 2, chan_index);
2797         micro_lt(&d[chan_index], &r[0], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &r[1], &r[2]);
2798      }
2799      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2800         STORE(&d[chan_index], 0, chan_index);
2801      }
2802      break;
2803
2804   case TGSI_OPCODE_SCS:
2805      if( IS_CHANNEL_ENABLED( *inst, CHAN_X ) || IS_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
2806         FETCH( &r[0], 0, CHAN_X );
2807         if (IS_CHANNEL_ENABLED(*inst, CHAN_X)) {
2808            micro_cos(&r[1], &r[0]);
2809            STORE(&r[1], 0, CHAN_X);
2810         }
2811         if (IS_CHANNEL_ENABLED(*inst, CHAN_Y)) {
2812            micro_sin(&r[1], &r[0]);
2813            STORE(&r[1], 0, CHAN_Y);
2814         }
2815      }
2816      if( IS_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
2817         STORE( &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], 0, CHAN_Z );
2818      }
2819      if( IS_CHANNEL_ENABLED( *inst, CHAN_W ) ) {
2820         STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
2821      }
2822      break;
2823
2824   case TGSI_OPCODE_NRM:
2825      /* 3-component vector normalize */
2826      if(IS_CHANNEL_ENABLED(*inst, CHAN_X) ||
2827         IS_CHANNEL_ENABLED(*inst, CHAN_Y) ||
2828         IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2829         /* r3 = sqrt(dp3(src0, src0)) */
2830         FETCH(&r[0], 0, CHAN_X);
2831         micro_mul(&r[3], &r[0], &r[0]);
2832         FETCH(&r[1], 0, CHAN_Y);
2833         micro_mul(&r[4], &r[1], &r[1]);
2834         micro_add(&r[3], &r[3], &r[4]);
2835         FETCH(&r[2], 0, CHAN_Z);
2836         micro_mul(&r[4], &r[2], &r[2]);
2837         micro_add(&r[3], &r[3], &r[4]);
2838         micro_sqrt(&r[3], &r[3]);
2839
2840         if (IS_CHANNEL_ENABLED(*inst, CHAN_X)) {
2841            micro_div(&r[0], &r[0], &r[3]);
2842            STORE(&r[0], 0, CHAN_X);
2843         }
2844         if (IS_CHANNEL_ENABLED(*inst, CHAN_Y)) {
2845            micro_div(&r[1], &r[1], &r[3]);
2846            STORE(&r[1], 0, CHAN_Y);
2847         }
2848         if (IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2849            micro_div(&r[2], &r[2], &r[3]);
2850            STORE(&r[2], 0, CHAN_Z);
2851         }
2852      }
2853      if (IS_CHANNEL_ENABLED(*inst, CHAN_W)) {
2854         STORE(&mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W);
2855      }
2856      break;
2857
2858   case TGSI_OPCODE_NRM4:
2859      /* 4-component vector normalize */
2860      {
2861         union tgsi_exec_channel tmp, dot;
2862
2863         /* tmp = dp4(src0, src0): */
2864         FETCH( &r[0], 0, CHAN_X );
2865         micro_mul( &tmp, &r[0], &r[0] );
2866
2867         FETCH( &r[1], 0, CHAN_Y );
2868         micro_mul( &dot, &r[1], &r[1] );
2869         micro_add( &tmp, &tmp, &dot );
2870
2871         FETCH( &r[2], 0, CHAN_Z );
2872         micro_mul( &dot, &r[2], &r[2] );
2873         micro_add( &tmp, &tmp, &dot );
2874
2875         FETCH( &r[3], 0, CHAN_W );
2876         micro_mul( &dot, &r[3], &r[3] );
2877         micro_add( &tmp, &tmp, &dot );
2878
2879         /* tmp = 1 / sqrt(tmp) */
2880         micro_sqrt( &tmp, &tmp );
2881         micro_div( &tmp, &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &tmp );
2882
2883         FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2884            /* chan = chan * tmp */
2885            micro_mul( &r[chan_index], &tmp, &r[chan_index] );
2886            STORE( &r[chan_index], 0, chan_index );
2887         }
2888      }
2889      break;
2890
2891   case TGSI_OPCODE_DIV:
2892      assert( 0 );
2893      break;
2894
2895   case TGSI_OPCODE_DP2:
2896      FETCH( &r[0], 0, CHAN_X );
2897      FETCH( &r[1], 1, CHAN_X );
2898      micro_mul( &r[0], &r[0], &r[1] );
2899
2900      FETCH( &r[1], 0, CHAN_Y );
2901      FETCH( &r[2], 1, CHAN_Y );
2902      micro_mul( &r[1], &r[1], &r[2] );
2903      micro_add( &r[0], &r[0], &r[1] );
2904
2905      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2906         STORE( &r[0], 0, chan_index );
2907      }
2908      break;
2909
2910   case TGSI_OPCODE_IF:
2911      /* push CondMask */
2912      assert(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING);
2913      mach->CondStack[mach->CondStackTop++] = mach->CondMask;
2914      FETCH( &r[0], 0, CHAN_X );
2915      /* update CondMask */
2916      if( ! r[0].u[0] ) {
2917         mach->CondMask &= ~0x1;
2918      }
2919      if( ! r[0].u[1] ) {
2920         mach->CondMask &= ~0x2;
2921      }
2922      if( ! r[0].u[2] ) {
2923         mach->CondMask &= ~0x4;
2924      }
2925      if( ! r[0].u[3] ) {
2926         mach->CondMask &= ~0x8;
2927      }
2928      UPDATE_EXEC_MASK(mach);
2929      /* Todo: If CondMask==0, jump to ELSE */
2930      break;
2931
2932   case TGSI_OPCODE_ELSE:
2933      /* invert CondMask wrt previous mask */
2934      {
2935         uint prevMask;
2936         assert(mach->CondStackTop > 0);
2937         prevMask = mach->CondStack[mach->CondStackTop - 1];
2938         mach->CondMask = ~mach->CondMask & prevMask;
2939         UPDATE_EXEC_MASK(mach);
2940         /* Todo: If CondMask==0, jump to ENDIF */
2941      }
2942      break;
2943
2944   case TGSI_OPCODE_ENDIF:
2945      /* pop CondMask */
2946      assert(mach->CondStackTop > 0);
2947      mach->CondMask = mach->CondStack[--mach->CondStackTop];
2948      UPDATE_EXEC_MASK(mach);
2949      break;
2950
2951   case TGSI_OPCODE_END:
2952      /* halt execution */
2953      *pc = -1;
2954      break;
2955
2956   case TGSI_OPCODE_REP:
2957      assert (0);
2958      break;
2959
2960   case TGSI_OPCODE_ENDREP:
2961       assert (0);
2962       break;
2963
2964   case TGSI_OPCODE_PUSHA:
2965      assert (0);
2966      break;
2967
2968   case TGSI_OPCODE_POPA:
2969      assert (0);
2970      break;
2971
2972   case TGSI_OPCODE_CEIL:
2973      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2974         FETCH( &r[0], 0, chan_index );
2975         micro_ceil(&d[chan_index], &r[0]);
2976      }
2977      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2978         STORE(&d[chan_index], 0, chan_index);
2979      }
2980      break;
2981
2982   case TGSI_OPCODE_I2F:
2983      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2984         FETCH( &r[0], 0, chan_index );
2985         micro_i2f(&d[chan_index], &r[0]);
2986      }
2987      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2988         STORE(&d[chan_index], 0, chan_index);
2989      }
2990      break;
2991
2992   case TGSI_OPCODE_NOT:
2993      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2994         FETCH( &r[0], 0, chan_index );
2995         micro_not(&d[chan_index], &r[0]);
2996      }
2997      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2998         STORE(&d[chan_index], 0, chan_index);
2999      }
3000      break;
3001
3002   case TGSI_OPCODE_TRUNC:
3003      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
3004         FETCH( &r[0], 0, chan_index );
3005         micro_trunc(&d[chan_index], &r[0]);
3006      }
3007      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
3008         STORE(&d[chan_index], 0, chan_index);
3009      }
3010      break;
3011
3012   case TGSI_OPCODE_SHL:
3013      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
3014         FETCH( &r[0], 0, chan_index );
3015         FETCH( &r[1], 1, chan_index );
3016         micro_shl(&d[chan_index], &r[0], &r[1]);
3017      }
3018      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
3019         STORE(&d[chan_index], 0, chan_index);
3020      }
3021      break;
3022
3023   case TGSI_OPCODE_SHR:
3024      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
3025         FETCH( &r[0], 0, chan_index );
3026         FETCH( &r[1], 1, chan_index );
3027         micro_ishr(&d[chan_index], &r[0], &r[1]);
3028      }
3029      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
3030         STORE(&d[chan_index], 0, chan_index);
3031      }
3032      break;
3033
3034   case TGSI_OPCODE_AND:
3035      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
3036         FETCH( &r[0], 0, chan_index );
3037         FETCH( &r[1], 1, chan_index );
3038         micro_and(&d[chan_index], &r[0], &r[1]);
3039      }
3040      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
3041         STORE(&d[chan_index], 0, chan_index);
3042      }
3043      break;
3044
3045   case TGSI_OPCODE_OR:
3046      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
3047         FETCH( &r[0], 0, chan_index );
3048         FETCH( &r[1], 1, chan_index );
3049         micro_or(&d[chan_index], &r[0], &r[1]);
3050      }
3051      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
3052         STORE(&d[chan_index], 0, chan_index);
3053      }
3054      break;
3055
3056   case TGSI_OPCODE_MOD:
3057      assert (0);
3058      break;
3059
3060   case TGSI_OPCODE_XOR:
3061      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
3062         FETCH( &r[0], 0, chan_index );
3063         FETCH( &r[1], 1, chan_index );
3064         micro_xor(&d[chan_index], &r[0], &r[1]);
3065      }
3066      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
3067         STORE(&d[chan_index], 0, chan_index);
3068      }
3069      break;
3070
3071   case TGSI_OPCODE_SAD:
3072      assert (0);
3073      break;
3074
3075   case TGSI_OPCODE_TXF:
3076      assert (0);
3077      break;
3078
3079   case TGSI_OPCODE_TXQ:
3080      assert (0);
3081      break;
3082
3083   case TGSI_OPCODE_EMIT:
3084      mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] += 16;
3085      mach->Primitives[mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]]++;
3086      break;
3087
3088   case TGSI_OPCODE_ENDPRIM:
3089      mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]++;
3090      mach->Primitives[mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]] = 0;
3091      break;
3092
3093   case TGSI_OPCODE_BGNFOR:
3094      assert(mach->LoopCounterStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
3095      for (chan_index = 0; chan_index < 3; chan_index++) {
3096         FETCH( &mach->LoopCounterStack[mach->LoopCounterStackTop].xyzw[chan_index], 0, chan_index );
3097      }
3098      ++mach->LoopCounterStackTop;
3099      STORE(&mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_X], 0, CHAN_X);
3100      /* update LoopMask */
3101      if (mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y].f[0] <= 0.0f) {
3102         mach->LoopMask &= ~0x1;
3103      }
3104      if (mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y].f[1] <= 0.0f) {
3105         mach->LoopMask &= ~0x2;
3106      }
3107      if (mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y].f[2] <= 0.0f) {
3108         mach->LoopMask &= ~0x4;
3109      }
3110      if (mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y].f[3] <= 0.0f) {
3111         mach->LoopMask &= ~0x8;
3112      }
3113      /* TODO: if mach->LoopMask == 0, jump to end of loop */
3114      UPDATE_EXEC_MASK(mach);
3115      /* fall-through (for now) */
3116   case TGSI_OPCODE_BGNLOOP:
3117      /* push LoopMask and ContMasks */
3118      assert(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
3119      mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask;
3120      assert(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
3121      mach->ContStack[mach->ContStackTop++] = mach->ContMask;
3122      assert(mach->LoopLabelStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
3123      mach->LoopLabelStack[mach->LoopLabelStackTop++] = *pc - 1;
3124      break;
3125
3126   case TGSI_OPCODE_ENDFOR:
3127      assert(mach->LoopCounterStackTop > 0);
3128      micro_sub(&mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y],
3129                &mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y],
3130                &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C]);
3131      /* update LoopMask */
3132      if (mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y].f[0] <= 0.0f) {
3133         mach->LoopMask &= ~0x1;
3134      }
3135      if (mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y].f[1] <= 0.0f) {
3136         mach->LoopMask &= ~0x2;
3137      }
3138      if (mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y].f[2] <= 0.0f) {
3139         mach->LoopMask &= ~0x4;
3140      }
3141      if (mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y].f[3] <= 0.0f) {
3142         mach->LoopMask &= ~0x8;
3143      }
3144      micro_add(&mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_X],
3145                &mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_X],
3146                &mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Z]);
3147      assert(mach->LoopLabelStackTop > 0);
3148      inst = mach->Instructions + mach->LoopLabelStack[mach->LoopLabelStackTop - 1];
3149      STORE(&mach->LoopCounterStack[mach->LoopCounterStackTop].xyzw[CHAN_X], 0, CHAN_X);
3150      /* Restore ContMask, but don't pop */
3151      assert(mach->ContStackTop > 0);
3152      mach->ContMask = mach->ContStack[mach->ContStackTop - 1];
3153      UPDATE_EXEC_MASK(mach);
3154      if (mach->ExecMask) {
3155         /* repeat loop: jump to instruction just past BGNLOOP */
3156         assert(mach->LoopLabelStackTop > 0);
3157         *pc = mach->LoopLabelStack[mach->LoopLabelStackTop - 1] + 1;
3158      }
3159      else {
3160         /* exit loop: pop LoopMask */
3161         assert(mach->LoopStackTop > 0);
3162         mach->LoopMask = mach->LoopStack[--mach->LoopStackTop];
3163         /* pop ContMask */
3164         assert(mach->ContStackTop > 0);
3165         mach->ContMask = mach->ContStack[--mach->ContStackTop];
3166         assert(mach->LoopLabelStackTop > 0);
3167         --mach->LoopLabelStackTop;
3168         assert(mach->LoopCounterStackTop > 0);
3169         --mach->LoopCounterStackTop;
3170      }
3171      UPDATE_EXEC_MASK(mach);
3172      break;
3173
3174   case TGSI_OPCODE_ENDLOOP:
3175      /* Restore ContMask, but don't pop */
3176      assert(mach->ContStackTop > 0);
3177      mach->ContMask = mach->ContStack[mach->ContStackTop - 1];
3178      UPDATE_EXEC_MASK(mach);
3179      if (mach->ExecMask) {
3180         /* repeat loop: jump to instruction just past BGNLOOP */
3181         assert(mach->LoopLabelStackTop > 0);
3182         *pc = mach->LoopLabelStack[mach->LoopLabelStackTop - 1] + 1;
3183      }
3184      else {
3185         /* exit loop: pop LoopMask */
3186         assert(mach->LoopStackTop > 0);
3187         mach->LoopMask = mach->LoopStack[--mach->LoopStackTop];
3188         /* pop ContMask */
3189         assert(mach->ContStackTop > 0);
3190         mach->ContMask = mach->ContStack[--mach->ContStackTop];
3191         assert(mach->LoopLabelStackTop > 0);
3192         --mach->LoopLabelStackTop;
3193      }
3194      UPDATE_EXEC_MASK(mach);
3195      break;
3196
3197   case TGSI_OPCODE_BRK:
3198      /* turn off loop channels for each enabled exec channel */
3199      mach->LoopMask &= ~mach->ExecMask;
3200      /* Todo: if mach->LoopMask == 0, jump to end of loop */
3201      UPDATE_EXEC_MASK(mach);
3202      break;
3203
3204   case TGSI_OPCODE_CONT:
3205      /* turn off cont channels for each enabled exec channel */
3206      mach->ContMask &= ~mach->ExecMask;
3207      /* Todo: if mach->LoopMask == 0, jump to end of loop */
3208      UPDATE_EXEC_MASK(mach);
3209      break;
3210
3211   case TGSI_OPCODE_BGNSUB:
3212      /* no-op */
3213      break;
3214
3215   case TGSI_OPCODE_ENDSUB:
3216      /* no-op */
3217      break;
3218
3219   case TGSI_OPCODE_NOP:
3220      break;
3221
3222   default:
3223      assert( 0 );
3224   }
3225}
3226
3227#define DEBUG_EXECUTION 0
3228
3229
3230/**
3231 * Run TGSI interpreter.
3232 * \return bitmask of "alive" quad components
3233 */
3234uint
3235tgsi_exec_machine_run( struct tgsi_exec_machine *mach )
3236{
3237   uint i;
3238   int pc = 0;
3239
3240   mach->CondMask = 0xf;
3241   mach->LoopMask = 0xf;
3242   mach->ContMask = 0xf;
3243   mach->FuncMask = 0xf;
3244   mach->ExecMask = 0xf;
3245
3246   assert(mach->CondStackTop == 0);
3247   assert(mach->LoopStackTop == 0);
3248   assert(mach->ContStackTop == 0);
3249   assert(mach->CallStackTop == 0);
3250
3251   mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] = 0;
3252   mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] = 0;
3253
3254   if( mach->Processor == TGSI_PROCESSOR_GEOMETRY ) {
3255      mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0] = 0;
3256      mach->Primitives[0] = 0;
3257   }
3258
3259   for (i = 0; i < QUAD_SIZE; i++) {
3260      mach->Temps[TEMP_CC_I].xyzw[TEMP_CC_C].u[i] =
3261         (TGSI_EXEC_CC_EQ << TGSI_EXEC_CC_X_SHIFT) |
3262         (TGSI_EXEC_CC_EQ << TGSI_EXEC_CC_Y_SHIFT) |
3263         (TGSI_EXEC_CC_EQ << TGSI_EXEC_CC_Z_SHIFT) |
3264         (TGSI_EXEC_CC_EQ << TGSI_EXEC_CC_W_SHIFT);
3265   }
3266
3267   /* execute declarations (interpolants) */
3268   for (i = 0; i < mach->NumDeclarations; i++) {
3269      exec_declaration( mach, mach->Declarations+i );
3270   }
3271
3272   {
3273#if DEBUG_EXECUTION
3274      struct tgsi_exec_vector temps[TGSI_EXEC_NUM_TEMPS + TGSI_EXEC_NUM_TEMP_EXTRAS];
3275      struct tgsi_exec_vector outputs[PIPE_MAX_ATTRIBS];
3276      uint inst = 1;
3277
3278      memcpy(temps, mach->Temps, sizeof(temps));
3279      memcpy(outputs, mach->Outputs, sizeof(outputs));
3280#endif
3281
3282      /* execute instructions, until pc is set to -1 */
3283      while (pc != -1) {
3284
3285#if DEBUG_EXECUTION
3286         uint i;
3287
3288         tgsi_dump_instruction(&mach->Instructions[pc], inst++);
3289#endif
3290
3291         assert(pc < (int) mach->NumInstructions);
3292         exec_instruction(mach, mach->Instructions + pc, &pc);
3293
3294#if DEBUG_EXECUTION
3295         for (i = 0; i < TGSI_EXEC_NUM_TEMPS + TGSI_EXEC_NUM_TEMP_EXTRAS; i++) {
3296            if (memcmp(&temps[i], &mach->Temps[i], sizeof(temps[i]))) {
3297               uint j;
3298
3299               memcpy(&temps[i], &mach->Temps[i], sizeof(temps[i]));
3300               debug_printf("TEMP[%2u] = ", i);
3301               for (j = 0; j < 4; j++) {
3302                  if (j > 0) {
3303                     debug_printf("           ");
3304                  }
3305                  debug_printf("(%6f, %6f, %6f, %6f)\n",
3306                               temps[i].xyzw[0].f[j],
3307                               temps[i].xyzw[1].f[j],
3308                               temps[i].xyzw[2].f[j],
3309                               temps[i].xyzw[3].f[j]);
3310               }
3311            }
3312         }
3313         for (i = 0; i < PIPE_MAX_ATTRIBS; i++) {
3314            if (memcmp(&outputs[i], &mach->Outputs[i], sizeof(outputs[i]))) {
3315               uint j;
3316
3317               memcpy(&outputs[i], &mach->Outputs[i], sizeof(outputs[i]));
3318               debug_printf("OUT[%2u] =  ", i);
3319               for (j = 0; j < 4; j++) {
3320                  if (j > 0) {
3321                     debug_printf("           ");
3322                  }
3323                  debug_printf("{%6f, %6f, %6f, %6f}\n",
3324                               outputs[i].xyzw[0].f[j],
3325                               outputs[i].xyzw[1].f[j],
3326                               outputs[i].xyzw[2].f[j],
3327                               outputs[i].xyzw[3].f[j]);
3328               }
3329            }
3330         }
3331#endif
3332      }
3333   }
3334
3335#if 0
3336   /* we scale from floats in [0,1] to Zbuffer ints in sp_quad_depth_test.c */
3337   if (mach->Processor == TGSI_PROCESSOR_FRAGMENT) {
3338      /*
3339       * Scale back depth component.
3340       */
3341      for (i = 0; i < 4; i++)
3342         mach->Outputs[0].xyzw[2].f[i] *= ctx->DrawBuffer->_DepthMaxF;
3343   }
3344#endif
3345
3346   return ~mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
3347}
3348