tgsi_exec.c revision 884007546c98b1779bf266ec5111b1e7e2b68b2e
1/**************************************************************************
2 *
3 * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28/**
29 * TGSI interpreter/executor.
30 *
31 * Flow control information:
32 *
33 * Since we operate on 'quads' (4 pixels or 4 vertices in parallel)
34 * flow control statements (IF/ELSE/ENDIF, LOOP/ENDLOOP) require special
35 * care since a condition may be true for some quad components but false
36 * for other components.
37 *
38 * We basically execute all statements (even if they're in the part of
39 * an IF/ELSE clause that's "not taken") and use a special mask to
40 * control writing to destination registers.  This is the ExecMask.
41 * See store_dest().
42 *
 * The ExecMask is computed from four other masks (CondMask, LoopMask,
 * ContMask and FuncMask; see UPDATE_EXEC_MASK) which are controlled by the
 * flow control instructions (for example IF/ELSE/ENDIF, LOOP/ENDLOOP and CONT).
46 *
47 *
48 * Authors:
49 *   Michal Krol
50 *   Brian Paul
51 */
52
53#include "pipe/p_compiler.h"
54#include "pipe/p_state.h"
55#include "pipe/p_shader_tokens.h"
56#include "tgsi/tgsi_dump.h"
57#include "tgsi/tgsi_parse.h"
58#include "tgsi/tgsi_util.h"
59#include "tgsi_exec.h"
60#include "util/u_memory.h"
61#include "util/u_math.h"
62
63#define FAST_MATH 1
64
65#define TILE_TOP_LEFT     0
66#define TILE_TOP_RIGHT    1
67#define TILE_BOTTOM_LEFT  2
68#define TILE_BOTTOM_RIGHT 3
69
70#define CHAN_X  0
71#define CHAN_Y  1
72#define CHAN_Z  2
73#define CHAN_W  3
74
75/*
76 * Shorthand locations of various utility registers (_I = Index, _C = Channel)
77 */
78#define TEMP_0_I           TGSI_EXEC_TEMP_00000000_I
79#define TEMP_0_C           TGSI_EXEC_TEMP_00000000_C
80#define TEMP_7F_I          TGSI_EXEC_TEMP_7FFFFFFF_I
81#define TEMP_7F_C          TGSI_EXEC_TEMP_7FFFFFFF_C
82#define TEMP_80_I          TGSI_EXEC_TEMP_80000000_I
83#define TEMP_80_C          TGSI_EXEC_TEMP_80000000_C
84#define TEMP_FF_I          TGSI_EXEC_TEMP_FFFFFFFF_I
85#define TEMP_FF_C          TGSI_EXEC_TEMP_FFFFFFFF_C
86#define TEMP_1_I           TGSI_EXEC_TEMP_ONE_I
87#define TEMP_1_C           TGSI_EXEC_TEMP_ONE_C
88#define TEMP_2_I           TGSI_EXEC_TEMP_TWO_I
89#define TEMP_2_C           TGSI_EXEC_TEMP_TWO_C
90#define TEMP_128_I         TGSI_EXEC_TEMP_128_I
91#define TEMP_128_C         TGSI_EXEC_TEMP_128_C
92#define TEMP_M128_I        TGSI_EXEC_TEMP_MINUS_128_I
93#define TEMP_M128_C        TGSI_EXEC_TEMP_MINUS_128_C
94#define TEMP_KILMASK_I     TGSI_EXEC_TEMP_KILMASK_I
95#define TEMP_KILMASK_C     TGSI_EXEC_TEMP_KILMASK_C
96#define TEMP_OUTPUT_I      TGSI_EXEC_TEMP_OUTPUT_I
97#define TEMP_OUTPUT_C      TGSI_EXEC_TEMP_OUTPUT_C
98#define TEMP_PRIMITIVE_I   TGSI_EXEC_TEMP_PRIMITIVE_I
99#define TEMP_PRIMITIVE_C   TGSI_EXEC_TEMP_PRIMITIVE_C
100#define TEMP_CC_I          TGSI_EXEC_TEMP_CC_I
101#define TEMP_CC_C          TGSI_EXEC_TEMP_CC_C
102#define TEMP_3_I           TGSI_EXEC_TEMP_THREE_I
103#define TEMP_3_C           TGSI_EXEC_TEMP_THREE_C
104#define TEMP_HALF_I        TGSI_EXEC_TEMP_HALF_I
105#define TEMP_HALF_C        TGSI_EXEC_TEMP_HALF_C
106#define TEMP_R0            TGSI_EXEC_TEMP_R0
107#define TEMP_P0            TGSI_EXEC_TEMP_P0
108
/** Non-zero if channel CHAN is set in the write mask of INST's first dest register */
#define IS_CHANNEL_ENABLED(INST, CHAN)\
   ((INST).FullDstRegisters[0].DstRegister.WriteMask & (1 << (CHAN)))

/** Same test as IS_CHANNEL_ENABLED, but for INST's second dest register */
#define IS_CHANNEL_ENABLED2(INST, CHAN)\
   ((INST).FullDstRegisters[1].DstRegister.WriteMask & (1 << (CHAN)))

/**
 * Loop header: runs the following statement once for every channel that is
 * enabled in the first dest register's write mask.
 * Note that this expands to "for (...) if (...)", so the body should be a
 * braced block to avoid surprises with a trailing "else".
 */
#define FOR_EACH_ENABLED_CHANNEL(INST, CHAN)\
   for (CHAN = 0; CHAN < NUM_CHANNELS; CHAN++)\
      if (IS_CHANNEL_ENABLED( INST, CHAN ))

/** Like FOR_EACH_ENABLED_CHANNEL, but for the second dest register */
#define FOR_EACH_ENABLED_CHANNEL2(INST, CHAN)\
   for (CHAN = 0; CHAN < NUM_CHANNELS; CHAN++)\
      if (IS_CHANNEL_ENABLED2( INST, CHAN ))


/**
 * The execution mask is the intersection of the conditional, loop,
 * continue and function masks.
 * MACH is parenthesized so that any pointer expression may be passed.
 */
#define UPDATE_EXEC_MASK(MACH) \
      (MACH)->ExecMask = (MACH)->CondMask & (MACH)->LoopMask & \
                         (MACH)->ContMask & (MACH)->FuncMask
127
128
129static const union tgsi_exec_channel ZeroVec =
130   { { 0.0, 0.0, 0.0, 0.0 } };
131
132
#ifdef DEBUG
/**
 * Debug helper: assert that none of the four float lanes of \p chan is
 * an infinity or a NaN.
 */
static void
check_inf_or_nan(const union tgsi_exec_channel *chan)
{
   unsigned lane;

   for (lane = 0; lane < 4; lane++) {
      assert(!util_is_inf_or_nan(chan->f[lane]));
   }
}
#endif
143
144
#ifdef DEBUG
/**
 * Debug helper: print the four float lanes of \p chan, prefixed by \p msg.
 */
static void
print_chan(const char *msg, const union tgsi_exec_channel *chan)
{
   const float *lanes = chan->f;

   debug_printf("%s = {%f, %f, %f, %f}\n",
                msg, lanes[0], lanes[1], lanes[2], lanes[3]);
}
#endif
153
154
#ifdef DEBUG
/**
 * Debug helper: dump temporary register \p index of \p mach, one line per
 * X/Y/Z/W channel with the four float lanes of that channel.
 */
static void
print_temp(const struct tgsi_exec_machine *mach, uint index)
{
   static const char chanNames[] = "XYZW";
   const struct tgsi_exec_vector *vec = mach->Temps + index;
   int c;

   debug_printf("Temp[%u] =\n", index);
   for (c = 0; c < 4; c++) {
      debug_printf("  %c: { %f, %f, %f, %f }\n",
                   chanNames[c],
                   vec->xyzw[c].f[0],
                   vec->xyzw[c].f[1],
                   vec->xyzw[c].f[2],
                   vec->xyzw[c].f[3]);
   }
}
#endif
172
173
174/**
175 * Check if there's a potential src/dst register data dependency when
176 * using SOA execution.
177 * Example:
178 *   MOV T, T.yxwz;
179 * This would expand into:
180 *   MOV t0, t1;
181 *   MOV t1, t0;
182 *   MOV t2, t3;
183 *   MOV t3, t2;
184 * The second instruction will have the wrong value for t0 if executed as-is.
185 */
186boolean
187tgsi_check_soa_dependencies(const struct tgsi_full_instruction *inst)
188{
189   uint i, chan;
190
191   uint writemask = inst->FullDstRegisters[0].DstRegister.WriteMask;
192   if (writemask == TGSI_WRITEMASK_X ||
193       writemask == TGSI_WRITEMASK_Y ||
194       writemask == TGSI_WRITEMASK_Z ||
195       writemask == TGSI_WRITEMASK_W ||
196       writemask == TGSI_WRITEMASK_NONE) {
197      /* no chance of data dependency */
198      return FALSE;
199   }
200
201   /* loop over src regs */
202   for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
203      if ((inst->FullSrcRegisters[i].SrcRegister.File ==
204           inst->FullDstRegisters[0].DstRegister.File) &&
205          (inst->FullSrcRegisters[i].SrcRegister.Index ==
206           inst->FullDstRegisters[0].DstRegister.Index)) {
207         /* loop over dest channels */
208         uint channelsWritten = 0x0;
209         FOR_EACH_ENABLED_CHANNEL(*inst, chan) {
210            /* check if we're reading a channel that's been written */
211            uint swizzle = tgsi_util_get_full_src_register_swizzle(&inst->FullSrcRegisters[i], chan);
212            if (channelsWritten & (1 << swizzle)) {
213               return TRUE;
214            }
215
216            channelsWritten |= (1 << chan);
217         }
218      }
219   }
220   return FALSE;
221}
222
223
224/**
225 * Initialize machine state by expanding tokens to full instructions,
226 * allocating temporary storage, setting up constants, etc.
227 * After this, we can call tgsi_exec_machine_run() many times.
228 */
229void
230tgsi_exec_machine_bind_shader(
231   struct tgsi_exec_machine *mach,
232   const struct tgsi_token *tokens,
233   uint numSamplers,
234   struct tgsi_sampler **samplers)
235{
236   uint k;
237   struct tgsi_parse_context parse;
238   struct tgsi_exec_labels *labels = &mach->Labels;
239   struct tgsi_full_instruction *instructions;
240   struct tgsi_full_declaration *declarations;
241   uint maxInstructions = 10, numInstructions = 0;
242   uint maxDeclarations = 10, numDeclarations = 0;
243   uint instno = 0;
244
245#if 0
246   tgsi_dump(tokens, 0);
247#endif
248
249   util_init_math();
250
251   mach->Tokens = tokens;
252   mach->Samplers = samplers;
253
254   k = tgsi_parse_init (&parse, mach->Tokens);
255   if (k != TGSI_PARSE_OK) {
256      debug_printf( "Problem parsing!\n" );
257      return;
258   }
259
260   mach->Processor = parse.FullHeader.Processor.Processor;
261   mach->ImmLimit = 0;
262   labels->count = 0;
263
264   declarations = (struct tgsi_full_declaration *)
265      MALLOC( maxDeclarations * sizeof(struct tgsi_full_declaration) );
266
267   if (!declarations) {
268      return;
269   }
270
271   instructions = (struct tgsi_full_instruction *)
272      MALLOC( maxInstructions * sizeof(struct tgsi_full_instruction) );
273
274   if (!instructions) {
275      FREE( declarations );
276      return;
277   }
278
279   while( !tgsi_parse_end_of_tokens( &parse ) ) {
280      uint pointer = parse.Position;
281      uint i;
282
283      tgsi_parse_token( &parse );
284      switch( parse.FullToken.Token.Type ) {
285      case TGSI_TOKEN_TYPE_DECLARATION:
286         /* save expanded declaration */
287         if (numDeclarations == maxDeclarations) {
288            declarations = REALLOC(declarations,
289                                   maxDeclarations
290                                   * sizeof(struct tgsi_full_declaration),
291                                   (maxDeclarations + 10)
292                                   * sizeof(struct tgsi_full_declaration));
293            maxDeclarations += 10;
294         }
295         memcpy(declarations + numDeclarations,
296                &parse.FullToken.FullDeclaration,
297                sizeof(declarations[0]));
298         numDeclarations++;
299         break;
300
301      case TGSI_TOKEN_TYPE_IMMEDIATE:
302         {
303            uint size = parse.FullToken.FullImmediate.Immediate.NrTokens - 1;
304            assert( size <= 4 );
305            assert( mach->ImmLimit + 1 <= TGSI_EXEC_NUM_IMMEDIATES );
306
307            for( i = 0; i < size; i++ ) {
308               mach->Imms[mach->ImmLimit][i] =
309		  parse.FullToken.FullImmediate.u[i].Float;
310            }
311            mach->ImmLimit += 1;
312         }
313         break;
314
315      case TGSI_TOKEN_TYPE_INSTRUCTION:
316         assert( labels->count < MAX_LABELS );
317
318         labels->labels[labels->count][0] = instno;
319         labels->labels[labels->count][1] = pointer;
320         labels->count++;
321
322         /* save expanded instruction */
323         if (numInstructions == maxInstructions) {
324            instructions = REALLOC(instructions,
325                                   maxInstructions
326                                   * sizeof(struct tgsi_full_instruction),
327                                   (maxInstructions + 10)
328                                   * sizeof(struct tgsi_full_instruction));
329            maxInstructions += 10;
330         }
331
332         memcpy(instructions + numInstructions,
333                &parse.FullToken.FullInstruction,
334                sizeof(instructions[0]));
335
336         numInstructions++;
337         break;
338
339      default:
340         assert( 0 );
341      }
342   }
343   tgsi_parse_free (&parse);
344
345   if (mach->Declarations) {
346      FREE( mach->Declarations );
347   }
348   mach->Declarations = declarations;
349   mach->NumDeclarations = numDeclarations;
350
351   if (mach->Instructions) {
352      FREE( mach->Instructions );
353   }
354   mach->Instructions = instructions;
355   mach->NumInstructions = numInstructions;
356}
357
358
359struct tgsi_exec_machine *
360tgsi_exec_machine_create( void )
361{
362   struct tgsi_exec_machine *mach;
363   uint i;
364
365   mach = align_malloc( sizeof *mach, 16 );
366   if (!mach)
367      goto fail;
368
369   memset(mach, 0, sizeof(*mach));
370
371   mach->Addrs = &mach->Temps[TGSI_EXEC_TEMP_ADDR];
372   mach->Predicates = &mach->Temps[TGSI_EXEC_TEMP_P0];
373
374   /* Setup constants. */
375   for( i = 0; i < 4; i++ ) {
376      mach->Temps[TEMP_0_I].xyzw[TEMP_0_C].u[i] = 0x00000000;
377      mach->Temps[TEMP_7F_I].xyzw[TEMP_7F_C].u[i] = 0x7FFFFFFF;
378      mach->Temps[TEMP_80_I].xyzw[TEMP_80_C].u[i] = 0x80000000;
379      mach->Temps[TEMP_FF_I].xyzw[TEMP_FF_C].u[i] = 0xFFFFFFFF;
380      mach->Temps[TEMP_1_I].xyzw[TEMP_1_C].f[i] = 1.0f;
381      mach->Temps[TEMP_2_I].xyzw[TEMP_2_C].f[i] = 2.0f;
382      mach->Temps[TEMP_128_I].xyzw[TEMP_128_C].f[i] = 128.0f;
383      mach->Temps[TEMP_M128_I].xyzw[TEMP_M128_C].f[i] = -128.0f;
384      mach->Temps[TEMP_3_I].xyzw[TEMP_3_C].f[i] = 3.0f;
385      mach->Temps[TEMP_HALF_I].xyzw[TEMP_HALF_C].f[i] = 0.5f;
386   }
387
388#ifdef DEBUG
389   /* silence warnings */
390   (void) print_chan;
391   (void) print_temp;
392#endif
393
394   return mach;
395
396fail:
397   align_free(mach);
398   return NULL;
399}
400
401
402void
403tgsi_exec_machine_destroy(struct tgsi_exec_machine *mach)
404{
405   if (mach) {
406      FREE(mach->Instructions);
407      FREE(mach->Declarations);
408   }
409
410   align_free(mach);
411}
412
413
414static void
415micro_abs(
416   union tgsi_exec_channel *dst,
417   const union tgsi_exec_channel *src )
418{
419   dst->f[0] = fabsf( src->f[0] );
420   dst->f[1] = fabsf( src->f[1] );
421   dst->f[2] = fabsf( src->f[2] );
422   dst->f[3] = fabsf( src->f[3] );
423}
424
425static void
426micro_add(
427   union tgsi_exec_channel *dst,
428   const union tgsi_exec_channel *src0,
429   const union tgsi_exec_channel *src1 )
430{
431   dst->f[0] = src0->f[0] + src1->f[0];
432   dst->f[1] = src0->f[1] + src1->f[1];
433   dst->f[2] = src0->f[2] + src1->f[2];
434   dst->f[3] = src0->f[3] + src1->f[3];
435}
436
437#if 0
438static void
439micro_iadd(
440   union tgsi_exec_channel *dst,
441   const union tgsi_exec_channel *src0,
442   const union tgsi_exec_channel *src1 )
443{
444   dst->i[0] = src0->i[0] + src1->i[0];
445   dst->i[1] = src0->i[1] + src1->i[1];
446   dst->i[2] = src0->i[2] + src1->i[2];
447   dst->i[3] = src0->i[3] + src1->i[3];
448}
449#endif
450
451static void
452micro_and(
453   union tgsi_exec_channel *dst,
454   const union tgsi_exec_channel *src0,
455   const union tgsi_exec_channel *src1 )
456{
457   dst->u[0] = src0->u[0] & src1->u[0];
458   dst->u[1] = src0->u[1] & src1->u[1];
459   dst->u[2] = src0->u[2] & src1->u[2];
460   dst->u[3] = src0->u[3] & src1->u[3];
461}
462
463static void
464micro_ceil(
465   union tgsi_exec_channel *dst,
466   const union tgsi_exec_channel *src )
467{
468   dst->f[0] = ceilf( src->f[0] );
469   dst->f[1] = ceilf( src->f[1] );
470   dst->f[2] = ceilf( src->f[2] );
471   dst->f[3] = ceilf( src->f[3] );
472}
473
474static void
475micro_cos(
476   union tgsi_exec_channel *dst,
477   const union tgsi_exec_channel *src )
478{
479   dst->f[0] = cosf( src->f[0] );
480   dst->f[1] = cosf( src->f[1] );
481   dst->f[2] = cosf( src->f[2] );
482   dst->f[3] = cosf( src->f[3] );
483}
484
485static void
486micro_ddx(
487   union tgsi_exec_channel *dst,
488   const union tgsi_exec_channel *src )
489{
490   dst->f[0] =
491   dst->f[1] =
492   dst->f[2] =
493   dst->f[3] = src->f[TILE_BOTTOM_RIGHT] - src->f[TILE_BOTTOM_LEFT];
494}
495
496static void
497micro_ddy(
498   union tgsi_exec_channel *dst,
499   const union tgsi_exec_channel *src )
500{
501   dst->f[0] =
502   dst->f[1] =
503   dst->f[2] =
504   dst->f[3] = src->f[TILE_BOTTOM_LEFT] - src->f[TILE_TOP_LEFT];
505}
506
507static void
508micro_div(
509   union tgsi_exec_channel *dst,
510   const union tgsi_exec_channel *src0,
511   const union tgsi_exec_channel *src1 )
512{
513   if (src1->f[0] != 0) {
514      dst->f[0] = src0->f[0] / src1->f[0];
515   }
516   if (src1->f[1] != 0) {
517      dst->f[1] = src0->f[1] / src1->f[1];
518   }
519   if (src1->f[2] != 0) {
520      dst->f[2] = src0->f[2] / src1->f[2];
521   }
522   if (src1->f[3] != 0) {
523      dst->f[3] = src0->f[3] / src1->f[3];
524   }
525}
526
527#if 0
528static void
529micro_udiv(
530   union tgsi_exec_channel *dst,
531   const union tgsi_exec_channel *src0,
532   const union tgsi_exec_channel *src1 )
533{
534   dst->u[0] = src0->u[0] / src1->u[0];
535   dst->u[1] = src0->u[1] / src1->u[1];
536   dst->u[2] = src0->u[2] / src1->u[2];
537   dst->u[3] = src0->u[3] / src1->u[3];
538}
539#endif
540
541static void
542micro_eq(
543   union tgsi_exec_channel *dst,
544   const union tgsi_exec_channel *src0,
545   const union tgsi_exec_channel *src1,
546   const union tgsi_exec_channel *src2,
547   const union tgsi_exec_channel *src3 )
548{
549   dst->f[0] = src0->f[0] == src1->f[0] ? src2->f[0] : src3->f[0];
550   dst->f[1] = src0->f[1] == src1->f[1] ? src2->f[1] : src3->f[1];
551   dst->f[2] = src0->f[2] == src1->f[2] ? src2->f[2] : src3->f[2];
552   dst->f[3] = src0->f[3] == src1->f[3] ? src2->f[3] : src3->f[3];
553}
554
555#if 0
556static void
557micro_ieq(
558   union tgsi_exec_channel *dst,
559   const union tgsi_exec_channel *src0,
560   const union tgsi_exec_channel *src1,
561   const union tgsi_exec_channel *src2,
562   const union tgsi_exec_channel *src3 )
563{
564   dst->i[0] = src0->i[0] == src1->i[0] ? src2->i[0] : src3->i[0];
565   dst->i[1] = src0->i[1] == src1->i[1] ? src2->i[1] : src3->i[1];
566   dst->i[2] = src0->i[2] == src1->i[2] ? src2->i[2] : src3->i[2];
567   dst->i[3] = src0->i[3] == src1->i[3] ? src2->i[3] : src3->i[3];
568}
569#endif
570
571static void
572micro_exp2(
573   union tgsi_exec_channel *dst,
574   const union tgsi_exec_channel *src)
575{
576#if FAST_MATH
577   dst->f[0] = util_fast_exp2( src->f[0] );
578   dst->f[1] = util_fast_exp2( src->f[1] );
579   dst->f[2] = util_fast_exp2( src->f[2] );
580   dst->f[3] = util_fast_exp2( src->f[3] );
581#else
582
583#if DEBUG
584   /* Inf is okay for this instruction, so clamp it to silence assertions. */
585   uint i;
586   union tgsi_exec_channel clamped;
587
588   for (i = 0; i < 4; i++) {
589      if (src->f[i] > 127.99999f) {
590         clamped.f[i] = 127.99999f;
591      } else if (src->f[i] < -126.99999f) {
592         clamped.f[i] = -126.99999f;
593      } else {
594         clamped.f[i] = src->f[i];
595      }
596   }
597   src = &clamped;
598#endif
599
600   dst->f[0] = powf( 2.0f, src->f[0] );
601   dst->f[1] = powf( 2.0f, src->f[1] );
602   dst->f[2] = powf( 2.0f, src->f[2] );
603   dst->f[3] = powf( 2.0f, src->f[3] );
604#endif
605}
606
607#if 0
608static void
609micro_f2ut(
610   union tgsi_exec_channel *dst,
611   const union tgsi_exec_channel *src )
612{
613   dst->u[0] = (uint) src->f[0];
614   dst->u[1] = (uint) src->f[1];
615   dst->u[2] = (uint) src->f[2];
616   dst->u[3] = (uint) src->f[3];
617}
618#endif
619
620static void
621micro_float_clamp(union tgsi_exec_channel *dst,
622                  const union tgsi_exec_channel *src)
623{
624   uint i;
625
626   for (i = 0; i < 4; i++) {
627      if (src->f[i] > 0.0f) {
628         if (src->f[i] > 1.884467e+019f)
629            dst->f[i] = 1.884467e+019f;
630         else if (src->f[i] < 5.42101e-020f)
631            dst->f[i] = 5.42101e-020f;
632         else
633            dst->f[i] = src->f[i];
634      }
635      else {
636         if (src->f[i] < -1.884467e+019f)
637            dst->f[i] = -1.884467e+019f;
638         else if (src->f[i] > -5.42101e-020f)
639            dst->f[i] = -5.42101e-020f;
640         else
641            dst->f[i] = src->f[i];
642      }
643   }
644}
645
646static void
647micro_flr(
648   union tgsi_exec_channel *dst,
649   const union tgsi_exec_channel *src )
650{
651   dst->f[0] = floorf( src->f[0] );
652   dst->f[1] = floorf( src->f[1] );
653   dst->f[2] = floorf( src->f[2] );
654   dst->f[3] = floorf( src->f[3] );
655}
656
657static void
658micro_frc(
659   union tgsi_exec_channel *dst,
660   const union tgsi_exec_channel *src )
661{
662   dst->f[0] = src->f[0] - floorf( src->f[0] );
663   dst->f[1] = src->f[1] - floorf( src->f[1] );
664   dst->f[2] = src->f[2] - floorf( src->f[2] );
665   dst->f[3] = src->f[3] - floorf( src->f[3] );
666}
667
668static void
669micro_i2f(
670   union tgsi_exec_channel *dst,
671   const union tgsi_exec_channel *src )
672{
673   dst->f[0] = (float) src->i[0];
674   dst->f[1] = (float) src->i[1];
675   dst->f[2] = (float) src->i[2];
676   dst->f[3] = (float) src->i[3];
677}
678
679static void
680micro_lg2(
681   union tgsi_exec_channel *dst,
682   const union tgsi_exec_channel *src )
683{
684#if FAST_MATH
685   dst->f[0] = util_fast_log2( src->f[0] );
686   dst->f[1] = util_fast_log2( src->f[1] );
687   dst->f[2] = util_fast_log2( src->f[2] );
688   dst->f[3] = util_fast_log2( src->f[3] );
689#else
690   dst->f[0] = logf( src->f[0] ) * 1.442695f;
691   dst->f[1] = logf( src->f[1] ) * 1.442695f;
692   dst->f[2] = logf( src->f[2] ) * 1.442695f;
693   dst->f[3] = logf( src->f[3] ) * 1.442695f;
694#endif
695}
696
697static void
698micro_le(
699   union tgsi_exec_channel *dst,
700   const union tgsi_exec_channel *src0,
701   const union tgsi_exec_channel *src1,
702   const union tgsi_exec_channel *src2,
703   const union tgsi_exec_channel *src3 )
704{
705   dst->f[0] = src0->f[0] <= src1->f[0] ? src2->f[0] : src3->f[0];
706   dst->f[1] = src0->f[1] <= src1->f[1] ? src2->f[1] : src3->f[1];
707   dst->f[2] = src0->f[2] <= src1->f[2] ? src2->f[2] : src3->f[2];
708   dst->f[3] = src0->f[3] <= src1->f[3] ? src2->f[3] : src3->f[3];
709}
710
711static void
712micro_lt(
713   union tgsi_exec_channel *dst,
714   const union tgsi_exec_channel *src0,
715   const union tgsi_exec_channel *src1,
716   const union tgsi_exec_channel *src2,
717   const union tgsi_exec_channel *src3 )
718{
719   dst->f[0] = src0->f[0] < src1->f[0] ? src2->f[0] : src3->f[0];
720   dst->f[1] = src0->f[1] < src1->f[1] ? src2->f[1] : src3->f[1];
721   dst->f[2] = src0->f[2] < src1->f[2] ? src2->f[2] : src3->f[2];
722   dst->f[3] = src0->f[3] < src1->f[3] ? src2->f[3] : src3->f[3];
723}
724
725#if 0
726static void
727micro_ilt(
728   union tgsi_exec_channel *dst,
729   const union tgsi_exec_channel *src0,
730   const union tgsi_exec_channel *src1,
731   const union tgsi_exec_channel *src2,
732   const union tgsi_exec_channel *src3 )
733{
734   dst->i[0] = src0->i[0] < src1->i[0] ? src2->i[0] : src3->i[0];
735   dst->i[1] = src0->i[1] < src1->i[1] ? src2->i[1] : src3->i[1];
736   dst->i[2] = src0->i[2] < src1->i[2] ? src2->i[2] : src3->i[2];
737   dst->i[3] = src0->i[3] < src1->i[3] ? src2->i[3] : src3->i[3];
738}
739#endif
740
741#if 0
742static void
743micro_ult(
744   union tgsi_exec_channel *dst,
745   const union tgsi_exec_channel *src0,
746   const union tgsi_exec_channel *src1,
747   const union tgsi_exec_channel *src2,
748   const union tgsi_exec_channel *src3 )
749{
750   dst->u[0] = src0->u[0] < src1->u[0] ? src2->u[0] : src3->u[0];
751   dst->u[1] = src0->u[1] < src1->u[1] ? src2->u[1] : src3->u[1];
752   dst->u[2] = src0->u[2] < src1->u[2] ? src2->u[2] : src3->u[2];
753   dst->u[3] = src0->u[3] < src1->u[3] ? src2->u[3] : src3->u[3];
754}
755#endif
756
757static void
758micro_max(
759   union tgsi_exec_channel *dst,
760   const union tgsi_exec_channel *src0,
761   const union tgsi_exec_channel *src1 )
762{
763   dst->f[0] = src0->f[0] > src1->f[0] ? src0->f[0] : src1->f[0];
764   dst->f[1] = src0->f[1] > src1->f[1] ? src0->f[1] : src1->f[1];
765   dst->f[2] = src0->f[2] > src1->f[2] ? src0->f[2] : src1->f[2];
766   dst->f[3] = src0->f[3] > src1->f[3] ? src0->f[3] : src1->f[3];
767}
768
769#if 0
770static void
771micro_imax(
772   union tgsi_exec_channel *dst,
773   const union tgsi_exec_channel *src0,
774   const union tgsi_exec_channel *src1 )
775{
776   dst->i[0] = src0->i[0] > src1->i[0] ? src0->i[0] : src1->i[0];
777   dst->i[1] = src0->i[1] > src1->i[1] ? src0->i[1] : src1->i[1];
778   dst->i[2] = src0->i[2] > src1->i[2] ? src0->i[2] : src1->i[2];
779   dst->i[3] = src0->i[3] > src1->i[3] ? src0->i[3] : src1->i[3];
780}
781#endif
782
783#if 0
784static void
785micro_umax(
786   union tgsi_exec_channel *dst,
787   const union tgsi_exec_channel *src0,
788   const union tgsi_exec_channel *src1 )
789{
790   dst->u[0] = src0->u[0] > src1->u[0] ? src0->u[0] : src1->u[0];
791   dst->u[1] = src0->u[1] > src1->u[1] ? src0->u[1] : src1->u[1];
792   dst->u[2] = src0->u[2] > src1->u[2] ? src0->u[2] : src1->u[2];
793   dst->u[3] = src0->u[3] > src1->u[3] ? src0->u[3] : src1->u[3];
794}
795#endif
796
797static void
798micro_min(
799   union tgsi_exec_channel *dst,
800   const union tgsi_exec_channel *src0,
801   const union tgsi_exec_channel *src1 )
802{
803   dst->f[0] = src0->f[0] < src1->f[0] ? src0->f[0] : src1->f[0];
804   dst->f[1] = src0->f[1] < src1->f[1] ? src0->f[1] : src1->f[1];
805   dst->f[2] = src0->f[2] < src1->f[2] ? src0->f[2] : src1->f[2];
806   dst->f[3] = src0->f[3] < src1->f[3] ? src0->f[3] : src1->f[3];
807}
808
809#if 0
810static void
811micro_imin(
812   union tgsi_exec_channel *dst,
813   const union tgsi_exec_channel *src0,
814   const union tgsi_exec_channel *src1 )
815{
816   dst->i[0] = src0->i[0] < src1->i[0] ? src0->i[0] : src1->i[0];
817   dst->i[1] = src0->i[1] < src1->i[1] ? src0->i[1] : src1->i[1];
818   dst->i[2] = src0->i[2] < src1->i[2] ? src0->i[2] : src1->i[2];
819   dst->i[3] = src0->i[3] < src1->i[3] ? src0->i[3] : src1->i[3];
820}
821#endif
822
823#if 0
824static void
825micro_umin(
826   union tgsi_exec_channel *dst,
827   const union tgsi_exec_channel *src0,
828   const union tgsi_exec_channel *src1 )
829{
830   dst->u[0] = src0->u[0] < src1->u[0] ? src0->u[0] : src1->u[0];
831   dst->u[1] = src0->u[1] < src1->u[1] ? src0->u[1] : src1->u[1];
832   dst->u[2] = src0->u[2] < src1->u[2] ? src0->u[2] : src1->u[2];
833   dst->u[3] = src0->u[3] < src1->u[3] ? src0->u[3] : src1->u[3];
834}
835#endif
836
837#if 0
838static void
839micro_umod(
840   union tgsi_exec_channel *dst,
841   const union tgsi_exec_channel *src0,
842   const union tgsi_exec_channel *src1 )
843{
844   dst->u[0] = src0->u[0] % src1->u[0];
845   dst->u[1] = src0->u[1] % src1->u[1];
846   dst->u[2] = src0->u[2] % src1->u[2];
847   dst->u[3] = src0->u[3] % src1->u[3];
848}
849#endif
850
851static void
852micro_mul(
853   union tgsi_exec_channel *dst,
854   const union tgsi_exec_channel *src0,
855   const union tgsi_exec_channel *src1 )
856{
857   dst->f[0] = src0->f[0] * src1->f[0];
858   dst->f[1] = src0->f[1] * src1->f[1];
859   dst->f[2] = src0->f[2] * src1->f[2];
860   dst->f[3] = src0->f[3] * src1->f[3];
861}
862
863#if 0
864static void
865micro_imul(
866   union tgsi_exec_channel *dst,
867   const union tgsi_exec_channel *src0,
868   const union tgsi_exec_channel *src1 )
869{
870   dst->i[0] = src0->i[0] * src1->i[0];
871   dst->i[1] = src0->i[1] * src1->i[1];
872   dst->i[2] = src0->i[2] * src1->i[2];
873   dst->i[3] = src0->i[3] * src1->i[3];
874}
875#endif
876
877#if 0
878static void
879micro_imul64(
880   union tgsi_exec_channel *dst0,
881   union tgsi_exec_channel *dst1,
882   const union tgsi_exec_channel *src0,
883   const union tgsi_exec_channel *src1 )
884{
885   dst1->i[0] = src0->i[0] * src1->i[0];
886   dst1->i[1] = src0->i[1] * src1->i[1];
887   dst1->i[2] = src0->i[2] * src1->i[2];
888   dst1->i[3] = src0->i[3] * src1->i[3];
889   dst0->i[0] = 0;
890   dst0->i[1] = 0;
891   dst0->i[2] = 0;
892   dst0->i[3] = 0;
893}
894#endif
895
896#if 0
897static void
898micro_umul64(
899   union tgsi_exec_channel *dst0,
900   union tgsi_exec_channel *dst1,
901   const union tgsi_exec_channel *src0,
902   const union tgsi_exec_channel *src1 )
903{
904   dst1->u[0] = src0->u[0] * src1->u[0];
905   dst1->u[1] = src0->u[1] * src1->u[1];
906   dst1->u[2] = src0->u[2] * src1->u[2];
907   dst1->u[3] = src0->u[3] * src1->u[3];
908   dst0->u[0] = 0;
909   dst0->u[1] = 0;
910   dst0->u[2] = 0;
911   dst0->u[3] = 0;
912}
913#endif
914
915
916#if 0
917static void
918micro_movc(
919   union tgsi_exec_channel *dst,
920   const union tgsi_exec_channel *src0,
921   const union tgsi_exec_channel *src1,
922   const union tgsi_exec_channel *src2 )
923{
924   dst->u[0] = src0->u[0] ? src1->u[0] : src2->u[0];
925   dst->u[1] = src0->u[1] ? src1->u[1] : src2->u[1];
926   dst->u[2] = src0->u[2] ? src1->u[2] : src2->u[2];
927   dst->u[3] = src0->u[3] ? src1->u[3] : src2->u[3];
928}
929#endif
930
931static void
932micro_neg(
933   union tgsi_exec_channel *dst,
934   const union tgsi_exec_channel *src )
935{
936   dst->f[0] = -src->f[0];
937   dst->f[1] = -src->f[1];
938   dst->f[2] = -src->f[2];
939   dst->f[3] = -src->f[3];
940}
941
942#if 0
943static void
944micro_ineg(
945   union tgsi_exec_channel *dst,
946   const union tgsi_exec_channel *src )
947{
948   dst->i[0] = -src->i[0];
949   dst->i[1] = -src->i[1];
950   dst->i[2] = -src->i[2];
951   dst->i[3] = -src->i[3];
952}
953#endif
954
955static void
956micro_not(
957   union tgsi_exec_channel *dst,
958   const union tgsi_exec_channel *src )
959{
960   dst->u[0] = ~src->u[0];
961   dst->u[1] = ~src->u[1];
962   dst->u[2] = ~src->u[2];
963   dst->u[3] = ~src->u[3];
964}
965
966static void
967micro_or(
968   union tgsi_exec_channel *dst,
969   const union tgsi_exec_channel *src0,
970   const union tgsi_exec_channel *src1 )
971{
972   dst->u[0] = src0->u[0] | src1->u[0];
973   dst->u[1] = src0->u[1] | src1->u[1];
974   dst->u[2] = src0->u[2] | src1->u[2];
975   dst->u[3] = src0->u[3] | src1->u[3];
976}
977
978static void
979micro_pow(
980   union tgsi_exec_channel *dst,
981   const union tgsi_exec_channel *src0,
982   const union tgsi_exec_channel *src1 )
983{
984#if FAST_MATH
985   dst->f[0] = util_fast_pow( src0->f[0], src1->f[0] );
986   dst->f[1] = util_fast_pow( src0->f[1], src1->f[1] );
987   dst->f[2] = util_fast_pow( src0->f[2], src1->f[2] );
988   dst->f[3] = util_fast_pow( src0->f[3], src1->f[3] );
989#else
990   dst->f[0] = powf( src0->f[0], src1->f[0] );
991   dst->f[1] = powf( src0->f[1], src1->f[1] );
992   dst->f[2] = powf( src0->f[2], src1->f[2] );
993   dst->f[3] = powf( src0->f[3], src1->f[3] );
994#endif
995}
996
997static void
998micro_rnd(
999   union tgsi_exec_channel *dst,
1000   const union tgsi_exec_channel *src )
1001{
1002   dst->f[0] = floorf( src->f[0] + 0.5f );
1003   dst->f[1] = floorf( src->f[1] + 0.5f );
1004   dst->f[2] = floorf( src->f[2] + 0.5f );
1005   dst->f[3] = floorf( src->f[3] + 0.5f );
1006}
1007
1008static void
1009micro_sgn(
1010   union tgsi_exec_channel *dst,
1011   const union tgsi_exec_channel *src )
1012{
1013   dst->f[0] = src->f[0] < 0.0f ? -1.0f : src->f[0] > 0.0f ? 1.0f : 0.0f;
1014   dst->f[1] = src->f[1] < 0.0f ? -1.0f : src->f[1] > 0.0f ? 1.0f : 0.0f;
1015   dst->f[2] = src->f[2] < 0.0f ? -1.0f : src->f[2] > 0.0f ? 1.0f : 0.0f;
1016   dst->f[3] = src->f[3] < 0.0f ? -1.0f : src->f[3] > 0.0f ? 1.0f : 0.0f;
1017}
1018
1019static void
1020micro_shl(
1021   union tgsi_exec_channel *dst,
1022   const union tgsi_exec_channel *src0,
1023   const union tgsi_exec_channel *src1 )
1024{
1025   dst->i[0] = src0->i[0] << src1->i[0];
1026   dst->i[1] = src0->i[1] << src1->i[1];
1027   dst->i[2] = src0->i[2] << src1->i[2];
1028   dst->i[3] = src0->i[3] << src1->i[3];
1029}
1030
1031static void
1032micro_ishr(
1033   union tgsi_exec_channel *dst,
1034   const union tgsi_exec_channel *src0,
1035   const union tgsi_exec_channel *src1 )
1036{
1037   dst->i[0] = src0->i[0] >> src1->i[0];
1038   dst->i[1] = src0->i[1] >> src1->i[1];
1039   dst->i[2] = src0->i[2] >> src1->i[2];
1040   dst->i[3] = src0->i[3] >> src1->i[3];
1041}
1042
1043static void
1044micro_trunc(
1045   union tgsi_exec_channel *dst,
1046   const union tgsi_exec_channel *src0 )
1047{
1048   dst->f[0] = (float) (int) src0->f[0];
1049   dst->f[1] = (float) (int) src0->f[1];
1050   dst->f[2] = (float) (int) src0->f[2];
1051   dst->f[3] = (float) (int) src0->f[3];
1052}
1053
#if 0
/**
 * Per-lane right shift on the unsigned integer view:
 * dst.u[i] = src0.u[i] >> src1.u[i].  Currently compiled out.
 */
static void
micro_ushr(
   union tgsi_exec_channel *dst,
   const union tgsi_exec_channel *src0,
   const union tgsi_exec_channel *src1 )
{
   unsigned lane;

   for (lane = 0; lane < 4; lane++) {
      dst->u[lane] = src0->u[lane] >> src1->u[lane];
   }
}
#endif
1067
1068static void
1069micro_sin(
1070   union tgsi_exec_channel *dst,
1071   const union tgsi_exec_channel *src )
1072{
1073   dst->f[0] = sinf( src->f[0] );
1074   dst->f[1] = sinf( src->f[1] );
1075   dst->f[2] = sinf( src->f[2] );
1076   dst->f[3] = sinf( src->f[3] );
1077}
1078
1079static void
1080micro_sqrt( union tgsi_exec_channel *dst,
1081            const union tgsi_exec_channel *src )
1082{
1083   dst->f[0] = sqrtf( src->f[0] );
1084   dst->f[1] = sqrtf( src->f[1] );
1085   dst->f[2] = sqrtf( src->f[2] );
1086   dst->f[3] = sqrtf( src->f[3] );
1087}
1088
1089static void
1090micro_sub(
1091   union tgsi_exec_channel *dst,
1092   const union tgsi_exec_channel *src0,
1093   const union tgsi_exec_channel *src1 )
1094{
1095   dst->f[0] = src0->f[0] - src1->f[0];
1096   dst->f[1] = src0->f[1] - src1->f[1];
1097   dst->f[2] = src0->f[2] - src1->f[2];
1098   dst->f[3] = src0->f[3] - src1->f[3];
1099}
1100
#if 0
/**
 * Per-lane unsigned-integer to float conversion:
 * dst.f[i] = (float) src.u[i].  Currently compiled out.
 */
static void
micro_u2f(
   union tgsi_exec_channel *dst,
   const union tgsi_exec_channel *src )
{
   unsigned lane;

   for (lane = 0; lane < 4; lane++) {
      dst->f[lane] = (float) src->u[lane];
   }
}
#endif
1113
1114static void
1115micro_xor(
1116   union tgsi_exec_channel *dst,
1117   const union tgsi_exec_channel *src0,
1118   const union tgsi_exec_channel *src1 )
1119{
1120   dst->u[0] = src0->u[0] ^ src1->u[0];
1121   dst->u[1] = src0->u[1] ^ src1->u[1];
1122   dst->u[2] = src0->u[2] ^ src1->u[2];
1123   dst->u[3] = src0->u[3] ^ src1->u[3];
1124}
1125
1126static void
1127fetch_src_file_channel(
1128   const struct tgsi_exec_machine *mach,
1129   const uint file,
1130   const uint swizzle,
1131   const union tgsi_exec_channel *index,
1132   union tgsi_exec_channel *chan )
1133{
1134   switch( swizzle ) {
1135   case TGSI_SWIZZLE_X:
1136   case TGSI_SWIZZLE_Y:
1137   case TGSI_SWIZZLE_Z:
1138   case TGSI_SWIZZLE_W:
1139      switch( file ) {
1140      case TGSI_FILE_CONSTANT:
1141         assert(mach->Consts);
1142         if (index->i[0] < 0)
1143            chan->f[0] = 0.0f;
1144         else
1145            chan->f[0] = mach->Consts[index->i[0]][swizzle];
1146         if (index->i[1] < 0)
1147            chan->f[1] = 0.0f;
1148         else
1149            chan->f[1] = mach->Consts[index->i[1]][swizzle];
1150         if (index->i[2] < 0)
1151            chan->f[2] = 0.0f;
1152         else
1153            chan->f[2] = mach->Consts[index->i[2]][swizzle];
1154         if (index->i[3] < 0)
1155            chan->f[3] = 0.0f;
1156         else
1157            chan->f[3] = mach->Consts[index->i[3]][swizzle];
1158         break;
1159
1160      case TGSI_FILE_INPUT:
1161         chan->u[0] = mach->Inputs[index->i[0]].xyzw[swizzle].u[0];
1162         chan->u[1] = mach->Inputs[index->i[1]].xyzw[swizzle].u[1];
1163         chan->u[2] = mach->Inputs[index->i[2]].xyzw[swizzle].u[2];
1164         chan->u[3] = mach->Inputs[index->i[3]].xyzw[swizzle].u[3];
1165         break;
1166
1167      case TGSI_FILE_TEMPORARY:
1168         assert(index->i[0] < TGSI_EXEC_NUM_TEMPS);
1169         chan->u[0] = mach->Temps[index->i[0]].xyzw[swizzle].u[0];
1170         chan->u[1] = mach->Temps[index->i[1]].xyzw[swizzle].u[1];
1171         chan->u[2] = mach->Temps[index->i[2]].xyzw[swizzle].u[2];
1172         chan->u[3] = mach->Temps[index->i[3]].xyzw[swizzle].u[3];
1173         break;
1174
1175      case TGSI_FILE_IMMEDIATE:
1176         assert( index->i[0] < (int) mach->ImmLimit );
1177         chan->f[0] = mach->Imms[index->i[0]][swizzle];
1178         assert( index->i[1] < (int) mach->ImmLimit );
1179         chan->f[1] = mach->Imms[index->i[1]][swizzle];
1180         assert( index->i[2] < (int) mach->ImmLimit );
1181         chan->f[2] = mach->Imms[index->i[2]][swizzle];
1182         assert( index->i[3] < (int) mach->ImmLimit );
1183         chan->f[3] = mach->Imms[index->i[3]][swizzle];
1184         break;
1185
1186      case TGSI_FILE_ADDRESS:
1187         chan->u[0] = mach->Addrs[index->i[0]].xyzw[swizzle].u[0];
1188         chan->u[1] = mach->Addrs[index->i[1]].xyzw[swizzle].u[1];
1189         chan->u[2] = mach->Addrs[index->i[2]].xyzw[swizzle].u[2];
1190         chan->u[3] = mach->Addrs[index->i[3]].xyzw[swizzle].u[3];
1191         break;
1192
1193      case TGSI_FILE_PREDICATE:
1194         assert(index->i[0] < TGSI_EXEC_NUM_PREDS);
1195         assert(index->i[1] < TGSI_EXEC_NUM_PREDS);
1196         assert(index->i[2] < TGSI_EXEC_NUM_PREDS);
1197         assert(index->i[3] < TGSI_EXEC_NUM_PREDS);
1198         chan->u[0] = mach->Predicates[0].xyzw[swizzle].u[0];
1199         chan->u[1] = mach->Predicates[0].xyzw[swizzle].u[1];
1200         chan->u[2] = mach->Predicates[0].xyzw[swizzle].u[2];
1201         chan->u[3] = mach->Predicates[0].xyzw[swizzle].u[3];
1202         break;
1203
1204      case TGSI_FILE_OUTPUT:
1205         /* vertex/fragment output vars can be read too */
1206         chan->u[0] = mach->Outputs[index->i[0]].xyzw[swizzle].u[0];
1207         chan->u[1] = mach->Outputs[index->i[1]].xyzw[swizzle].u[1];
1208         chan->u[2] = mach->Outputs[index->i[2]].xyzw[swizzle].u[2];
1209         chan->u[3] = mach->Outputs[index->i[3]].xyzw[swizzle].u[3];
1210         break;
1211
1212      default:
1213         assert( 0 );
1214      }
1215      break;
1216
1217   default:
1218      assert( 0 );
1219   }
1220}
1221
1222static void
1223fetch_source(
1224   const struct tgsi_exec_machine *mach,
1225   union tgsi_exec_channel *chan,
1226   const struct tgsi_full_src_register *reg,
1227   const uint chan_index )
1228{
1229   union tgsi_exec_channel index;
1230   uint swizzle;
1231
1232   /* We start with a direct index into a register file.
1233    *
1234    *    file[1],
1235    *    where:
1236    *       file = SrcRegister.File
1237    *       [1] = SrcRegister.Index
1238    */
1239   index.i[0] =
1240   index.i[1] =
1241   index.i[2] =
1242   index.i[3] = reg->SrcRegister.Index;
1243
1244   /* There is an extra source register that indirectly subscripts
1245    * a register file. The direct index now becomes an offset
1246    * that is being added to the indirect register.
1247    *
1248    *    file[ind[2].x+1],
1249    *    where:
1250    *       ind = SrcRegisterInd.File
1251    *       [2] = SrcRegisterInd.Index
1252    *       .x = SrcRegisterInd.SwizzleX
1253    */
1254   if (reg->SrcRegister.Indirect) {
1255      union tgsi_exec_channel index2;
1256      union tgsi_exec_channel indir_index;
1257      const uint execmask = mach->ExecMask;
1258      uint i;
1259
1260      /* which address register (always zero now) */
1261      index2.i[0] =
1262      index2.i[1] =
1263      index2.i[2] =
1264      index2.i[3] = reg->SrcRegisterInd.Index;
1265
1266      /* get current value of address register[swizzle] */
1267      swizzle = tgsi_util_get_src_register_swizzle( &reg->SrcRegisterInd, CHAN_X );
1268      fetch_src_file_channel(
1269         mach,
1270         reg->SrcRegisterInd.File,
1271         swizzle,
1272         &index2,
1273         &indir_index );
1274
1275      /* add value of address register to the offset */
1276      index.i[0] += (int) indir_index.f[0];
1277      index.i[1] += (int) indir_index.f[1];
1278      index.i[2] += (int) indir_index.f[2];
1279      index.i[3] += (int) indir_index.f[3];
1280
1281      /* for disabled execution channels, zero-out the index to
1282       * avoid using a potential garbage value.
1283       */
1284      for (i = 0; i < QUAD_SIZE; i++) {
1285         if ((execmask & (1 << i)) == 0)
1286            index.i[i] = 0;
1287      }
1288   }
1289
1290   /* There is an extra source register that is a second
1291    * subscript to a register file. Effectively it means that
1292    * the register file is actually a 2D array of registers.
1293    *
1294    *    file[1][3] == file[1*sizeof(file[1])+3],
1295    *    where:
1296    *       [3] = SrcRegisterDim.Index
1297    */
1298   if (reg->SrcRegister.Dimension) {
1299      /* The size of the first-order array depends on the register file type.
1300       * We need to multiply the index to the first array to get an effective,
1301       * "flat" index that points to the beginning of the second-order array.
1302       */
1303      switch (reg->SrcRegister.File) {
1304      case TGSI_FILE_INPUT:
1305         index.i[0] *= TGSI_EXEC_MAX_INPUT_ATTRIBS;
1306         index.i[1] *= TGSI_EXEC_MAX_INPUT_ATTRIBS;
1307         index.i[2] *= TGSI_EXEC_MAX_INPUT_ATTRIBS;
1308         index.i[3] *= TGSI_EXEC_MAX_INPUT_ATTRIBS;
1309         break;
1310      case TGSI_FILE_CONSTANT:
1311         index.i[0] *= TGSI_EXEC_MAX_CONST_BUFFER;
1312         index.i[1] *= TGSI_EXEC_MAX_CONST_BUFFER;
1313         index.i[2] *= TGSI_EXEC_MAX_CONST_BUFFER;
1314         index.i[3] *= TGSI_EXEC_MAX_CONST_BUFFER;
1315         break;
1316      default:
1317         assert( 0 );
1318      }
1319
1320      index.i[0] += reg->SrcRegisterDim.Index;
1321      index.i[1] += reg->SrcRegisterDim.Index;
1322      index.i[2] += reg->SrcRegisterDim.Index;
1323      index.i[3] += reg->SrcRegisterDim.Index;
1324
1325      /* Again, the second subscript index can be addressed indirectly
1326       * identically to the first one.
1327       * Nothing stops us from indirectly addressing the indirect register,
1328       * but there is no need for that, so we won't exercise it.
1329       *
1330       *    file[1][ind[4].y+3],
1331       *    where:
1332       *       ind = SrcRegisterDimInd.File
1333       *       [4] = SrcRegisterDimInd.Index
1334       *       .y = SrcRegisterDimInd.SwizzleX
1335       */
1336      if (reg->SrcRegisterDim.Indirect) {
1337         union tgsi_exec_channel index2;
1338         union tgsi_exec_channel indir_index;
1339         const uint execmask = mach->ExecMask;
1340         uint i;
1341
1342         index2.i[0] =
1343         index2.i[1] =
1344         index2.i[2] =
1345         index2.i[3] = reg->SrcRegisterDimInd.Index;
1346
1347         swizzle = tgsi_util_get_src_register_swizzle( &reg->SrcRegisterDimInd, CHAN_X );
1348         fetch_src_file_channel(
1349            mach,
1350            reg->SrcRegisterDimInd.File,
1351            swizzle,
1352            &index2,
1353            &indir_index );
1354
1355         index.i[0] += (int) indir_index.f[0];
1356         index.i[1] += (int) indir_index.f[1];
1357         index.i[2] += (int) indir_index.f[2];
1358         index.i[3] += (int) indir_index.f[3];
1359
1360         /* for disabled execution channels, zero-out the index to
1361          * avoid using a potential garbage value.
1362          */
1363         for (i = 0; i < QUAD_SIZE; i++) {
1364            if ((execmask & (1 << i)) == 0)
1365               index.i[i] = 0;
1366         }
1367      }
1368
1369      /* If by any chance there was a need for a 3D array of register
1370       * files, we would have to check whether SrcRegisterDim is followed
1371       * by a dimension register and continue the saga.
1372       */
1373   }
1374
1375   swizzle = tgsi_util_get_full_src_register_swizzle( reg, chan_index );
1376   fetch_src_file_channel(
1377      mach,
1378      reg->SrcRegister.File,
1379      swizzle,
1380      &index,
1381      chan );
1382
1383   switch (tgsi_util_get_full_src_register_sign_mode( reg, chan_index )) {
1384   case TGSI_UTIL_SIGN_CLEAR:
1385      micro_abs( chan, chan );
1386      break;
1387
1388   case TGSI_UTIL_SIGN_SET:
1389      micro_abs( chan, chan );
1390      micro_neg( chan, chan );
1391      break;
1392
1393   case TGSI_UTIL_SIGN_TOGGLE:
1394      micro_neg( chan, chan );
1395      break;
1396
1397   case TGSI_UTIL_SIGN_KEEP:
1398      break;
1399   }
1400
1401   if (reg->SrcRegisterExtMod.Complement) {
1402      micro_sub( chan, &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], chan );
1403   }
1404}
1405
1406static void
1407store_dest(
1408   struct tgsi_exec_machine *mach,
1409   const union tgsi_exec_channel *chan,
1410   const struct tgsi_full_dst_register *reg,
1411   const struct tgsi_full_instruction *inst,
1412   uint chan_index )
1413{
1414   uint i;
1415   union tgsi_exec_channel null;
1416   union tgsi_exec_channel *dst;
1417   uint execmask = mach->ExecMask;
1418   int offset = 0;  /* indirection offset */
1419   int index;
1420
1421#ifdef DEBUG
1422   check_inf_or_nan(chan);
1423#endif
1424
1425   /* There is an extra source register that indirectly subscripts
1426    * a register file. The direct index now becomes an offset
1427    * that is being added to the indirect register.
1428    *
1429    *    file[ind[2].x+1],
1430    *    where:
1431    *       ind = DstRegisterInd.File
1432    *       [2] = DstRegisterInd.Index
1433    *       .x = DstRegisterInd.SwizzleX
1434    */
1435   if (reg->DstRegister.Indirect) {
1436      union tgsi_exec_channel index;
1437      union tgsi_exec_channel indir_index;
1438      uint swizzle;
1439
1440      /* which address register (always zero for now) */
1441      index.i[0] =
1442      index.i[1] =
1443      index.i[2] =
1444      index.i[3] = reg->DstRegisterInd.Index;
1445
1446      /* get current value of address register[swizzle] */
1447      swizzle = tgsi_util_get_src_register_swizzle( &reg->DstRegisterInd, CHAN_X );
1448
1449      /* fetch values from the address/indirection register */
1450      fetch_src_file_channel(
1451         mach,
1452         reg->DstRegisterInd.File,
1453         swizzle,
1454         &index,
1455         &indir_index );
1456
1457      /* save indirection offset */
1458      offset = (int) indir_index.f[0];
1459   }
1460
1461   switch (reg->DstRegister.File) {
1462   case TGSI_FILE_NULL:
1463      dst = &null;
1464      break;
1465
1466   case TGSI_FILE_OUTPUT:
1467      index = mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0]
1468         + reg->DstRegister.Index;
1469      dst = &mach->Outputs[offset + index].xyzw[chan_index];
1470      break;
1471
1472   case TGSI_FILE_TEMPORARY:
1473      index = reg->DstRegister.Index;
1474      assert( index < TGSI_EXEC_NUM_TEMPS );
1475      dst = &mach->Temps[offset + index].xyzw[chan_index];
1476      break;
1477
1478   case TGSI_FILE_ADDRESS:
1479      index = reg->DstRegister.Index;
1480      dst = &mach->Addrs[index].xyzw[chan_index];
1481      break;
1482
1483   case TGSI_FILE_LOOP:
1484      assert(reg->DstRegister.Index == 0);
1485      assert(mach->LoopCounterStackTop > 0);
1486      assert(chan_index == CHAN_X);
1487      dst = &mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[chan_index];
1488      break;
1489
1490   case TGSI_FILE_PREDICATE:
1491      index = reg->DstRegister.Index;
1492      assert(index < TGSI_EXEC_NUM_PREDS);
1493      dst = &mach->Predicates[index].xyzw[chan_index];
1494      break;
1495
1496   default:
1497      assert( 0 );
1498      return;
1499   }
1500
1501   if (inst->Instruction.Predicate) {
1502      uint swizzle;
1503      union tgsi_exec_channel *pred;
1504
1505      switch (chan_index) {
1506      case CHAN_X:
1507         swizzle = inst->InstructionPredicate.SwizzleX;
1508         break;
1509      case CHAN_Y:
1510         swizzle = inst->InstructionPredicate.SwizzleY;
1511         break;
1512      case CHAN_Z:
1513         swizzle = inst->InstructionPredicate.SwizzleZ;
1514         break;
1515      case CHAN_W:
1516         swizzle = inst->InstructionPredicate.SwizzleW;
1517         break;
1518      default:
1519         assert(0);
1520         return;
1521      }
1522
1523      assert(inst->InstructionPredicate.Index == 0);
1524
1525      pred = &mach->Predicates[inst->InstructionPredicate.Index].xyzw[swizzle];
1526
1527      if (inst->InstructionPredicate.Negate) {
1528         for (i = 0; i < QUAD_SIZE; i++) {
1529            if (pred->u[i]) {
1530               execmask &= ~(1 << i);
1531            }
1532         }
1533      } else {
1534         for (i = 0; i < QUAD_SIZE; i++) {
1535            if (!pred->u[i]) {
1536               execmask &= ~(1 << i);
1537            }
1538         }
1539      }
1540   }
1541
1542   switch (inst->Instruction.Saturate) {
1543   case TGSI_SAT_NONE:
1544      for (i = 0; i < QUAD_SIZE; i++)
1545         if (execmask & (1 << i))
1546            dst->i[i] = chan->i[i];
1547      break;
1548
1549   case TGSI_SAT_ZERO_ONE:
1550      for (i = 0; i < QUAD_SIZE; i++)
1551         if (execmask & (1 << i)) {
1552            if (chan->f[i] < 0.0f)
1553               dst->f[i] = 0.0f;
1554            else if (chan->f[i] > 1.0f)
1555               dst->f[i] = 1.0f;
1556            else
1557               dst->i[i] = chan->i[i];
1558         }
1559      break;
1560
1561   case TGSI_SAT_MINUS_PLUS_ONE:
1562      for (i = 0; i < QUAD_SIZE; i++)
1563         if (execmask & (1 << i)) {
1564            if (chan->f[i] < -1.0f)
1565               dst->f[i] = -1.0f;
1566            else if (chan->f[i] > 1.0f)
1567               dst->f[i] = 1.0f;
1568            else
1569               dst->i[i] = chan->i[i];
1570         }
1571      break;
1572
1573   default:
1574      assert( 0 );
1575   }
1576}
1577
/*
 * Shorthands for fetch_source() and store_dest().  Both expansions refer
 * to variables named `mach` and `inst`, which must be in scope at the
 * point of use.  INDEX selects the instruction's source (FETCH) or
 * destination (STORE) register; CHAN is the channel to read or write.
 */
#define FETCH(VAL, INDEX, CHAN) \
   fetch_source(mach, (VAL), &inst->FullSrcRegisters[(INDEX)], (CHAN))

#define STORE(VAL, INDEX, CHAN) \
   store_dest(mach, (VAL), &inst->FullDstRegisters[(INDEX)], inst, (CHAN))
1583
1584
1585/**
1586 * Execute ARB-style KIL which is predicated by a src register.
1587 * Kill fragment if any of the four values is less than zero.
1588 */
1589static void
1590exec_kil(struct tgsi_exec_machine *mach,
1591         const struct tgsi_full_instruction *inst)
1592{
1593   uint uniquemask;
1594   uint chan_index;
1595   uint kilmask = 0; /* bit 0 = pixel 0, bit 1 = pixel 1, etc */
1596   union tgsi_exec_channel r[1];
1597
1598   /* This mask stores component bits that were already tested. */
1599   uniquemask = 0;
1600
1601   for (chan_index = 0; chan_index < 4; chan_index++)
1602   {
1603      uint swizzle;
1604      uint i;
1605
1606      /* unswizzle channel */
1607      swizzle = tgsi_util_get_full_src_register_swizzle (
1608                        &inst->FullSrcRegisters[0],
1609                        chan_index);
1610
1611      /* check if the component has not been already tested */
1612      if (uniquemask & (1 << swizzle))
1613         continue;
1614      uniquemask |= 1 << swizzle;
1615
1616      FETCH(&r[0], 0, chan_index);
1617      for (i = 0; i < 4; i++)
1618         if (r[0].f[i] < 0.0f)
1619            kilmask |= 1 << i;
1620   }
1621
1622   mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] |= kilmask;
1623}
1624
1625/**
1626 * Execute NVIDIA-style KIL which is predicated by a condition code.
1627 * Kill fragment if the condition code is TRUE.
1628 */
1629static void
1630exec_kilp(struct tgsi_exec_machine *mach,
1631          const struct tgsi_full_instruction *inst)
1632{
1633   uint kilmask; /* bit 0 = pixel 0, bit 1 = pixel 1, etc */
1634
1635   /* "unconditional" kil */
1636   kilmask = mach->ExecMask;
1637   mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] |= kilmask;
1638}
1639
1640
1641/*
1642 * Fetch a four texture samples using STR texture coordinates.
1643 */
1644static void
1645fetch_texel( struct tgsi_sampler *sampler,
1646             const union tgsi_exec_channel *s,
1647             const union tgsi_exec_channel *t,
1648             const union tgsi_exec_channel *p,
1649             float lodbias,  /* XXX should be float[4] */
1650             union tgsi_exec_channel *r,
1651             union tgsi_exec_channel *g,
1652             union tgsi_exec_channel *b,
1653             union tgsi_exec_channel *a )
1654{
1655   uint j;
1656   float rgba[NUM_CHANNELS][QUAD_SIZE];
1657
1658   sampler->get_samples(sampler, s->f, t->f, p->f, lodbias, rgba);
1659
1660   for (j = 0; j < 4; j++) {
1661      r->f[j] = rgba[0][j];
1662      g->f[j] = rgba[1][j];
1663      b->f[j] = rgba[2][j];
1664      a->f[j] = rgba[3][j];
1665   }
1666}
1667
1668
1669static void
1670exec_tex(struct tgsi_exec_machine *mach,
1671         const struct tgsi_full_instruction *inst,
1672         boolean biasLod,
1673         boolean projected)
1674{
1675   const uint unit = inst->FullSrcRegisters[1].SrcRegister.Index;
1676   union tgsi_exec_channel r[4];
1677   uint chan_index;
1678   float lodBias;
1679
1680   /*   debug_printf("Sampler %u unit %u\n", sampler, unit); */
1681
1682   switch (inst->InstructionExtTexture.Texture) {
1683   case TGSI_TEXTURE_1D:
1684   case TGSI_TEXTURE_SHADOW1D:
1685
1686      FETCH(&r[0], 0, CHAN_X);
1687
1688      if (projected) {
1689         FETCH(&r[1], 0, CHAN_W);
1690         micro_div( &r[0], &r[0], &r[1] );
1691      }
1692
1693      if (biasLod) {
1694         FETCH(&r[1], 0, CHAN_W);
1695         lodBias = r[2].f[0];
1696      }
1697      else
1698         lodBias = 0.0;
1699
1700      fetch_texel(mach->Samplers[unit],
1701                  &r[0], &ZeroVec, &ZeroVec, lodBias,  /* S, T, P, BIAS */
1702                  &r[0], &r[1], &r[2], &r[3]); /* R, G, B, A */
1703      break;
1704
1705   case TGSI_TEXTURE_2D:
1706   case TGSI_TEXTURE_RECT:
1707   case TGSI_TEXTURE_SHADOW2D:
1708   case TGSI_TEXTURE_SHADOWRECT:
1709
1710      FETCH(&r[0], 0, CHAN_X);
1711      FETCH(&r[1], 0, CHAN_Y);
1712      FETCH(&r[2], 0, CHAN_Z);
1713
1714      if (projected) {
1715         FETCH(&r[3], 0, CHAN_W);
1716         micro_div( &r[0], &r[0], &r[3] );
1717         micro_div( &r[1], &r[1], &r[3] );
1718         micro_div( &r[2], &r[2], &r[3] );
1719      }
1720
1721      if (biasLod) {
1722         FETCH(&r[3], 0, CHAN_W);
1723         lodBias = r[3].f[0];
1724      }
1725      else
1726         lodBias = 0.0;
1727
1728      fetch_texel(mach->Samplers[unit],
1729                  &r[0], &r[1], &r[2], lodBias,  /* inputs */
1730                  &r[0], &r[1], &r[2], &r[3]);  /* outputs */
1731      break;
1732
1733   case TGSI_TEXTURE_3D:
1734   case TGSI_TEXTURE_CUBE:
1735
1736      FETCH(&r[0], 0, CHAN_X);
1737      FETCH(&r[1], 0, CHAN_Y);
1738      FETCH(&r[2], 0, CHAN_Z);
1739
1740      if (projected) {
1741         FETCH(&r[3], 0, CHAN_W);
1742         micro_div( &r[0], &r[0], &r[3] );
1743         micro_div( &r[1], &r[1], &r[3] );
1744         micro_div( &r[2], &r[2], &r[3] );
1745      }
1746
1747      if (biasLod) {
1748         FETCH(&r[3], 0, CHAN_W);
1749         lodBias = r[3].f[0];
1750      }
1751      else
1752         lodBias = 0.0;
1753
1754      fetch_texel(mach->Samplers[unit],
1755                  &r[0], &r[1], &r[2], lodBias,
1756                  &r[0], &r[1], &r[2], &r[3]);
1757      break;
1758
1759   default:
1760      assert (0);
1761   }
1762
1763   FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
1764      STORE( &r[chan_index], 0, chan_index );
1765   }
1766}
1767
1768static void
1769exec_txd(struct tgsi_exec_machine *mach,
1770         const struct tgsi_full_instruction *inst)
1771{
1772   const uint unit = inst->FullSrcRegisters[3].SrcRegister.Index;
1773   union tgsi_exec_channel r[4];
1774   uint chan_index;
1775
1776   /*
1777    * XXX: This is fake TXD -- the derivatives are not taken into account, yet.
1778    */
1779
1780   switch (inst->InstructionExtTexture.Texture) {
1781   case TGSI_TEXTURE_1D:
1782   case TGSI_TEXTURE_SHADOW1D:
1783
1784      FETCH(&r[0], 0, CHAN_X);
1785
1786      fetch_texel(mach->Samplers[unit],
1787                  &r[0], &ZeroVec, &ZeroVec, 0.0f,  /* S, T, P, BIAS */
1788                  &r[0], &r[1], &r[2], &r[3]);      /* R, G, B, A */
1789      break;
1790
1791   case TGSI_TEXTURE_2D:
1792   case TGSI_TEXTURE_RECT:
1793   case TGSI_TEXTURE_SHADOW2D:
1794   case TGSI_TEXTURE_SHADOWRECT:
1795
1796      FETCH(&r[0], 0, CHAN_X);
1797      FETCH(&r[1], 0, CHAN_Y);
1798      FETCH(&r[2], 0, CHAN_Z);
1799
1800      fetch_texel(mach->Samplers[unit],
1801                  &r[0], &r[1], &r[2], 0.0f,    /* inputs */
1802                  &r[0], &r[1], &r[2], &r[3]);  /* outputs */
1803      break;
1804
1805   case TGSI_TEXTURE_3D:
1806   case TGSI_TEXTURE_CUBE:
1807
1808      FETCH(&r[0], 0, CHAN_X);
1809      FETCH(&r[1], 0, CHAN_Y);
1810      FETCH(&r[2], 0, CHAN_Z);
1811
1812      fetch_texel(mach->Samplers[unit],
1813                  &r[0], &r[1], &r[2], 0.0f,
1814                  &r[0], &r[1], &r[2], &r[3]);
1815      break;
1816
1817   default:
1818      assert(0);
1819   }
1820
1821   FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
1822      STORE(&r[chan_index], 0, chan_index);
1823   }
1824}
1825
1826
1827/**
1828 * Evaluate a constant-valued coefficient at the position of the
1829 * current quad.
1830 */
1831static void
1832eval_constant_coef(
1833   struct tgsi_exec_machine *mach,
1834   unsigned attrib,
1835   unsigned chan )
1836{
1837   unsigned i;
1838
1839   for( i = 0; i < QUAD_SIZE; i++ ) {
1840      mach->Inputs[attrib].xyzw[chan].f[i] = mach->InterpCoefs[attrib].a0[chan];
1841   }
1842}
1843
1844/**
1845 * Evaluate a linear-valued coefficient at the position of the
1846 * current quad.
1847 */
1848static void
1849eval_linear_coef(
1850   struct tgsi_exec_machine *mach,
1851   unsigned attrib,
1852   unsigned chan )
1853{
1854   const float x = mach->QuadPos.xyzw[0].f[0];
1855   const float y = mach->QuadPos.xyzw[1].f[0];
1856   const float dadx = mach->InterpCoefs[attrib].dadx[chan];
1857   const float dady = mach->InterpCoefs[attrib].dady[chan];
1858   const float a0 = mach->InterpCoefs[attrib].a0[chan] + dadx * x + dady * y;
1859   mach->Inputs[attrib].xyzw[chan].f[0] = a0;
1860   mach->Inputs[attrib].xyzw[chan].f[1] = a0 + dadx;
1861   mach->Inputs[attrib].xyzw[chan].f[2] = a0 + dady;
1862   mach->Inputs[attrib].xyzw[chan].f[3] = a0 + dadx + dady;
1863}
1864
1865/**
1866 * Evaluate a perspective-valued coefficient at the position of the
1867 * current quad.
1868 */
1869static void
1870eval_perspective_coef(
1871   struct tgsi_exec_machine *mach,
1872   unsigned attrib,
1873   unsigned chan )
1874{
1875   const float x = mach->QuadPos.xyzw[0].f[0];
1876   const float y = mach->QuadPos.xyzw[1].f[0];
1877   const float dadx = mach->InterpCoefs[attrib].dadx[chan];
1878   const float dady = mach->InterpCoefs[attrib].dady[chan];
1879   const float a0 = mach->InterpCoefs[attrib].a0[chan] + dadx * x + dady * y;
1880   const float *w = mach->QuadPos.xyzw[3].f;
1881   /* divide by W here */
1882   mach->Inputs[attrib].xyzw[chan].f[0] = a0 / w[0];
1883   mach->Inputs[attrib].xyzw[chan].f[1] = (a0 + dadx) / w[1];
1884   mach->Inputs[attrib].xyzw[chan].f[2] = (a0 + dady) / w[2];
1885   mach->Inputs[attrib].xyzw[chan].f[3] = (a0 + dadx + dady) / w[3];
1886}
1887
1888
/** Signature shared by the eval_*_coef() attribute interpolators. */
typedef void (*eval_coef_func)( struct tgsi_exec_machine *mach,
                                unsigned attrib,
                                unsigned chan );
1893
1894static void
1895exec_declaration(struct tgsi_exec_machine *mach,
1896                 const struct tgsi_full_declaration *decl)
1897{
1898   if (mach->Processor == TGSI_PROCESSOR_FRAGMENT) {
1899      if (decl->Declaration.File == TGSI_FILE_INPUT) {
1900         uint first, last, mask;
1901
1902         first = decl->DeclarationRange.First;
1903         last = decl->DeclarationRange.Last;
1904         mask = decl->Declaration.UsageMask;
1905
1906         if (decl->Semantic.SemanticName == TGSI_SEMANTIC_POSITION) {
1907            assert(decl->Semantic.SemanticIndex == 0);
1908            assert(first == last);
1909            assert(mask = TGSI_WRITEMASK_XYZW);
1910
1911            mach->Inputs[first] = mach->QuadPos;
1912         } else if (decl->Semantic.SemanticName == TGSI_SEMANTIC_FACE) {
1913            uint i;
1914
1915            assert(decl->Semantic.SemanticIndex == 0);
1916            assert(first == last);
1917
1918            for (i = 0; i < QUAD_SIZE; i++) {
1919               mach->Inputs[first].xyzw[0].f[i] = mach->Face;
1920            }
1921         } else {
1922            eval_coef_func eval;
1923            uint i, j;
1924
1925            switch (decl->Declaration.Interpolate) {
1926            case TGSI_INTERPOLATE_CONSTANT:
1927               eval = eval_constant_coef;
1928               break;
1929
1930            case TGSI_INTERPOLATE_LINEAR:
1931               eval = eval_linear_coef;
1932               break;
1933
1934            case TGSI_INTERPOLATE_PERSPECTIVE:
1935               eval = eval_perspective_coef;
1936               break;
1937
1938            default:
1939               assert(0);
1940               return;
1941            }
1942
1943            for (j = 0; j < NUM_CHANNELS; j++) {
1944               if (mask & (1 << j)) {
1945                  for (i = first; i <= last; i++) {
1946                     eval(mach, i, j);
1947                  }
1948               }
1949            }
1950         }
1951      }
1952   }
1953}
1954
1955static void
1956exec_instruction(
1957   struct tgsi_exec_machine *mach,
1958   const struct tgsi_full_instruction *inst,
1959   int *pc )
1960{
1961   uint chan_index;
1962   union tgsi_exec_channel r[10];
1963   union tgsi_exec_channel d[8];
1964
1965   (*pc)++;
1966
1967   switch (inst->Instruction.Opcode) {
1968   case TGSI_OPCODE_ARL:
1969   case TGSI_OPCODE_FLR:
1970      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
1971         FETCH( &r[0], 0, chan_index );
1972         micro_flr(&d[chan_index], &r[0]);
1973      }
1974      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
1975         STORE(&d[chan_index], 0, chan_index);
1976      }
1977      break;
1978
1979   case TGSI_OPCODE_MOV:
1980      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
1981         FETCH(&d[chan_index], 0, chan_index);
1982      }
1983      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
1984         STORE(&d[chan_index], 0, chan_index);
1985      }
1986      break;
1987
1988   case TGSI_OPCODE_LIT:
1989      if (IS_CHANNEL_ENABLED( *inst, CHAN_Y ) || IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1990         FETCH( &r[0], 0, CHAN_X );
1991         if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1992            micro_max(&d[CHAN_Y], &r[0], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C]);
1993         }
1994
1995         if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1996            FETCH( &r[1], 0, CHAN_Y );
1997            micro_max( &r[1], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] );
1998
1999            FETCH( &r[2], 0, CHAN_W );
2000            micro_min( &r[2], &r[2], &mach->Temps[TEMP_128_I].xyzw[TEMP_128_C] );
2001            micro_max( &r[2], &r[2], &mach->Temps[TEMP_M128_I].xyzw[TEMP_M128_C] );
2002            micro_pow( &r[1], &r[1], &r[2] );
2003            micro_lt(&d[CHAN_Z], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &r[0], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C]);
2004         }
2005
2006         if (IS_CHANNEL_ENABLED(*inst, CHAN_Y)) {
2007            STORE(&d[CHAN_Y], 0, CHAN_Y);
2008         }
2009         if (IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2010            STORE(&d[CHAN_Z], 0, CHAN_Z);
2011         }
2012      }
2013      if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) {
2014         STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_X );
2015      }
2016      if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
2017         STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
2018      }
2019      break;
2020
2021   case TGSI_OPCODE_RCP:
2022   /* TGSI_OPCODE_RECIP */
2023      FETCH( &r[0], 0, CHAN_X );
2024      micro_div( &r[0], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &r[0] );
2025      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2026         STORE( &r[0], 0, chan_index );
2027      }
2028      break;
2029
2030   case TGSI_OPCODE_RSQ:
2031   /* TGSI_OPCODE_RECIPSQRT */
2032      FETCH( &r[0], 0, CHAN_X );
2033      micro_abs( &r[0], &r[0] );
2034      micro_sqrt( &r[0], &r[0] );
2035      micro_div( &r[0], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &r[0] );
2036      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2037         STORE( &r[0], 0, chan_index );
2038      }
2039      break;
2040
2041   case TGSI_OPCODE_EXP:
2042      FETCH( &r[0], 0, CHAN_X );
2043      micro_flr( &r[1], &r[0] );  /* r1 = floor(r0) */
2044      if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) {
2045         micro_exp2( &r[2], &r[1] );       /* r2 = 2 ^ r1 */
2046         STORE( &r[2], 0, CHAN_X );        /* store r2 */
2047      }
2048      if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
2049         micro_sub( &r[2], &r[0], &r[1] ); /* r2 = r0 - r1 */
2050         STORE( &r[2], 0, CHAN_Y );        /* store r2 */
2051      }
2052      if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
2053         micro_exp2( &r[2], &r[0] );       /* r2 = 2 ^ r0 */
2054         STORE( &r[2], 0, CHAN_Z );        /* store r2 */
2055      }
2056      if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
2057         STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
2058      }
2059      break;
2060
2061   case TGSI_OPCODE_LOG:
2062      FETCH( &r[0], 0, CHAN_X );
2063      micro_abs( &r[2], &r[0] );  /* r2 = abs(r0) */
2064      micro_lg2( &r[1], &r[2] );  /* r1 = lg2(r2) */
2065      micro_flr( &r[0], &r[1] );  /* r0 = floor(r1) */
2066      if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) {
2067         STORE( &r[0], 0, CHAN_X );
2068      }
2069      if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
2070         micro_exp2( &r[0], &r[0] );       /* r0 = 2 ^ r0 */
2071         micro_div( &r[0], &r[2], &r[0] ); /* r0 = r2 / r0 */
2072         STORE( &r[0], 0, CHAN_Y );
2073      }
2074      if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
2075         STORE( &r[1], 0, CHAN_Z );
2076      }
2077      if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
2078         STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
2079      }
2080      break;
2081
2082   case TGSI_OPCODE_MUL:
2083      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2084         FETCH(&r[0], 0, chan_index);
2085         FETCH(&r[1], 1, chan_index);
2086         micro_mul(&d[chan_index], &r[0], &r[1]);
2087      }
2088      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2089         STORE(&d[chan_index], 0, chan_index);
2090      }
2091      break;
2092
2093   case TGSI_OPCODE_ADD:
2094      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2095         FETCH( &r[0], 0, chan_index );
2096         FETCH( &r[1], 1, chan_index );
2097         micro_add(&d[chan_index], &r[0], &r[1]);
2098      }
2099      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2100         STORE(&d[chan_index], 0, chan_index);
2101      }
2102      break;
2103
2104   case TGSI_OPCODE_DP3:
2105   /* TGSI_OPCODE_DOT3 */
2106      FETCH( &r[0], 0, CHAN_X );
2107      FETCH( &r[1], 1, CHAN_X );
2108      micro_mul( &r[0], &r[0], &r[1] );
2109
2110      FETCH( &r[1], 0, CHAN_Y );
2111      FETCH( &r[2], 1, CHAN_Y );
2112      micro_mul( &r[1], &r[1], &r[2] );
2113      micro_add( &r[0], &r[0], &r[1] );
2114
2115      FETCH( &r[1], 0, CHAN_Z );
2116      FETCH( &r[2], 1, CHAN_Z );
2117      micro_mul( &r[1], &r[1], &r[2] );
2118      micro_add( &r[0], &r[0], &r[1] );
2119
2120      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2121         STORE( &r[0], 0, chan_index );
2122      }
2123      break;
2124
2125    case TGSI_OPCODE_DP4:
2126    /* TGSI_OPCODE_DOT4 */
2127       FETCH(&r[0], 0, CHAN_X);
2128       FETCH(&r[1], 1, CHAN_X);
2129
2130       micro_mul( &r[0], &r[0], &r[1] );
2131
2132       FETCH(&r[1], 0, CHAN_Y);
2133       FETCH(&r[2], 1, CHAN_Y);
2134
2135       micro_mul( &r[1], &r[1], &r[2] );
2136       micro_add( &r[0], &r[0], &r[1] );
2137
2138       FETCH(&r[1], 0, CHAN_Z);
2139       FETCH(&r[2], 1, CHAN_Z);
2140
2141       micro_mul( &r[1], &r[1], &r[2] );
2142       micro_add( &r[0], &r[0], &r[1] );
2143
2144       FETCH(&r[1], 0, CHAN_W);
2145       FETCH(&r[2], 1, CHAN_W);
2146
2147       micro_mul( &r[1], &r[1], &r[2] );
2148       micro_add( &r[0], &r[0], &r[1] );
2149
2150      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2151         STORE( &r[0], 0, chan_index );
2152      }
2153      break;
2154
2155   case TGSI_OPCODE_DST:
2156      if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
2157         FETCH( &r[0], 0, CHAN_Y );
2158         FETCH( &r[1], 1, CHAN_Y);
2159         micro_mul(&d[CHAN_Y], &r[0], &r[1]);
2160      }
2161      if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
2162         FETCH(&d[CHAN_Z], 0, CHAN_Z);
2163      }
2164      if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
2165         FETCH(&d[CHAN_W], 1, CHAN_W);
2166      }
2167
2168      if (IS_CHANNEL_ENABLED(*inst, CHAN_X)) {
2169         STORE(&mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_X);
2170      }
2171      if (IS_CHANNEL_ENABLED(*inst, CHAN_Y)) {
2172         STORE(&d[CHAN_Y], 0, CHAN_Y);
2173      }
2174      if (IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2175         STORE(&d[CHAN_Z], 0, CHAN_Z);
2176      }
2177      if (IS_CHANNEL_ENABLED(*inst, CHAN_W)) {
2178         STORE(&d[CHAN_W], 0, CHAN_W);
2179      }
2180      break;
2181
2182   case TGSI_OPCODE_MIN:
2183      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2184         FETCH(&r[0], 0, chan_index);
2185         FETCH(&r[1], 1, chan_index);
2186
2187         /* XXX use micro_min()?? */
2188         micro_lt(&d[chan_index], &r[0], &r[1], &r[0], &r[1]);
2189      }
2190      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2191         STORE(&d[chan_index], 0, chan_index);
2192      }
2193      break;
2194
2195   case TGSI_OPCODE_MAX:
2196      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2197         FETCH(&r[0], 0, chan_index);
2198         FETCH(&r[1], 1, chan_index);
2199
2200         /* XXX use micro_max()?? */
2201         micro_lt(&d[chan_index], &r[0], &r[1], &r[1], &r[0] );
2202      }
2203      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2204         STORE(&d[chan_index], 0, chan_index);
2205      }
2206      break;
2207
2208   case TGSI_OPCODE_SLT:
2209   /* TGSI_OPCODE_SETLT */
2210      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2211         FETCH( &r[0], 0, chan_index );
2212         FETCH( &r[1], 1, chan_index );
2213         micro_lt(&d[chan_index], &r[0], &r[1], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C]);
2214      }
2215      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2216         STORE(&d[chan_index], 0, chan_index);
2217      }
2218      break;
2219
2220   case TGSI_OPCODE_SGE:
2221   /* TGSI_OPCODE_SETGE */
2222      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2223         FETCH( &r[0], 0, chan_index );
2224         FETCH( &r[1], 1, chan_index );
2225         micro_le(&d[chan_index], &r[1], &r[0], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C]);
2226      }
2227      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2228         STORE(&d[chan_index], 0, chan_index);
2229      }
2230      break;
2231
2232   case TGSI_OPCODE_MAD:
2233   /* TGSI_OPCODE_MADD */
2234      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2235         FETCH( &r[0], 0, chan_index );
2236         FETCH( &r[1], 1, chan_index );
2237         micro_mul( &r[0], &r[0], &r[1] );
2238         FETCH( &r[1], 2, chan_index );
2239         micro_add(&d[chan_index], &r[0], &r[1]);
2240      }
2241      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2242         STORE(&d[chan_index], 0, chan_index);
2243      }
2244      break;
2245
2246   case TGSI_OPCODE_SUB:
2247      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2248         FETCH(&r[0], 0, chan_index);
2249         FETCH(&r[1], 1, chan_index);
2250         micro_sub(&d[chan_index], &r[0], &r[1]);
2251      }
2252      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2253         STORE(&d[chan_index], 0, chan_index);
2254      }
2255      break;
2256
2257   case TGSI_OPCODE_LRP:
2258      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2259         FETCH(&r[0], 0, chan_index);
2260         FETCH(&r[1], 1, chan_index);
2261         FETCH(&r[2], 2, chan_index);
2262         micro_sub( &r[1], &r[1], &r[2] );
2263         micro_mul( &r[0], &r[0], &r[1] );
2264         micro_add(&d[chan_index], &r[0], &r[2]);
2265      }
2266      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2267         STORE(&d[chan_index], 0, chan_index);
2268      }
2269      break;
2270
2271   case TGSI_OPCODE_CND:
2272      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2273         FETCH(&r[0], 0, chan_index);
2274         FETCH(&r[1], 1, chan_index);
2275         FETCH(&r[2], 2, chan_index);
2276         micro_lt(&d[chan_index], &mach->Temps[TEMP_HALF_I].xyzw[TEMP_HALF_C], &r[2], &r[0], &r[1]);
2277      }
2278      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2279         STORE(&d[chan_index], 0, chan_index);
2280      }
2281      break;
2282
2283   case TGSI_OPCODE_DP2A:
2284      FETCH( &r[0], 0, CHAN_X );
2285      FETCH( &r[1], 1, CHAN_X );
2286      micro_mul( &r[0], &r[0], &r[1] );
2287
2288      FETCH( &r[1], 0, CHAN_Y );
2289      FETCH( &r[2], 1, CHAN_Y );
2290      micro_mul( &r[1], &r[1], &r[2] );
2291      micro_add( &r[0], &r[0], &r[1] );
2292
2293      FETCH( &r[2], 2, CHAN_X );
2294      micro_add( &r[0], &r[0], &r[2] );
2295
2296      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2297         STORE( &r[0], 0, chan_index );
2298      }
2299      break;
2300
2301   case TGSI_OPCODE_FRC:
2302      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2303         FETCH( &r[0], 0, chan_index );
2304         micro_frc(&d[chan_index], &r[0]);
2305      }
2306      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2307         STORE(&d[chan_index], 0, chan_index);
2308      }
2309      break;
2310
2311   case TGSI_OPCODE_CLAMP:
2312      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2313         FETCH(&r[0], 0, chan_index);
2314         FETCH(&r[1], 1, chan_index);
2315         micro_max(&r[0], &r[0], &r[1]);
2316         FETCH(&r[1], 2, chan_index);
2317         micro_min(&d[chan_index], &r[0], &r[1]);
2318      }
2319      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2320         STORE(&d[chan_index], 0, chan_index);
2321      }
2322      break;
2323
2324   case TGSI_OPCODE_ROUND:
2325   case TGSI_OPCODE_ARR:
2326      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2327         FETCH( &r[0], 0, chan_index );
2328         micro_rnd(&d[chan_index], &r[0]);
2329      }
2330      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2331         STORE(&d[chan_index], 0, chan_index);
2332      }
2333      break;
2334
2335   case TGSI_OPCODE_EX2:
2336      FETCH(&r[0], 0, CHAN_X);
2337
2338      micro_exp2( &r[0], &r[0] );
2339
2340      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2341         STORE( &r[0], 0, chan_index );
2342      }
2343      break;
2344
2345   case TGSI_OPCODE_LG2:
2346      FETCH( &r[0], 0, CHAN_X );
2347      micro_lg2( &r[0], &r[0] );
2348      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2349         STORE( &r[0], 0, chan_index );
2350      }
2351      break;
2352
2353   case TGSI_OPCODE_POW:
2354      FETCH(&r[0], 0, CHAN_X);
2355      FETCH(&r[1], 1, CHAN_X);
2356
2357      micro_pow( &r[0], &r[0], &r[1] );
2358
2359      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2360         STORE( &r[0], 0, chan_index );
2361      }
2362      break;
2363
2364   case TGSI_OPCODE_XPD:
2365      FETCH(&r[0], 0, CHAN_Y);
2366      FETCH(&r[1], 1, CHAN_Z);
2367
2368      micro_mul( &r[2], &r[0], &r[1] );
2369
2370      FETCH(&r[3], 0, CHAN_Z);
2371      FETCH(&r[4], 1, CHAN_Y);
2372
2373      micro_mul( &r[5], &r[3], &r[4] );
2374      micro_sub(&d[CHAN_X], &r[2], &r[5]);
2375
2376      FETCH(&r[2], 1, CHAN_X);
2377
2378      micro_mul( &r[3], &r[3], &r[2] );
2379
2380      FETCH(&r[5], 0, CHAN_X);
2381
2382      micro_mul( &r[1], &r[1], &r[5] );
2383      micro_sub(&d[CHAN_Y], &r[3], &r[1]);
2384
2385      micro_mul( &r[5], &r[5], &r[4] );
2386      micro_mul( &r[0], &r[0], &r[2] );
2387      micro_sub(&d[CHAN_Z], &r[5], &r[0]);
2388
2389      if (IS_CHANNEL_ENABLED(*inst, CHAN_X)) {
2390         STORE(&d[CHAN_X], 0, CHAN_X);
2391      }
2392      if (IS_CHANNEL_ENABLED(*inst, CHAN_Y)) {
2393         STORE(&d[CHAN_Y], 0, CHAN_Y);
2394      }
2395      if (IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2396         STORE(&d[CHAN_Z], 0, CHAN_Z);
2397      }
2398      if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
2399         STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
2400      }
2401      break;
2402
2403    case TGSI_OPCODE_ABS:
2404       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2405          FETCH(&r[0], 0, chan_index);
2406          micro_abs(&d[chan_index], &r[0]);
2407       }
2408       FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2409         STORE(&d[chan_index], 0, chan_index);
2410      }
2411       break;
2412
2413   case TGSI_OPCODE_RCC:
2414      FETCH(&r[0], 0, CHAN_X);
2415      micro_div(&r[0], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &r[0]);
2416      micro_float_clamp(&r[0], &r[0]);
2417      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2418         STORE(&r[0], 0, chan_index);
2419      }
2420      break;
2421
2422   case TGSI_OPCODE_DPH:
2423      FETCH(&r[0], 0, CHAN_X);
2424      FETCH(&r[1], 1, CHAN_X);
2425
2426      micro_mul( &r[0], &r[0], &r[1] );
2427
2428      FETCH(&r[1], 0, CHAN_Y);
2429      FETCH(&r[2], 1, CHAN_Y);
2430
2431      micro_mul( &r[1], &r[1], &r[2] );
2432      micro_add( &r[0], &r[0], &r[1] );
2433
2434      FETCH(&r[1], 0, CHAN_Z);
2435      FETCH(&r[2], 1, CHAN_Z);
2436
2437      micro_mul( &r[1], &r[1], &r[2] );
2438      micro_add( &r[0], &r[0], &r[1] );
2439
2440      FETCH(&r[1], 1, CHAN_W);
2441
2442      micro_add( &r[0], &r[0], &r[1] );
2443
2444      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2445         STORE( &r[0], 0, chan_index );
2446      }
2447      break;
2448
2449   case TGSI_OPCODE_COS:
2450      FETCH(&r[0], 0, CHAN_X);
2451
2452      micro_cos( &r[0], &r[0] );
2453
2454      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2455         STORE( &r[0], 0, chan_index );
2456      }
2457      break;
2458
2459   case TGSI_OPCODE_DDX:
2460      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2461         FETCH( &r[0], 0, chan_index );
2462         micro_ddx(&d[chan_index], &r[0]);
2463      }
2464      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2465         STORE(&d[chan_index], 0, chan_index);
2466      }
2467      break;
2468
2469   case TGSI_OPCODE_DDY:
2470      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2471         FETCH( &r[0], 0, chan_index );
2472         micro_ddy(&d[chan_index], &r[0]);
2473      }
2474      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2475         STORE(&d[chan_index], 0, chan_index);
2476      }
2477      break;
2478
2479   case TGSI_OPCODE_KILP:
2480      exec_kilp (mach, inst);
2481      break;
2482
2483   case TGSI_OPCODE_KIL:
2484      exec_kil (mach, inst);
2485      break;
2486
2487   case TGSI_OPCODE_PK2H:
2488      assert (0);
2489      break;
2490
2491   case TGSI_OPCODE_PK2US:
2492      assert (0);
2493      break;
2494
2495   case TGSI_OPCODE_PK4B:
2496      assert (0);
2497      break;
2498
2499   case TGSI_OPCODE_PK4UB:
2500      assert (0);
2501      break;
2502
2503   case TGSI_OPCODE_RFL:
2504      if (IS_CHANNEL_ENABLED(*inst, CHAN_X) ||
2505          IS_CHANNEL_ENABLED(*inst, CHAN_Y) ||
2506          IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2507         /* r0 = dp3(src0, src0) */
2508         FETCH(&r[2], 0, CHAN_X);
2509         micro_mul(&r[0], &r[2], &r[2]);
2510         FETCH(&r[4], 0, CHAN_Y);
2511         micro_mul(&r[8], &r[4], &r[4]);
2512         micro_add(&r[0], &r[0], &r[8]);
2513         FETCH(&r[6], 0, CHAN_Z);
2514         micro_mul(&r[8], &r[6], &r[6]);
2515         micro_add(&r[0], &r[0], &r[8]);
2516
2517         /* r1 = dp3(src0, src1) */
2518         FETCH(&r[3], 1, CHAN_X);
2519         micro_mul(&r[1], &r[2], &r[3]);
2520         FETCH(&r[5], 1, CHAN_Y);
2521         micro_mul(&r[8], &r[4], &r[5]);
2522         micro_add(&r[1], &r[1], &r[8]);
2523         FETCH(&r[7], 1, CHAN_Z);
2524         micro_mul(&r[8], &r[6], &r[7]);
2525         micro_add(&r[1], &r[1], &r[8]);
2526
2527         /* r1 = 2 * r1 / r0 */
2528         micro_add(&r[1], &r[1], &r[1]);
2529         micro_div(&r[1], &r[1], &r[0]);
2530
2531         if (IS_CHANNEL_ENABLED(*inst, CHAN_X)) {
2532            micro_mul(&r[2], &r[2], &r[1]);
2533            micro_sub(&r[2], &r[2], &r[3]);
2534            STORE(&r[2], 0, CHAN_X);
2535         }
2536         if (IS_CHANNEL_ENABLED(*inst, CHAN_Y)) {
2537            micro_mul(&r[4], &r[4], &r[1]);
2538            micro_sub(&r[4], &r[4], &r[5]);
2539            STORE(&r[4], 0, CHAN_Y);
2540         }
2541         if (IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2542            micro_mul(&r[6], &r[6], &r[1]);
2543            micro_sub(&r[6], &r[6], &r[7]);
2544            STORE(&r[6], 0, CHAN_Z);
2545         }
2546      }
2547      if (IS_CHANNEL_ENABLED(*inst, CHAN_W)) {
2548         STORE(&mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W);
2549      }
2550      break;
2551
2552   case TGSI_OPCODE_SEQ:
2553      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2554         FETCH( &r[0], 0, chan_index );
2555         FETCH( &r[1], 1, chan_index );
2556         micro_eq(&d[chan_index], &r[0], &r[1], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C]);
2557      }
2558      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2559         STORE(&d[chan_index], 0, chan_index);
2560      }
2561      break;
2562
2563   case TGSI_OPCODE_SFL:
2564      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2565         STORE(&mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], 0, chan_index);
2566      }
2567      break;
2568
2569   case TGSI_OPCODE_SGT:
2570      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2571         FETCH( &r[0], 0, chan_index );
2572         FETCH( &r[1], 1, chan_index );
2573         micro_le(&d[chan_index], &r[0], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C]);
2574      }
2575      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2576         STORE(&d[chan_index], 0, chan_index);
2577      }
2578      break;
2579
2580   case TGSI_OPCODE_SIN:
2581      FETCH( &r[0], 0, CHAN_X );
2582      micro_sin( &r[0], &r[0] );
2583      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2584         STORE( &r[0], 0, chan_index );
2585      }
2586      break;
2587
2588   case TGSI_OPCODE_SLE:
2589      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2590         FETCH( &r[0], 0, chan_index );
2591         FETCH( &r[1], 1, chan_index );
2592         micro_le(&d[chan_index], &r[0], &r[1], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C]);
2593      }
2594      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2595         STORE(&d[chan_index], 0, chan_index);
2596      }
2597      break;
2598
2599   case TGSI_OPCODE_SNE:
2600      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2601         FETCH( &r[0], 0, chan_index );
2602         FETCH( &r[1], 1, chan_index );
2603         micro_eq(&d[chan_index], &r[0], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C]);
2604      }
2605      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2606         STORE(&d[chan_index], 0, chan_index);
2607      }
2608      break;
2609
2610   case TGSI_OPCODE_STR:
2611      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2612         STORE(&mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, chan_index);
2613      }
2614      break;
2615
2616   case TGSI_OPCODE_TEX:
2617      /* simple texture lookup */
2618      /* src[0] = texcoord */
2619      /* src[1] = sampler unit */
2620      exec_tex(mach, inst, FALSE, FALSE);
2621      break;
2622
2623   case TGSI_OPCODE_TXB:
2624      /* Texture lookup with lod bias */
2625      /* src[0] = texcoord (src[0].w = LOD bias) */
2626      /* src[1] = sampler unit */
2627      exec_tex(mach, inst, TRUE, FALSE);
2628      break;
2629
2630   case TGSI_OPCODE_TXD:
2631      /* Texture lookup with explicit partial derivatives */
2632      /* src[0] = texcoord */
2633      /* src[1] = d[strq]/dx */
2634      /* src[2] = d[strq]/dy */
2635      /* src[3] = sampler unit */
2636      exec_txd(mach, inst);
2637      break;
2638
2639   case TGSI_OPCODE_TXL:
2640      /* Texture lookup with explicit LOD */
2641      /* src[0] = texcoord (src[0].w = LOD) */
2642      /* src[1] = sampler unit */
2643      exec_tex(mach, inst, TRUE, FALSE);
2644      break;
2645
2646   case TGSI_OPCODE_TXP:
2647      /* Texture lookup with projection */
2648      /* src[0] = texcoord (src[0].w = projection) */
2649      /* src[1] = sampler unit */
2650      exec_tex(mach, inst, FALSE, TRUE);
2651      break;
2652
2653   case TGSI_OPCODE_UP2H:
2654      assert (0);
2655      break;
2656
2657   case TGSI_OPCODE_UP2US:
2658      assert (0);
2659      break;
2660
2661   case TGSI_OPCODE_UP4B:
2662      assert (0);
2663      break;
2664
2665   case TGSI_OPCODE_UP4UB:
2666      assert (0);
2667      break;
2668
2669   case TGSI_OPCODE_X2D:
2670      FETCH(&r[0], 1, CHAN_X);
2671      FETCH(&r[1], 1, CHAN_Y);
2672      if (IS_CHANNEL_ENABLED(*inst, CHAN_X) ||
2673          IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2674         FETCH(&r[2], 2, CHAN_X);
2675         micro_mul(&r[2], &r[2], &r[0]);
2676         FETCH(&r[3], 2, CHAN_Y);
2677         micro_mul(&r[3], &r[3], &r[1]);
2678         micro_add(&r[2], &r[2], &r[3]);
2679         FETCH(&r[3], 0, CHAN_X);
2680         micro_add(&d[CHAN_X], &r[2], &r[3]);
2681
2682      }
2683      if (IS_CHANNEL_ENABLED(*inst, CHAN_Y) ||
2684          IS_CHANNEL_ENABLED(*inst, CHAN_W)) {
2685         FETCH(&r[2], 2, CHAN_Z);
2686         micro_mul(&r[2], &r[2], &r[0]);
2687         FETCH(&r[3], 2, CHAN_W);
2688         micro_mul(&r[3], &r[3], &r[1]);
2689         micro_add(&r[2], &r[2], &r[3]);
2690         FETCH(&r[3], 0, CHAN_Y);
2691         micro_add(&d[CHAN_Y], &r[2], &r[3]);
2692
2693      }
2694      if (IS_CHANNEL_ENABLED(*inst, CHAN_X)) {
2695         STORE(&d[CHAN_X], 0, CHAN_X);
2696      }
2697      if (IS_CHANNEL_ENABLED(*inst, CHAN_Y)) {
2698         STORE(&d[CHAN_Y], 0, CHAN_Y);
2699      }
2700      if (IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2701         STORE(&d[CHAN_X], 0, CHAN_Z);
2702      }
2703      if (IS_CHANNEL_ENABLED(*inst, CHAN_W)) {
2704         STORE(&d[CHAN_Y], 0, CHAN_W);
2705      }
2706      break;
2707
2708   case TGSI_OPCODE_ARA:
2709      assert (0);
2710      break;
2711
2712   case TGSI_OPCODE_BRA:
2713      assert (0);
2714      break;
2715
2716   case TGSI_OPCODE_CAL:
2717      /* skip the call if no execution channels are enabled */
2718      if (mach->ExecMask) {
2719         /* do the call */
2720
2721         /* First, record the depths of the execution stacks.
2722          * This is important for deeply nested/looped return statements.
2723          * We have to unwind the stacks by the correct amount.  For a
2724          * real code generator, we could determine the number of entries
2725          * to pop off each stack with simple static analysis and avoid
2726          * implementing this data structure at run time.
2727          */
2728         mach->CallStack[mach->CallStackTop].CondStackTop = mach->CondStackTop;
2729         mach->CallStack[mach->CallStackTop].LoopStackTop = mach->LoopStackTop;
2730         mach->CallStack[mach->CallStackTop].ContStackTop = mach->ContStackTop;
2731         /* note that PC was already incremented above */
2732         mach->CallStack[mach->CallStackTop].ReturnAddr = *pc;
2733
2734         mach->CallStackTop++;
2735
2736         /* Second, push the Cond, Loop, Cont, Func stacks */
2737         assert(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING);
2738         mach->CondStack[mach->CondStackTop++] = mach->CondMask;
2739         assert(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
2740         mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask;
2741         assert(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
2742         mach->ContStack[mach->ContStackTop++] = mach->ContMask;
2743         assert(mach->FuncStackTop < TGSI_EXEC_MAX_CALL_NESTING);
2744         mach->FuncStack[mach->FuncStackTop++] = mach->FuncMask;
2745
2746         /* Finally, jump to the subroutine */
2747         *pc = inst->InstructionExtLabel.Label;
2748      }
2749      break;
2750
2751   case TGSI_OPCODE_RET:
2752      mach->FuncMask &= ~mach->ExecMask;
2753      UPDATE_EXEC_MASK(mach);
2754
2755      if (mach->FuncMask == 0x0) {
2756         /* really return now (otherwise, keep executing) */
2757
2758         if (mach->CallStackTop == 0) {
2759            /* returning from main() */
2760            *pc = -1;
2761            return;
2762         }
2763
2764         assert(mach->CallStackTop > 0);
2765         mach->CallStackTop--;
2766
2767         mach->CondStackTop = mach->CallStack[mach->CallStackTop].CondStackTop;
2768         mach->CondMask = mach->CondStack[mach->CondStackTop];
2769
2770         mach->LoopStackTop = mach->CallStack[mach->CallStackTop].LoopStackTop;
2771         mach->LoopMask = mach->LoopStack[mach->LoopStackTop];
2772
2773         mach->ContStackTop = mach->CallStack[mach->CallStackTop].ContStackTop;
2774         mach->ContMask = mach->ContStack[mach->ContStackTop];
2775
2776         assert(mach->FuncStackTop > 0);
2777         mach->FuncMask = mach->FuncStack[--mach->FuncStackTop];
2778
2779         *pc = mach->CallStack[mach->CallStackTop].ReturnAddr;
2780
2781         UPDATE_EXEC_MASK(mach);
2782      }
2783      break;
2784
2785   case TGSI_OPCODE_SSG:
2786   /* TGSI_OPCODE_SGN */
2787      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2788         FETCH( &r[0], 0, chan_index );
2789         micro_sgn(&d[chan_index], &r[0]);
2790      }
2791      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2792         STORE(&d[chan_index], 0, chan_index);
2793      }
2794      break;
2795
2796   case TGSI_OPCODE_CMP:
2797      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2798         FETCH(&r[0], 0, chan_index);
2799         FETCH(&r[1], 1, chan_index);
2800         FETCH(&r[2], 2, chan_index);
2801         micro_lt(&d[chan_index], &r[0], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &r[1], &r[2]);
2802      }
2803      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2804         STORE(&d[chan_index], 0, chan_index);
2805      }
2806      break;
2807
2808   case TGSI_OPCODE_SCS:
2809      if( IS_CHANNEL_ENABLED( *inst, CHAN_X ) || IS_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
2810         FETCH( &r[0], 0, CHAN_X );
2811         if (IS_CHANNEL_ENABLED(*inst, CHAN_X)) {
2812            micro_cos(&r[1], &r[0]);
2813            STORE(&r[1], 0, CHAN_X);
2814         }
2815         if (IS_CHANNEL_ENABLED(*inst, CHAN_Y)) {
2816            micro_sin(&r[1], &r[0]);
2817            STORE(&r[1], 0, CHAN_Y);
2818         }
2819      }
2820      if( IS_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
2821         STORE( &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], 0, CHAN_Z );
2822      }
2823      if( IS_CHANNEL_ENABLED( *inst, CHAN_W ) ) {
2824         STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
2825      }
2826      break;
2827
2828   case TGSI_OPCODE_NRM:
2829      /* 3-component vector normalize */
2830      if(IS_CHANNEL_ENABLED(*inst, CHAN_X) ||
2831         IS_CHANNEL_ENABLED(*inst, CHAN_Y) ||
2832         IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2833         /* r3 = sqrt(dp3(src0, src0)) */
2834         FETCH(&r[0], 0, CHAN_X);
2835         micro_mul(&r[3], &r[0], &r[0]);
2836         FETCH(&r[1], 0, CHAN_Y);
2837         micro_mul(&r[4], &r[1], &r[1]);
2838         micro_add(&r[3], &r[3], &r[4]);
2839         FETCH(&r[2], 0, CHAN_Z);
2840         micro_mul(&r[4], &r[2], &r[2]);
2841         micro_add(&r[3], &r[3], &r[4]);
2842         micro_sqrt(&r[3], &r[3]);
2843
2844         if (IS_CHANNEL_ENABLED(*inst, CHAN_X)) {
2845            micro_div(&r[0], &r[0], &r[3]);
2846            STORE(&r[0], 0, CHAN_X);
2847         }
2848         if (IS_CHANNEL_ENABLED(*inst, CHAN_Y)) {
2849            micro_div(&r[1], &r[1], &r[3]);
2850            STORE(&r[1], 0, CHAN_Y);
2851         }
2852         if (IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2853            micro_div(&r[2], &r[2], &r[3]);
2854            STORE(&r[2], 0, CHAN_Z);
2855         }
2856      }
2857      if (IS_CHANNEL_ENABLED(*inst, CHAN_W)) {
2858         STORE(&mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W);
2859      }
2860      break;
2861
2862   case TGSI_OPCODE_NRM4:
2863      /* 4-component vector normalize */
2864      {
2865         union tgsi_exec_channel tmp, dot;
2866
2867         /* tmp = dp4(src0, src0): */
2868         FETCH( &r[0], 0, CHAN_X );
2869         micro_mul( &tmp, &r[0], &r[0] );
2870
2871         FETCH( &r[1], 0, CHAN_Y );
2872         micro_mul( &dot, &r[1], &r[1] );
2873         micro_add( &tmp, &tmp, &dot );
2874
2875         FETCH( &r[2], 0, CHAN_Z );
2876         micro_mul( &dot, &r[2], &r[2] );
2877         micro_add( &tmp, &tmp, &dot );
2878
2879         FETCH( &r[3], 0, CHAN_W );
2880         micro_mul( &dot, &r[3], &r[3] );
2881         micro_add( &tmp, &tmp, &dot );
2882
2883         /* tmp = 1 / sqrt(tmp) */
2884         micro_sqrt( &tmp, &tmp );
2885         micro_div( &tmp, &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &tmp );
2886
2887         FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2888            /* chan = chan * tmp */
2889            micro_mul( &r[chan_index], &tmp, &r[chan_index] );
2890            STORE( &r[chan_index], 0, chan_index );
2891         }
2892      }
2893      break;
2894
2895   case TGSI_OPCODE_DIV:
2896      assert( 0 );
2897      break;
2898
2899   case TGSI_OPCODE_DP2:
2900      FETCH( &r[0], 0, CHAN_X );
2901      FETCH( &r[1], 1, CHAN_X );
2902      micro_mul( &r[0], &r[0], &r[1] );
2903
2904      FETCH( &r[1], 0, CHAN_Y );
2905      FETCH( &r[2], 1, CHAN_Y );
2906      micro_mul( &r[1], &r[1], &r[2] );
2907      micro_add( &r[0], &r[0], &r[1] );
2908
2909      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2910         STORE( &r[0], 0, chan_index );
2911      }
2912      break;
2913
2914   case TGSI_OPCODE_IF:
2915      /* push CondMask */
2916      assert(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING);
2917      mach->CondStack[mach->CondStackTop++] = mach->CondMask;
2918      FETCH( &r[0], 0, CHAN_X );
2919      /* update CondMask */
2920      if( ! r[0].u[0] ) {
2921         mach->CondMask &= ~0x1;
2922      }
2923      if( ! r[0].u[1] ) {
2924         mach->CondMask &= ~0x2;
2925      }
2926      if( ! r[0].u[2] ) {
2927         mach->CondMask &= ~0x4;
2928      }
2929      if( ! r[0].u[3] ) {
2930         mach->CondMask &= ~0x8;
2931      }
2932      UPDATE_EXEC_MASK(mach);
2933      /* Todo: If CondMask==0, jump to ELSE */
2934      break;
2935
2936   case TGSI_OPCODE_ELSE:
2937      /* invert CondMask wrt previous mask */
2938      {
2939         uint prevMask;
2940         assert(mach->CondStackTop > 0);
2941         prevMask = mach->CondStack[mach->CondStackTop - 1];
2942         mach->CondMask = ~mach->CondMask & prevMask;
2943         UPDATE_EXEC_MASK(mach);
2944         /* Todo: If CondMask==0, jump to ENDIF */
2945      }
2946      break;
2947
2948   case TGSI_OPCODE_ENDIF:
2949      /* pop CondMask */
2950      assert(mach->CondStackTop > 0);
2951      mach->CondMask = mach->CondStack[--mach->CondStackTop];
2952      UPDATE_EXEC_MASK(mach);
2953      break;
2954
2955   case TGSI_OPCODE_END:
2956      /* halt execution */
2957      *pc = -1;
2958      break;
2959
2960   case TGSI_OPCODE_REP:
2961      assert (0);
2962      break;
2963
2964   case TGSI_OPCODE_ENDREP:
2965       assert (0);
2966       break;
2967
2968   case TGSI_OPCODE_PUSHA:
2969      assert (0);
2970      break;
2971
2972   case TGSI_OPCODE_POPA:
2973      assert (0);
2974      break;
2975
2976   case TGSI_OPCODE_CEIL:
2977      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2978         FETCH( &r[0], 0, chan_index );
2979         micro_ceil(&d[chan_index], &r[0]);
2980      }
2981      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2982         STORE(&d[chan_index], 0, chan_index);
2983      }
2984      break;
2985
2986   case TGSI_OPCODE_I2F:
2987      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2988         FETCH( &r[0], 0, chan_index );
2989         micro_i2f(&d[chan_index], &r[0]);
2990      }
2991      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2992         STORE(&d[chan_index], 0, chan_index);
2993      }
2994      break;
2995
2996   case TGSI_OPCODE_NOT:
2997      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2998         FETCH( &r[0], 0, chan_index );
2999         micro_not(&d[chan_index], &r[0]);
3000      }
3001      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
3002         STORE(&d[chan_index], 0, chan_index);
3003      }
3004      break;
3005
3006   case TGSI_OPCODE_TRUNC:
3007      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
3008         FETCH( &r[0], 0, chan_index );
3009         micro_trunc(&d[chan_index], &r[0]);
3010      }
3011      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
3012         STORE(&d[chan_index], 0, chan_index);
3013      }
3014      break;
3015
3016   case TGSI_OPCODE_SHL:
3017      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
3018         FETCH( &r[0], 0, chan_index );
3019         FETCH( &r[1], 1, chan_index );
3020         micro_shl(&d[chan_index], &r[0], &r[1]);
3021      }
3022      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
3023         STORE(&d[chan_index], 0, chan_index);
3024      }
3025      break;
3026
3027   case TGSI_OPCODE_SHR:
3028      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
3029         FETCH( &r[0], 0, chan_index );
3030         FETCH( &r[1], 1, chan_index );
3031         micro_ishr(&d[chan_index], &r[0], &r[1]);
3032      }
3033      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
3034         STORE(&d[chan_index], 0, chan_index);
3035      }
3036      break;
3037
3038   case TGSI_OPCODE_AND:
3039      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
3040         FETCH( &r[0], 0, chan_index );
3041         FETCH( &r[1], 1, chan_index );
3042         micro_and(&d[chan_index], &r[0], &r[1]);
3043      }
3044      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
3045         STORE(&d[chan_index], 0, chan_index);
3046      }
3047      break;
3048
3049   case TGSI_OPCODE_OR:
3050      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
3051         FETCH( &r[0], 0, chan_index );
3052         FETCH( &r[1], 1, chan_index );
3053         micro_or(&d[chan_index], &r[0], &r[1]);
3054      }
3055      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
3056         STORE(&d[chan_index], 0, chan_index);
3057      }
3058      break;
3059
3060   case TGSI_OPCODE_MOD:
3061      assert (0);
3062      break;
3063
3064   case TGSI_OPCODE_XOR:
3065      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
3066         FETCH( &r[0], 0, chan_index );
3067         FETCH( &r[1], 1, chan_index );
3068         micro_xor(&d[chan_index], &r[0], &r[1]);
3069      }
3070      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
3071         STORE(&d[chan_index], 0, chan_index);
3072      }
3073      break;
3074
3075   case TGSI_OPCODE_SAD:
3076      assert (0);
3077      break;
3078
3079   case TGSI_OPCODE_TXF:
3080      assert (0);
3081      break;
3082
3083   case TGSI_OPCODE_TXQ:
3084      assert (0);
3085      break;
3086
3087   case TGSI_OPCODE_EMIT:
3088      mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] += 16;
3089      mach->Primitives[mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]]++;
3090      break;
3091
3092   case TGSI_OPCODE_ENDPRIM:
3093      mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]++;
3094      mach->Primitives[mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]] = 0;
3095      break;
3096
3097   case TGSI_OPCODE_BGNFOR:
3098      assert(mach->LoopCounterStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
3099      for (chan_index = 0; chan_index < 3; chan_index++) {
3100         FETCH( &mach->LoopCounterStack[mach->LoopCounterStackTop].xyzw[chan_index], 0, chan_index );
3101      }
3102      ++mach->LoopCounterStackTop;
3103      STORE(&mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_X], 0, CHAN_X);
3104      /* update LoopMask */
3105      if (mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y].f[0] <= 0.0f) {
3106         mach->LoopMask &= ~0x1;
3107      }
3108      if (mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y].f[1] <= 0.0f) {
3109         mach->LoopMask &= ~0x2;
3110      }
3111      if (mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y].f[2] <= 0.0f) {
3112         mach->LoopMask &= ~0x4;
3113      }
3114      if (mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y].f[3] <= 0.0f) {
3115         mach->LoopMask &= ~0x8;
3116      }
3117      /* TODO: if mach->LoopMask == 0, jump to end of loop */
3118      UPDATE_EXEC_MASK(mach);
3119      /* fall-through (for now) */
3120   case TGSI_OPCODE_BGNLOOP:
3121      /* push LoopMask and ContMasks */
3122      assert(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
3123      mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask;
3124      assert(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
3125      mach->ContStack[mach->ContStackTop++] = mach->ContMask;
3126      assert(mach->LoopLabelStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
3127      mach->LoopLabelStack[mach->LoopLabelStackTop++] = *pc - 1;
3128      break;
3129
3130   case TGSI_OPCODE_ENDFOR:
3131      assert(mach->LoopCounterStackTop > 0);
3132      micro_sub(&mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y],
3133                &mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y],
3134                &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C]);
3135      /* update LoopMask */
3136      if (mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y].f[0] <= 0.0f) {
3137         mach->LoopMask &= ~0x1;
3138      }
3139      if (mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y].f[1] <= 0.0f) {
3140         mach->LoopMask &= ~0x2;
3141      }
3142      if (mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y].f[2] <= 0.0f) {
3143         mach->LoopMask &= ~0x4;
3144      }
3145      if (mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y].f[3] <= 0.0f) {
3146         mach->LoopMask &= ~0x8;
3147      }
3148      micro_add(&mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_X],
3149                &mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_X],
3150                &mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Z]);
3151      assert(mach->LoopLabelStackTop > 0);
3152      inst = mach->Instructions + mach->LoopLabelStack[mach->LoopLabelStackTop - 1];
3153      STORE(&mach->LoopCounterStack[mach->LoopCounterStackTop].xyzw[CHAN_X], 0, CHAN_X);
3154      /* Restore ContMask, but don't pop */
3155      assert(mach->ContStackTop > 0);
3156      mach->ContMask = mach->ContStack[mach->ContStackTop - 1];
3157      UPDATE_EXEC_MASK(mach);
3158      if (mach->ExecMask) {
3159         /* repeat loop: jump to instruction just past BGNLOOP */
3160         assert(mach->LoopLabelStackTop > 0);
3161         *pc = mach->LoopLabelStack[mach->LoopLabelStackTop - 1] + 1;
3162      }
3163      else {
3164         /* exit loop: pop LoopMask */
3165         assert(mach->LoopStackTop > 0);
3166         mach->LoopMask = mach->LoopStack[--mach->LoopStackTop];
3167         /* pop ContMask */
3168         assert(mach->ContStackTop > 0);
3169         mach->ContMask = mach->ContStack[--mach->ContStackTop];
3170         assert(mach->LoopLabelStackTop > 0);
3171         --mach->LoopLabelStackTop;
3172         assert(mach->LoopCounterStackTop > 0);
3173         --mach->LoopCounterStackTop;
3174      }
3175      UPDATE_EXEC_MASK(mach);
3176      break;
3177
3178   case TGSI_OPCODE_ENDLOOP:
3179      /* Restore ContMask, but don't pop */
3180      assert(mach->ContStackTop > 0);
3181      mach->ContMask = mach->ContStack[mach->ContStackTop - 1];
3182      UPDATE_EXEC_MASK(mach);
3183      if (mach->ExecMask) {
3184         /* repeat loop: jump to instruction just past BGNLOOP */
3185         assert(mach->LoopLabelStackTop > 0);
3186         *pc = mach->LoopLabelStack[mach->LoopLabelStackTop - 1] + 1;
3187      }
3188      else {
3189         /* exit loop: pop LoopMask */
3190         assert(mach->LoopStackTop > 0);
3191         mach->LoopMask = mach->LoopStack[--mach->LoopStackTop];
3192         /* pop ContMask */
3193         assert(mach->ContStackTop > 0);
3194         mach->ContMask = mach->ContStack[--mach->ContStackTop];
3195         assert(mach->LoopLabelStackTop > 0);
3196         --mach->LoopLabelStackTop;
3197      }
3198      UPDATE_EXEC_MASK(mach);
3199      break;
3200
3201   case TGSI_OPCODE_BRK:
3202      /* turn off loop channels for each enabled exec channel */
3203      mach->LoopMask &= ~mach->ExecMask;
3204      /* Todo: if mach->LoopMask == 0, jump to end of loop */
3205      UPDATE_EXEC_MASK(mach);
3206      break;
3207
3208   case TGSI_OPCODE_CONT:
3209      /* turn off cont channels for each enabled exec channel */
3210      mach->ContMask &= ~mach->ExecMask;
3211      /* Todo: if mach->LoopMask == 0, jump to end of loop */
3212      UPDATE_EXEC_MASK(mach);
3213      break;
3214
3215   case TGSI_OPCODE_BGNSUB:
3216      /* no-op */
3217      break;
3218
3219   case TGSI_OPCODE_ENDSUB:
3220      /*
3221       * XXX: This really should be a no-op. We should never reach this opcode.
3222       */
3223
3224      assert(mach->CallStackTop > 0);
3225      mach->CallStackTop--;
3226
3227      mach->CondStackTop = mach->CallStack[mach->CallStackTop].CondStackTop;
3228      mach->CondMask = mach->CondStack[mach->CondStackTop];
3229
3230      mach->LoopStackTop = mach->CallStack[mach->CallStackTop].LoopStackTop;
3231      mach->LoopMask = mach->LoopStack[mach->LoopStackTop];
3232
3233      mach->ContStackTop = mach->CallStack[mach->CallStackTop].ContStackTop;
3234      mach->ContMask = mach->ContStack[mach->ContStackTop];
3235
3236      assert(mach->FuncStackTop > 0);
3237      mach->FuncMask = mach->FuncStack[--mach->FuncStackTop];
3238
3239      *pc = mach->CallStack[mach->CallStackTop].ReturnAddr;
3240
3241      UPDATE_EXEC_MASK(mach);
3242      break;
3243
3244   case TGSI_OPCODE_NOP:
3245      break;
3246
3247   default:
3248      assert( 0 );
3249   }
3250}
3251
3252#define DEBUG_EXECUTION 0
3253
3254
3255/**
3256 * Run TGSI interpreter.
3257 * \return bitmask of "alive" quad components
3258 */
3259uint
3260tgsi_exec_machine_run( struct tgsi_exec_machine *mach )
3261{
3262   uint i;
3263   int pc = 0;
3264
3265   mach->CondMask = 0xf;
3266   mach->LoopMask = 0xf;
3267   mach->ContMask = 0xf;
3268   mach->FuncMask = 0xf;
3269   mach->ExecMask = 0xf;
3270
3271   assert(mach->CondStackTop == 0);
3272   assert(mach->LoopStackTop == 0);
3273   assert(mach->ContStackTop == 0);
3274   assert(mach->CallStackTop == 0);
3275
3276   mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] = 0;
3277   mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] = 0;
3278
3279   if( mach->Processor == TGSI_PROCESSOR_GEOMETRY ) {
3280      mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0] = 0;
3281      mach->Primitives[0] = 0;
3282   }
3283
3284   for (i = 0; i < QUAD_SIZE; i++) {
3285      mach->Temps[TEMP_CC_I].xyzw[TEMP_CC_C].u[i] =
3286         (TGSI_EXEC_CC_EQ << TGSI_EXEC_CC_X_SHIFT) |
3287         (TGSI_EXEC_CC_EQ << TGSI_EXEC_CC_Y_SHIFT) |
3288         (TGSI_EXEC_CC_EQ << TGSI_EXEC_CC_Z_SHIFT) |
3289         (TGSI_EXEC_CC_EQ << TGSI_EXEC_CC_W_SHIFT);
3290   }
3291
3292   /* execute declarations (interpolants) */
3293   for (i = 0; i < mach->NumDeclarations; i++) {
3294      exec_declaration( mach, mach->Declarations+i );
3295   }
3296
3297   {
3298#if DEBUG_EXECUTION
3299      struct tgsi_exec_vector temps[TGSI_EXEC_NUM_TEMPS + TGSI_EXEC_NUM_TEMP_EXTRAS];
3300      struct tgsi_exec_vector outputs[PIPE_MAX_ATTRIBS];
3301      uint inst = 1;
3302
3303      memcpy(temps, mach->Temps, sizeof(temps));
3304      memcpy(outputs, mach->Outputs, sizeof(outputs));
3305#endif
3306
3307      /* execute instructions, until pc is set to -1 */
3308      while (pc != -1) {
3309
3310#if DEBUG_EXECUTION
3311         uint i;
3312
3313         tgsi_dump_instruction(&mach->Instructions[pc], inst++);
3314#endif
3315
3316         assert(pc < (int) mach->NumInstructions);
3317         exec_instruction(mach, mach->Instructions + pc, &pc);
3318
3319#if DEBUG_EXECUTION
3320         for (i = 0; i < TGSI_EXEC_NUM_TEMPS + TGSI_EXEC_NUM_TEMP_EXTRAS; i++) {
3321            if (memcmp(&temps[i], &mach->Temps[i], sizeof(temps[i]))) {
3322               uint j;
3323
3324               memcpy(&temps[i], &mach->Temps[i], sizeof(temps[i]));
3325               debug_printf("TEMP[%2u] = ", i);
3326               for (j = 0; j < 4; j++) {
3327                  if (j > 0) {
3328                     debug_printf("           ");
3329                  }
3330                  debug_printf("(%6f, %6f, %6f, %6f)\n",
3331                               temps[i].xyzw[0].f[j],
3332                               temps[i].xyzw[1].f[j],
3333                               temps[i].xyzw[2].f[j],
3334                               temps[i].xyzw[3].f[j]);
3335               }
3336            }
3337         }
3338         for (i = 0; i < PIPE_MAX_ATTRIBS; i++) {
3339            if (memcmp(&outputs[i], &mach->Outputs[i], sizeof(outputs[i]))) {
3340               uint j;
3341
3342               memcpy(&outputs[i], &mach->Outputs[i], sizeof(outputs[i]));
3343               debug_printf("OUT[%2u] =  ", i);
3344               for (j = 0; j < 4; j++) {
3345                  if (j > 0) {
3346                     debug_printf("           ");
3347                  }
3348                  debug_printf("{%6f, %6f, %6f, %6f}\n",
3349                               outputs[i].xyzw[0].f[j],
3350                               outputs[i].xyzw[1].f[j],
3351                               outputs[i].xyzw[2].f[j],
3352                               outputs[i].xyzw[3].f[j]);
3353               }
3354            }
3355         }
3356#endif
3357      }
3358   }
3359
3360#if 0
3361   /* we scale from floats in [0,1] to Zbuffer ints in sp_quad_depth_test.c */
3362   if (mach->Processor == TGSI_PROCESSOR_FRAGMENT) {
3363      /*
3364       * Scale back depth component.
3365       */
3366      for (i = 0; i < 4; i++)
3367         mach->Outputs[0].xyzw[2].f[i] *= ctx->DrawBuffer->_DepthMaxF;
3368   }
3369#endif
3370
3371   assert(mach->CondStackTop == 0);
3372   assert(mach->LoopStackTop == 0);
3373   assert(mach->ContStackTop == 0);
3374   assert(mach->CallStackTop == 0);
3375
3376   return ~mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
3377}
3378