tgsi_exec.c revision b9cb74c7f826dfd320f5e5b54aa933898f7ddd3d
1/**************************************************************************
2 *
3 * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28/**
29 * TGSI interpreter/executor.
30 *
31 * Flow control information:
32 *
33 * Since we operate on 'quads' (4 pixels or 4 vertices in parallel)
34 * flow control statements (IF/ELSE/ENDIF, LOOP/ENDLOOP) require special
35 * care since a condition may be true for some quad components but false
36 * for other components.
37 *
38 * We basically execute all statements (even if they're in the part of
39 * an IF/ELSE clause that's "not taken") and use a special mask to
40 * control writing to destination registers.  This is the ExecMask.
41 * See store_dest().
42 *
 * The ExecMask is computed from four other masks: CondMask, LoopMask,
 * ContMask and FuncMask (see UPDATE_EXEC_MASK).  The first three are
 * controlled by the flow control instructions (namely IF/ELSE/ENDIF,
 * LOOP/ENDLOOP and CONT).
46 *
47 *
48 * Authors:
49 *   Michal Krol
50 *   Brian Paul
51 */
52
53#include "pipe/p_compiler.h"
54#include "pipe/p_state.h"
55#include "pipe/p_shader_tokens.h"
56#include "tgsi/tgsi_dump.h"
57#include "tgsi/tgsi_parse.h"
58#include "tgsi/tgsi_util.h"
59#include "tgsi_exec.h"
60#include "util/u_memory.h"
61#include "util/u_math.h"
62
/**
 * When non-zero, micro_exp2/micro_lg2/micro_pow use the util_fast_*()
 * helpers; otherwise they use the C library (powf/logf).
 */
#define FAST_MATH            1

/** for tgsi_full_instruction::Flags */
#define SOA_DEPENDENCY_FLAG  0x1

/* Element indices within a channel, named by tile position (used by DDX/DDY) */
#define TILE_TOP_LEFT        0
#define TILE_TOP_RIGHT       1
#define TILE_BOTTOM_LEFT     2
#define TILE_BOTTOM_RIGHT    3

/* Register channel indices */
#define CHAN_X               0
#define CHAN_Y               1
#define CHAN_Z               2
#define CHAN_W               3
77
/*
 * Shorthand locations of various utility registers (_I = Index, _C = Channel)
 */

/* Bit-pattern constants; tgsi_exec_machine_create() fills these in. */
#define TEMP_0_I           TGSI_EXEC_TEMP_00000000_I
#define TEMP_0_C           TGSI_EXEC_TEMP_00000000_C
#define TEMP_7F_I          TGSI_EXEC_TEMP_7FFFFFFF_I
#define TEMP_7F_C          TGSI_EXEC_TEMP_7FFFFFFF_C
#define TEMP_80_I          TGSI_EXEC_TEMP_80000000_I
#define TEMP_80_C          TGSI_EXEC_TEMP_80000000_C
#define TEMP_FF_I          TGSI_EXEC_TEMP_FFFFFFFF_I
#define TEMP_FF_C          TGSI_EXEC_TEMP_FFFFFFFF_C

/* Float constants; tgsi_exec_machine_create() fills these in. */
#define TEMP_1_I           TGSI_EXEC_TEMP_ONE_I
#define TEMP_1_C           TGSI_EXEC_TEMP_ONE_C
#define TEMP_2_I           TGSI_EXEC_TEMP_TWO_I
#define TEMP_2_C           TGSI_EXEC_TEMP_TWO_C
#define TEMP_3_I           TGSI_EXEC_TEMP_THREE_I
#define TEMP_3_C           TGSI_EXEC_TEMP_THREE_C
#define TEMP_HALF_I        TGSI_EXEC_TEMP_HALF_I
#define TEMP_HALF_C        TGSI_EXEC_TEMP_HALF_C
#define TEMP_128_I         TGSI_EXEC_TEMP_128_I
#define TEMP_128_C         TGSI_EXEC_TEMP_128_C
#define TEMP_M128_I        TGSI_EXEC_TEMP_MINUS_128_I
#define TEMP_M128_C        TGSI_EXEC_TEMP_MINUS_128_C

/* Remaining utility registers (not initialised in the code visible here) */
#define TEMP_KILMASK_I     TGSI_EXEC_TEMP_KILMASK_I
#define TEMP_KILMASK_C     TGSI_EXEC_TEMP_KILMASK_C
#define TEMP_OUTPUT_I      TGSI_EXEC_TEMP_OUTPUT_I
#define TEMP_OUTPUT_C      TGSI_EXEC_TEMP_OUTPUT_C
#define TEMP_PRIMITIVE_I   TGSI_EXEC_TEMP_PRIMITIVE_I
#define TEMP_PRIMITIVE_C   TGSI_EXEC_TEMP_PRIMITIVE_C
#define TEMP_CC_I          TGSI_EXEC_TEMP_CC_I
#define TEMP_CC_C          TGSI_EXEC_TEMP_CC_C
#define TEMP_R0            TGSI_EXEC_TEMP_R0
110
/**
 * Non-zero if channel CHAN is set in the write mask of INST's first
 * destination register.  INST is an instruction object (not a pointer).
 */
#define IS_CHANNEL_ENABLED(INST, CHAN)\
   ((INST).FullDstRegisters[0].DstRegister.WriteMask & (1 << (CHAN)))

/** Same as IS_CHANNEL_ENABLED, but for the second destination register. */
#define IS_CHANNEL_ENABLED2(INST, CHAN)\
   ((INST).FullDstRegisters[1].DstRegister.WriteMask & (1 << (CHAN)))

/**
 * Loop header: runs the following statement once for every channel that is
 * enabled in the first destination register's write mask.
 * Note: this expands to an unbraced for + if, so an 'else' placed right
 * after the loop body would bind to the hidden 'if'.
 */
#define FOR_EACH_ENABLED_CHANNEL(INST, CHAN)\
   for ((CHAN) = 0; (CHAN) < NUM_CHANNELS; (CHAN)++)\
      if (IS_CHANNEL_ENABLED( INST, CHAN ))

/** Same as FOR_EACH_ENABLED_CHANNEL, but for the second destination register. */
#define FOR_EACH_ENABLED_CHANNEL2(INST, CHAN)\
   for ((CHAN) = 0; (CHAN) < NUM_CHANNELS; (CHAN)++)\
      if (IS_CHANNEL_ENABLED2( INST, CHAN ))


/**
 * The execution mask is the AND of the conditional, loop, continue and
 * function masks.  MACH is a pointer expression; it is evaluated several
 * times, so it must not have side effects.
 */
#define UPDATE_EXEC_MASK(MACH) \
      ((MACH)->ExecMask = (MACH)->CondMask & (MACH)->LoopMask & \
                          (MACH)->ContMask & (MACH)->FuncMask)
129
130
131static const union tgsi_exec_channel ZeroVec =
132   { { 0.0, 0.0, 0.0, 0.0 } };
133
134
#ifdef DEBUG
/**
 * Debug helper: assert that none of the four float elements of the
 * channel is an infinity or a NaN.
 */
static void
check_inf_or_nan(const union tgsi_exec_channel *chan)
{
   uint elem;

   for (elem = 0; elem < 4; elem++) {
      assert(!util_is_inf_or_nan(chan->f[elem]));
   }
}
#endif
145
146
#ifdef DEBUG
/**
 * Debug helper: print the four float elements of a channel, prefixed
 * with the caller-supplied label.
 */
static void
print_chan(const char *msg, const union tgsi_exec_channel *chan)
{
   debug_printf("%s = {%f, %f, %f, %f}\n",
                msg,
                chan->f[0],
                chan->f[1],
                chan->f[2],
                chan->f[3]);
}
#endif
155
156
#ifdef DEBUG
/**
 * Debug helper: print temporary register number 'index' of the machine,
 * one line per channel (X, Y, Z, W) with its four float elements.
 */
static void
print_temp(const struct tgsi_exec_machine *mach, uint index)
{
   const struct tgsi_exec_vector *vec = &mach->Temps[index];
   int chan = 0;

   debug_printf("Temp[%u] =\n", index);
   while (chan < 4) {
      debug_printf("  %c: { %f, %f, %f, %f }\n",
                   "XYZW"[chan],
                   vec->xyzw[chan].f[0],
                   vec->xyzw[chan].f[1],
                   vec->xyzw[chan].f[2],
                   vec->xyzw[chan].f[3]);
      chan++;
   }
}
#endif
174
175
176/**
177 * Check if there's a potential src/dst register data dependency when
178 * using SOA execution.
179 * Example:
180 *   MOV T, T.yxwz;
181 * This would expand into:
182 *   MOV t0, t1;
183 *   MOV t1, t0;
184 *   MOV t2, t3;
185 *   MOV t3, t2;
186 * The second instruction will have the wrong value for t0 if executed as-is.
187 */
188boolean
189tgsi_check_soa_dependencies(const struct tgsi_full_instruction *inst)
190{
191   uint i, chan;
192
193   uint writemask = inst->FullDstRegisters[0].DstRegister.WriteMask;
194   if (writemask == TGSI_WRITEMASK_X ||
195       writemask == TGSI_WRITEMASK_Y ||
196       writemask == TGSI_WRITEMASK_Z ||
197       writemask == TGSI_WRITEMASK_W ||
198       writemask == TGSI_WRITEMASK_NONE) {
199      /* no chance of data dependency */
200      return FALSE;
201   }
202
203   /* loop over src regs */
204   for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
205      if ((inst->FullSrcRegisters[i].SrcRegister.File ==
206           inst->FullDstRegisters[0].DstRegister.File) &&
207          (inst->FullSrcRegisters[i].SrcRegister.Index ==
208           inst->FullDstRegisters[0].DstRegister.Index)) {
209         /* loop over dest channels */
210         uint channelsWritten = 0x0;
211         FOR_EACH_ENABLED_CHANNEL(*inst, chan) {
212            /* check if we're reading a channel that's been written */
213            uint swizzle = tgsi_util_get_full_src_register_swizzle(&inst->FullSrcRegisters[i], chan);
214            if (channelsWritten & (1 << swizzle)) {
215               return TRUE;
216            }
217
218            channelsWritten |= (1 << chan);
219         }
220      }
221   }
222   return FALSE;
223}
224
225
226/**
227 * Initialize machine state by expanding tokens to full instructions,
228 * allocating temporary storage, setting up constants, etc.
229 * After this, we can call tgsi_exec_machine_run() many times.
230 */
231void
232tgsi_exec_machine_bind_shader(
233   struct tgsi_exec_machine *mach,
234   const struct tgsi_token *tokens,
235   uint numSamplers,
236   struct tgsi_sampler **samplers)
237{
238   uint k;
239   struct tgsi_parse_context parse;
240   struct tgsi_exec_labels *labels = &mach->Labels;
241   struct tgsi_full_instruction *instructions;
242   struct tgsi_full_declaration *declarations;
243   uint maxInstructions = 10, numInstructions = 0;
244   uint maxDeclarations = 10, numDeclarations = 0;
245   uint instno = 0;
246
247#if 0
248   tgsi_dump(tokens, 0);
249#endif
250
251   util_init_math();
252
253   mach->Tokens = tokens;
254   mach->Samplers = samplers;
255
256   k = tgsi_parse_init (&parse, mach->Tokens);
257   if (k != TGSI_PARSE_OK) {
258      debug_printf( "Problem parsing!\n" );
259      return;
260   }
261
262   mach->Processor = parse.FullHeader.Processor.Processor;
263   mach->ImmLimit = 0;
264   labels->count = 0;
265
266   declarations = (struct tgsi_full_declaration *)
267      MALLOC( maxDeclarations * sizeof(struct tgsi_full_declaration) );
268
269   if (!declarations) {
270      return;
271   }
272
273   instructions = (struct tgsi_full_instruction *)
274      MALLOC( maxInstructions * sizeof(struct tgsi_full_instruction) );
275
276   if (!instructions) {
277      FREE( declarations );
278      return;
279   }
280
281   while( !tgsi_parse_end_of_tokens( &parse ) ) {
282      uint pointer = parse.Position;
283      uint i;
284
285      tgsi_parse_token( &parse );
286      switch( parse.FullToken.Token.Type ) {
287      case TGSI_TOKEN_TYPE_DECLARATION:
288         /* save expanded declaration */
289         if (numDeclarations == maxDeclarations) {
290            declarations = REALLOC(declarations,
291                                   maxDeclarations
292                                   * sizeof(struct tgsi_full_declaration),
293                                   (maxDeclarations + 10)
294                                   * sizeof(struct tgsi_full_declaration));
295            maxDeclarations += 10;
296         }
297         memcpy(declarations + numDeclarations,
298                &parse.FullToken.FullDeclaration,
299                sizeof(declarations[0]));
300         numDeclarations++;
301         break;
302
303      case TGSI_TOKEN_TYPE_IMMEDIATE:
304         {
305            uint size = parse.FullToken.FullImmediate.Immediate.NrTokens - 1;
306            assert( size <= 4 );
307            assert( mach->ImmLimit + 1 <= TGSI_EXEC_NUM_IMMEDIATES );
308
309            for( i = 0; i < size; i++ ) {
310               mach->Imms[mach->ImmLimit][i] =
311		  parse.FullToken.FullImmediate.u[i].Float;
312            }
313            mach->ImmLimit += 1;
314         }
315         break;
316
317      case TGSI_TOKEN_TYPE_INSTRUCTION:
318         assert( labels->count < MAX_LABELS );
319
320         labels->labels[labels->count][0] = instno;
321         labels->labels[labels->count][1] = pointer;
322         labels->count++;
323
324         /* save expanded instruction */
325         if (numInstructions == maxInstructions) {
326            instructions = REALLOC(instructions,
327                                   maxInstructions
328                                   * sizeof(struct tgsi_full_instruction),
329                                   (maxInstructions + 10)
330                                   * sizeof(struct tgsi_full_instruction));
331            maxInstructions += 10;
332         }
333
334         if (tgsi_check_soa_dependencies(&parse.FullToken.FullInstruction)) {
335            uint opcode = parse.FullToken.FullInstruction.Instruction.Opcode;
336            parse.FullToken.FullInstruction.Flags = SOA_DEPENDENCY_FLAG;
337            /* XXX we only handle SOA dependencies properly for MOV/SWZ
338             * at this time!
339             */
340            if (opcode != TGSI_OPCODE_MOV) {
341               debug_printf("Warning: SOA dependency in instruction"
342                            " is not handled:\n");
343               tgsi_dump_instruction(&parse.FullToken.FullInstruction,
344                                     numInstructions);
345            }
346         }
347
348         memcpy(instructions + numInstructions,
349                &parse.FullToken.FullInstruction,
350                sizeof(instructions[0]));
351
352         numInstructions++;
353         break;
354
355      default:
356         assert( 0 );
357      }
358   }
359   tgsi_parse_free (&parse);
360
361   if (mach->Declarations) {
362      FREE( mach->Declarations );
363   }
364   mach->Declarations = declarations;
365   mach->NumDeclarations = numDeclarations;
366
367   if (mach->Instructions) {
368      FREE( mach->Instructions );
369   }
370   mach->Instructions = instructions;
371   mach->NumInstructions = numInstructions;
372}
373
374
375struct tgsi_exec_machine *
376tgsi_exec_machine_create( void )
377{
378   struct tgsi_exec_machine *mach;
379   uint i;
380
381   mach = align_malloc( sizeof *mach, 16 );
382   if (!mach)
383      goto fail;
384
385   memset(mach, 0, sizeof(*mach));
386
387   mach->Addrs = &mach->Temps[TGSI_EXEC_TEMP_ADDR];
388
389   /* Setup constants. */
390   for( i = 0; i < 4; i++ ) {
391      mach->Temps[TEMP_0_I].xyzw[TEMP_0_C].u[i] = 0x00000000;
392      mach->Temps[TEMP_7F_I].xyzw[TEMP_7F_C].u[i] = 0x7FFFFFFF;
393      mach->Temps[TEMP_80_I].xyzw[TEMP_80_C].u[i] = 0x80000000;
394      mach->Temps[TEMP_FF_I].xyzw[TEMP_FF_C].u[i] = 0xFFFFFFFF;
395      mach->Temps[TEMP_1_I].xyzw[TEMP_1_C].f[i] = 1.0f;
396      mach->Temps[TEMP_2_I].xyzw[TEMP_2_C].f[i] = 2.0f;
397      mach->Temps[TEMP_128_I].xyzw[TEMP_128_C].f[i] = 128.0f;
398      mach->Temps[TEMP_M128_I].xyzw[TEMP_M128_C].f[i] = -128.0f;
399      mach->Temps[TEMP_3_I].xyzw[TEMP_3_C].f[i] = 3.0f;
400      mach->Temps[TEMP_HALF_I].xyzw[TEMP_HALF_C].f[i] = 0.5f;
401   }
402
403#ifdef DEBUG
404   /* silence warnings */
405   (void) print_chan;
406   (void) print_temp;
407#endif
408
409   return mach;
410
411fail:
412   align_free(mach);
413   return NULL;
414}
415
416
417void
418tgsi_exec_machine_destroy(struct tgsi_exec_machine *mach)
419{
420   if (mach) {
421      FREE(mach->Instructions);
422      FREE(mach->Declarations);
423   }
424
425   align_free(mach);
426}
427
428
429static void
430micro_abs(
431   union tgsi_exec_channel *dst,
432   const union tgsi_exec_channel *src )
433{
434   dst->f[0] = fabsf( src->f[0] );
435   dst->f[1] = fabsf( src->f[1] );
436   dst->f[2] = fabsf( src->f[2] );
437   dst->f[3] = fabsf( src->f[3] );
438}
439
440static void
441micro_add(
442   union tgsi_exec_channel *dst,
443   const union tgsi_exec_channel *src0,
444   const union tgsi_exec_channel *src1 )
445{
446   dst->f[0] = src0->f[0] + src1->f[0];
447   dst->f[1] = src0->f[1] + src1->f[1];
448   dst->f[2] = src0->f[2] + src1->f[2];
449   dst->f[3] = src0->f[3] + src1->f[3];
450}
451
#if 0
/** dst = src0 + src1 (signed integer), per element.  Currently compiled out. */
static void
micro_iadd(
   union tgsi_exec_channel *dst,
   const union tgsi_exec_channel *src0,
   const union tgsi_exec_channel *src1 )
{
   uint i;

   for (i = 0; i < 4; i++) {
      dst->i[i] = src0->i[i] + src1->i[i];
   }
}
#endif
465
466static void
467micro_and(
468   union tgsi_exec_channel *dst,
469   const union tgsi_exec_channel *src0,
470   const union tgsi_exec_channel *src1 )
471{
472   dst->u[0] = src0->u[0] & src1->u[0];
473   dst->u[1] = src0->u[1] & src1->u[1];
474   dst->u[2] = src0->u[2] & src1->u[2];
475   dst->u[3] = src0->u[3] & src1->u[3];
476}
477
478static void
479micro_ceil(
480   union tgsi_exec_channel *dst,
481   const union tgsi_exec_channel *src )
482{
483   dst->f[0] = ceilf( src->f[0] );
484   dst->f[1] = ceilf( src->f[1] );
485   dst->f[2] = ceilf( src->f[2] );
486   dst->f[3] = ceilf( src->f[3] );
487}
488
489static void
490micro_cos(
491   union tgsi_exec_channel *dst,
492   const union tgsi_exec_channel *src )
493{
494   dst->f[0] = cosf( src->f[0] );
495   dst->f[1] = cosf( src->f[1] );
496   dst->f[2] = cosf( src->f[2] );
497   dst->f[3] = cosf( src->f[3] );
498}
499
500static void
501micro_ddx(
502   union tgsi_exec_channel *dst,
503   const union tgsi_exec_channel *src )
504{
505   dst->f[0] =
506   dst->f[1] =
507   dst->f[2] =
508   dst->f[3] = src->f[TILE_BOTTOM_RIGHT] - src->f[TILE_BOTTOM_LEFT];
509}
510
511static void
512micro_ddy(
513   union tgsi_exec_channel *dst,
514   const union tgsi_exec_channel *src )
515{
516   dst->f[0] =
517   dst->f[1] =
518   dst->f[2] =
519   dst->f[3] = src->f[TILE_TOP_LEFT] - src->f[TILE_BOTTOM_LEFT];
520}
521
522static void
523micro_div(
524   union tgsi_exec_channel *dst,
525   const union tgsi_exec_channel *src0,
526   const union tgsi_exec_channel *src1 )
527{
528   if (src1->f[0] != 0) {
529      dst->f[0] = src0->f[0] / src1->f[0];
530   }
531   if (src1->f[1] != 0) {
532      dst->f[1] = src0->f[1] / src1->f[1];
533   }
534   if (src1->f[2] != 0) {
535      dst->f[2] = src0->f[2] / src1->f[2];
536   }
537   if (src1->f[3] != 0) {
538      dst->f[3] = src0->f[3] / src1->f[3];
539   }
540}
541
#if 0
/**
 * dst = src0 / src1 (unsigned integer), per element.  Currently compiled out.
 * No check is made for a zero divisor.
 */
static void
micro_udiv(
   union tgsi_exec_channel *dst,
   const union tgsi_exec_channel *src0,
   const union tgsi_exec_channel *src1 )
{
   uint i;

   for (i = 0; i < 4; i++) {
      dst->u[i] = src0->u[i] / src1->u[i];
   }
}
#endif
555
556static void
557micro_eq(
558   union tgsi_exec_channel *dst,
559   const union tgsi_exec_channel *src0,
560   const union tgsi_exec_channel *src1,
561   const union tgsi_exec_channel *src2,
562   const union tgsi_exec_channel *src3 )
563{
564   dst->f[0] = src0->f[0] == src1->f[0] ? src2->f[0] : src3->f[0];
565   dst->f[1] = src0->f[1] == src1->f[1] ? src2->f[1] : src3->f[1];
566   dst->f[2] = src0->f[2] == src1->f[2] ? src2->f[2] : src3->f[2];
567   dst->f[3] = src0->f[3] == src1->f[3] ? src2->f[3] : src3->f[3];
568}
569
#if 0
/**
 * dst = (src0 == src1) ? src2 : src3, signed integer compare, per element.
 * Currently compiled out.
 */
static void
micro_ieq(
   union tgsi_exec_channel *dst,
   const union tgsi_exec_channel *src0,
   const union tgsi_exec_channel *src1,
   const union tgsi_exec_channel *src2,
   const union tgsi_exec_channel *src3 )
{
   uint i;

   for (i = 0; i < 4; i++) {
      dst->i[i] = src0->i[i] == src1->i[i] ? src2->i[i] : src3->i[i];
   }
}
#endif
585
586static void
587micro_exp2(
588   union tgsi_exec_channel *dst,
589   const union tgsi_exec_channel *src)
590{
591#if FAST_MATH
592   dst->f[0] = util_fast_exp2( src->f[0] );
593   dst->f[1] = util_fast_exp2( src->f[1] );
594   dst->f[2] = util_fast_exp2( src->f[2] );
595   dst->f[3] = util_fast_exp2( src->f[3] );
596#else
597   dst->f[0] = powf( 2.0f, src->f[0] );
598   dst->f[1] = powf( 2.0f, src->f[1] );
599   dst->f[2] = powf( 2.0f, src->f[2] );
600   dst->f[3] = powf( 2.0f, src->f[3] );
601#endif
602}
603
#if 0
/**
 * dst = (uint) src: float to unsigned integer by C cast, per element.
 * Currently compiled out.
 */
static void
micro_f2ut(
   union tgsi_exec_channel *dst,
   const union tgsi_exec_channel *src )
{
   uint i;

   for (i = 0; i < 4; i++) {
      dst->u[i] = (uint) src->f[i];
   }
}
#endif
616
617static void
618micro_float_clamp(union tgsi_exec_channel *dst,
619                  const union tgsi_exec_channel *src)
620{
621   uint i;
622
623   for (i = 0; i < 4; i++) {
624      if (src->f[i] > 0.0f) {
625         if (src->f[i] > 1.884467e+019f)
626            dst->f[i] = 1.884467e+019f;
627         else if (src->f[i] < 5.42101e-020f)
628            dst->f[i] = 5.42101e-020f;
629         else
630            dst->f[i] = src->f[i];
631      }
632      else {
633         if (src->f[i] < -1.884467e+019f)
634            dst->f[i] = -1.884467e+019f;
635         else if (src->f[i] > -5.42101e-020f)
636            dst->f[i] = -5.42101e-020f;
637         else
638            dst->f[i] = src->f[i];
639      }
640   }
641}
642
643static void
644micro_flr(
645   union tgsi_exec_channel *dst,
646   const union tgsi_exec_channel *src )
647{
648   dst->f[0] = floorf( src->f[0] );
649   dst->f[1] = floorf( src->f[1] );
650   dst->f[2] = floorf( src->f[2] );
651   dst->f[3] = floorf( src->f[3] );
652}
653
654static void
655micro_frc(
656   union tgsi_exec_channel *dst,
657   const union tgsi_exec_channel *src )
658{
659   dst->f[0] = src->f[0] - floorf( src->f[0] );
660   dst->f[1] = src->f[1] - floorf( src->f[1] );
661   dst->f[2] = src->f[2] - floorf( src->f[2] );
662   dst->f[3] = src->f[3] - floorf( src->f[3] );
663}
664
665static void
666micro_i2f(
667   union tgsi_exec_channel *dst,
668   const union tgsi_exec_channel *src )
669{
670   dst->f[0] = (float) src->i[0];
671   dst->f[1] = (float) src->i[1];
672   dst->f[2] = (float) src->i[2];
673   dst->f[3] = (float) src->i[3];
674}
675
676static void
677micro_lg2(
678   union tgsi_exec_channel *dst,
679   const union tgsi_exec_channel *src )
680{
681#if FAST_MATH
682   dst->f[0] = util_fast_log2( src->f[0] );
683   dst->f[1] = util_fast_log2( src->f[1] );
684   dst->f[2] = util_fast_log2( src->f[2] );
685   dst->f[3] = util_fast_log2( src->f[3] );
686#else
687   dst->f[0] = logf( src->f[0] ) * 1.442695f;
688   dst->f[1] = logf( src->f[1] ) * 1.442695f;
689   dst->f[2] = logf( src->f[2] ) * 1.442695f;
690   dst->f[3] = logf( src->f[3] ) * 1.442695f;
691#endif
692}
693
694static void
695micro_le(
696   union tgsi_exec_channel *dst,
697   const union tgsi_exec_channel *src0,
698   const union tgsi_exec_channel *src1,
699   const union tgsi_exec_channel *src2,
700   const union tgsi_exec_channel *src3 )
701{
702   dst->f[0] = src0->f[0] <= src1->f[0] ? src2->f[0] : src3->f[0];
703   dst->f[1] = src0->f[1] <= src1->f[1] ? src2->f[1] : src3->f[1];
704   dst->f[2] = src0->f[2] <= src1->f[2] ? src2->f[2] : src3->f[2];
705   dst->f[3] = src0->f[3] <= src1->f[3] ? src2->f[3] : src3->f[3];
706}
707
708static void
709micro_lt(
710   union tgsi_exec_channel *dst,
711   const union tgsi_exec_channel *src0,
712   const union tgsi_exec_channel *src1,
713   const union tgsi_exec_channel *src2,
714   const union tgsi_exec_channel *src3 )
715{
716   dst->f[0] = src0->f[0] < src1->f[0] ? src2->f[0] : src3->f[0];
717   dst->f[1] = src0->f[1] < src1->f[1] ? src2->f[1] : src3->f[1];
718   dst->f[2] = src0->f[2] < src1->f[2] ? src2->f[2] : src3->f[2];
719   dst->f[3] = src0->f[3] < src1->f[3] ? src2->f[3] : src3->f[3];
720}
721
#if 0
/**
 * dst = (src0 < src1) ? src2 : src3, signed integer compare, per element.
 * Currently compiled out.
 */
static void
micro_ilt(
   union tgsi_exec_channel *dst,
   const union tgsi_exec_channel *src0,
   const union tgsi_exec_channel *src1,
   const union tgsi_exec_channel *src2,
   const union tgsi_exec_channel *src3 )
{
   uint i;

   for (i = 0; i < 4; i++) {
      dst->i[i] = src0->i[i] < src1->i[i] ? src2->i[i] : src3->i[i];
   }
}
#endif
737
#if 0
/**
 * dst = (src0 < src1) ? src2 : src3, unsigned integer compare, per element.
 * Currently compiled out.
 */
static void
micro_ult(
   union tgsi_exec_channel *dst,
   const union tgsi_exec_channel *src0,
   const union tgsi_exec_channel *src1,
   const union tgsi_exec_channel *src2,
   const union tgsi_exec_channel *src3 )
{
   uint i;

   for (i = 0; i < 4; i++) {
      dst->u[i] = src0->u[i] < src1->u[i] ? src2->u[i] : src3->u[i];
   }
}
#endif
753
754static void
755micro_max(
756   union tgsi_exec_channel *dst,
757   const union tgsi_exec_channel *src0,
758   const union tgsi_exec_channel *src1 )
759{
760   dst->f[0] = src0->f[0] > src1->f[0] ? src0->f[0] : src1->f[0];
761   dst->f[1] = src0->f[1] > src1->f[1] ? src0->f[1] : src1->f[1];
762   dst->f[2] = src0->f[2] > src1->f[2] ? src0->f[2] : src1->f[2];
763   dst->f[3] = src0->f[3] > src1->f[3] ? src0->f[3] : src1->f[3];
764}
765
#if 0
/** dst = max(src0, src1) (signed integer), per element.  Currently compiled out. */
static void
micro_imax(
   union tgsi_exec_channel *dst,
   const union tgsi_exec_channel *src0,
   const union tgsi_exec_channel *src1 )
{
   uint i;

   for (i = 0; i < 4; i++) {
      dst->i[i] = src0->i[i] > src1->i[i] ? src0->i[i] : src1->i[i];
   }
}
#endif
779
#if 0
/** dst = max(src0, src1) (unsigned integer), per element.  Currently compiled out. */
static void
micro_umax(
   union tgsi_exec_channel *dst,
   const union tgsi_exec_channel *src0,
   const union tgsi_exec_channel *src1 )
{
   uint i;

   for (i = 0; i < 4; i++) {
      dst->u[i] = src0->u[i] > src1->u[i] ? src0->u[i] : src1->u[i];
   }
}
#endif
793
794static void
795micro_min(
796   union tgsi_exec_channel *dst,
797   const union tgsi_exec_channel *src0,
798   const union tgsi_exec_channel *src1 )
799{
800   dst->f[0] = src0->f[0] < src1->f[0] ? src0->f[0] : src1->f[0];
801   dst->f[1] = src0->f[1] < src1->f[1] ? src0->f[1] : src1->f[1];
802   dst->f[2] = src0->f[2] < src1->f[2] ? src0->f[2] : src1->f[2];
803   dst->f[3] = src0->f[3] < src1->f[3] ? src0->f[3] : src1->f[3];
804}
805
#if 0
/** dst = min(src0, src1) (signed integer), per element.  Currently compiled out. */
static void
micro_imin(
   union tgsi_exec_channel *dst,
   const union tgsi_exec_channel *src0,
   const union tgsi_exec_channel *src1 )
{
   uint i;

   for (i = 0; i < 4; i++) {
      dst->i[i] = src0->i[i] < src1->i[i] ? src0->i[i] : src1->i[i];
   }
}
#endif
819
#if 0
/** dst = min(src0, src1) (unsigned integer), per element.  Currently compiled out. */
static void
micro_umin(
   union tgsi_exec_channel *dst,
   const union tgsi_exec_channel *src0,
   const union tgsi_exec_channel *src1 )
{
   uint i;

   for (i = 0; i < 4; i++) {
      dst->u[i] = src0->u[i] < src1->u[i] ? src0->u[i] : src1->u[i];
   }
}
#endif
833
#if 0
/**
 * dst = src0 % src1 (unsigned integer), per element.  Currently compiled out.
 * No check is made for a zero divisor.
 */
static void
micro_umod(
   union tgsi_exec_channel *dst,
   const union tgsi_exec_channel *src0,
   const union tgsi_exec_channel *src1 )
{
   uint i;

   for (i = 0; i < 4; i++) {
      dst->u[i] = src0->u[i] % src1->u[i];
   }
}
#endif
847
848static void
849micro_mul(
850   union tgsi_exec_channel *dst,
851   const union tgsi_exec_channel *src0,
852   const union tgsi_exec_channel *src1 )
853{
854   dst->f[0] = src0->f[0] * src1->f[0];
855   dst->f[1] = src0->f[1] * src1->f[1];
856   dst->f[2] = src0->f[2] * src1->f[2];
857   dst->f[3] = src0->f[3] * src1->f[3];
858}
859
#if 0
/** dst = src0 * src1 (signed integer), per element.  Currently compiled out. */
static void
micro_imul(
   union tgsi_exec_channel *dst,
   const union tgsi_exec_channel *src0,
   const union tgsi_exec_channel *src1 )
{
   uint i;

   for (i = 0; i < 4; i++) {
      dst->i[i] = src0->i[i] * src1->i[i];
   }
}
#endif
873
#if 0
/**
 * Signed multiply with two destinations.  Currently compiled out.
 * dst1 receives the ordinary int product; dst0 is simply set to zero
 * (no upper half of a wide product is actually computed).
 */
static void
micro_imul64(
   union tgsi_exec_channel *dst0,
   union tgsi_exec_channel *dst1,
   const union tgsi_exec_channel *src0,
   const union tgsi_exec_channel *src1 )
{
   uint i;

   for (i = 0; i < 4; i++) {
      dst1->i[i] = src0->i[i] * src1->i[i];
   }
   for (i = 0; i < 4; i++) {
      dst0->i[i] = 0;
   }
}
#endif
892
#if 0
/**
 * Unsigned multiply with two destinations.  Currently compiled out.
 * dst1 receives the ordinary unsigned product; dst0 is simply set to zero
 * (no upper half of a wide product is actually computed).
 */
static void
micro_umul64(
   union tgsi_exec_channel *dst0,
   union tgsi_exec_channel *dst1,
   const union tgsi_exec_channel *src0,
   const union tgsi_exec_channel *src1 )
{
   uint i;

   for (i = 0; i < 4; i++) {
      dst1->u[i] = src0->u[i] * src1->u[i];
   }
   for (i = 0; i < 4; i++) {
      dst0->u[i] = 0;
   }
}
#endif
911
912
#if 0
/**
 * Conditional move on the unsigned view: dst = src0 ? src1 : src2,
 * per element.  Currently compiled out.
 */
static void
micro_movc(
   union tgsi_exec_channel *dst,
   const union tgsi_exec_channel *src0,
   const union tgsi_exec_channel *src1,
   const union tgsi_exec_channel *src2 )
{
   uint i;

   for (i = 0; i < 4; i++) {
      dst->u[i] = src0->u[i] ? src1->u[i] : src2->u[i];
   }
}
#endif
927
928static void
929micro_neg(
930   union tgsi_exec_channel *dst,
931   const union tgsi_exec_channel *src )
932{
933   dst->f[0] = -src->f[0];
934   dst->f[1] = -src->f[1];
935   dst->f[2] = -src->f[2];
936   dst->f[3] = -src->f[3];
937}
938
#if 0
/** dst = -src (signed integer), per element.  Currently compiled out. */
static void
micro_ineg(
   union tgsi_exec_channel *dst,
   const union tgsi_exec_channel *src )
{
   uint i;

   for (i = 0; i < 4; i++) {
      dst->i[i] = -src->i[i];
   }
}
#endif
951
952static void
953micro_not(
954   union tgsi_exec_channel *dst,
955   const union tgsi_exec_channel *src )
956{
957   dst->u[0] = ~src->u[0];
958   dst->u[1] = ~src->u[1];
959   dst->u[2] = ~src->u[2];
960   dst->u[3] = ~src->u[3];
961}
962
963static void
964micro_or(
965   union tgsi_exec_channel *dst,
966   const union tgsi_exec_channel *src0,
967   const union tgsi_exec_channel *src1 )
968{
969   dst->u[0] = src0->u[0] | src1->u[0];
970   dst->u[1] = src0->u[1] | src1->u[1];
971   dst->u[2] = src0->u[2] | src1->u[2];
972   dst->u[3] = src0->u[3] | src1->u[3];
973}
974
975static void
976micro_pow(
977   union tgsi_exec_channel *dst,
978   const union tgsi_exec_channel *src0,
979   const union tgsi_exec_channel *src1 )
980{
981#if FAST_MATH
982   dst->f[0] = util_fast_pow( src0->f[0], src1->f[0] );
983   dst->f[1] = util_fast_pow( src0->f[1], src1->f[1] );
984   dst->f[2] = util_fast_pow( src0->f[2], src1->f[2] );
985   dst->f[3] = util_fast_pow( src0->f[3], src1->f[3] );
986#else
987   dst->f[0] = powf( src0->f[0], src1->f[0] );
988   dst->f[1] = powf( src0->f[1], src1->f[1] );
989   dst->f[2] = powf( src0->f[2], src1->f[2] );
990   dst->f[3] = powf( src0->f[3], src1->f[3] );
991#endif
992}
993
994static void
995micro_rnd(
996   union tgsi_exec_channel *dst,
997   const union tgsi_exec_channel *src )
998{
999   dst->f[0] = floorf( src->f[0] + 0.5f );
1000   dst->f[1] = floorf( src->f[1] + 0.5f );
1001   dst->f[2] = floorf( src->f[2] + 0.5f );
1002   dst->f[3] = floorf( src->f[3] + 0.5f );
1003}
1004
1005static void
1006micro_sgn(
1007   union tgsi_exec_channel *dst,
1008   const union tgsi_exec_channel *src )
1009{
1010   dst->f[0] = src->f[0] < 0.0f ? -1.0f : src->f[0] > 0.0f ? 1.0f : 0.0f;
1011   dst->f[1] = src->f[1] < 0.0f ? -1.0f : src->f[1] > 0.0f ? 1.0f : 0.0f;
1012   dst->f[2] = src->f[2] < 0.0f ? -1.0f : src->f[2] > 0.0f ? 1.0f : 0.0f;
1013   dst->f[3] = src->f[3] < 0.0f ? -1.0f : src->f[3] > 0.0f ? 1.0f : 0.0f;
1014}
1015
1016static void
1017micro_shl(
1018   union tgsi_exec_channel *dst,
1019   const union tgsi_exec_channel *src0,
1020   const union tgsi_exec_channel *src1 )
1021{
1022   dst->i[0] = src0->i[0] << src1->i[0];
1023   dst->i[1] = src0->i[1] << src1->i[1];
1024   dst->i[2] = src0->i[2] << src1->i[2];
1025   dst->i[3] = src0->i[3] << src1->i[3];
1026}
1027
1028static void
1029micro_ishr(
1030   union tgsi_exec_channel *dst,
1031   const union tgsi_exec_channel *src0,
1032   const union tgsi_exec_channel *src1 )
1033{
1034   dst->i[0] = src0->i[0] >> src1->i[0];
1035   dst->i[1] = src0->i[1] >> src1->i[1];
1036   dst->i[2] = src0->i[2] >> src1->i[2];
1037   dst->i[3] = src0->i[3] >> src1->i[3];
1038}
1039
1040static void
1041micro_trunc(
1042   union tgsi_exec_channel *dst,
1043   const union tgsi_exec_channel *src0 )
1044{
1045   dst->f[0] = (float) (int) src0->f[0];
1046   dst->f[1] = (float) (int) src0->f[1];
1047   dst->f[2] = (float) (int) src0->f[2];
1048   dst->f[3] = (float) (int) src0->f[3];
1049}
1050
#if 0
/**
 * Per-lane unsigned right shift: dst = src0 >> src1 on the unsigned lanes.
 * Currently compiled out.
 */
static void
micro_ushr(
   union tgsi_exec_channel *dst,
   const union tgsi_exec_channel *src0,
   const union tgsi_exec_channel *src1 )
{
   unsigned lane;

   for (lane = 0; lane < 4; lane++) {
      dst->u[lane] = src0->u[lane] >> src1->u[lane];
   }
}
#endif
1064
1065static void
1066micro_sin(
1067   union tgsi_exec_channel *dst,
1068   const union tgsi_exec_channel *src )
1069{
1070   dst->f[0] = sinf( src->f[0] );
1071   dst->f[1] = sinf( src->f[1] );
1072   dst->f[2] = sinf( src->f[2] );
1073   dst->f[3] = sinf( src->f[3] );
1074}
1075
1076static void
1077micro_sqrt( union tgsi_exec_channel *dst,
1078            const union tgsi_exec_channel *src )
1079{
1080   dst->f[0] = sqrtf( src->f[0] );
1081   dst->f[1] = sqrtf( src->f[1] );
1082   dst->f[2] = sqrtf( src->f[2] );
1083   dst->f[3] = sqrtf( src->f[3] );
1084}
1085
1086static void
1087micro_sub(
1088   union tgsi_exec_channel *dst,
1089   const union tgsi_exec_channel *src0,
1090   const union tgsi_exec_channel *src1 )
1091{
1092   dst->f[0] = src0->f[0] - src1->f[0];
1093   dst->f[1] = src0->f[1] - src1->f[1];
1094   dst->f[2] = src0->f[2] - src1->f[2];
1095   dst->f[3] = src0->f[3] - src1->f[3];
1096}
1097
#if 0
/**
 * Per-lane conversion from the unsigned lanes to the float lanes.
 * Currently compiled out.
 */
static void
micro_u2f(
   union tgsi_exec_channel *dst,
   const union tgsi_exec_channel *src )
{
   unsigned lane;

   for (lane = 0; lane < 4; lane++) {
      dst->f[lane] = (float) src->u[lane];
   }
}
#endif
1110
1111static void
1112micro_xor(
1113   union tgsi_exec_channel *dst,
1114   const union tgsi_exec_channel *src0,
1115   const union tgsi_exec_channel *src1 )
1116{
1117   dst->u[0] = src0->u[0] ^ src1->u[0];
1118   dst->u[1] = src0->u[1] ^ src1->u[1];
1119   dst->u[2] = src0->u[2] ^ src1->u[2];
1120   dst->u[3] = src0->u[3] ^ src1->u[3];
1121}
1122
1123static void
1124fetch_src_file_channel(
1125   const struct tgsi_exec_machine *mach,
1126   const uint file,
1127   const uint swizzle,
1128   const union tgsi_exec_channel *index,
1129   union tgsi_exec_channel *chan )
1130{
1131   switch( swizzle ) {
1132   case TGSI_SWIZZLE_X:
1133   case TGSI_SWIZZLE_Y:
1134   case TGSI_SWIZZLE_Z:
1135   case TGSI_SWIZZLE_W:
1136      switch( file ) {
1137      case TGSI_FILE_CONSTANT:
1138         assert(mach->Consts);
1139         if (index->i[0] < 0)
1140            chan->f[0] = 0.0f;
1141         else
1142            chan->f[0] = mach->Consts[index->i[0]][swizzle];
1143         if (index->i[1] < 0)
1144            chan->f[1] = 0.0f;
1145         else
1146            chan->f[1] = mach->Consts[index->i[1]][swizzle];
1147         if (index->i[2] < 0)
1148            chan->f[2] = 0.0f;
1149         else
1150            chan->f[2] = mach->Consts[index->i[2]][swizzle];
1151         if (index->i[3] < 0)
1152            chan->f[3] = 0.0f;
1153         else
1154            chan->f[3] = mach->Consts[index->i[3]][swizzle];
1155         break;
1156
1157      case TGSI_FILE_INPUT:
1158         chan->u[0] = mach->Inputs[index->i[0]].xyzw[swizzle].u[0];
1159         chan->u[1] = mach->Inputs[index->i[1]].xyzw[swizzle].u[1];
1160         chan->u[2] = mach->Inputs[index->i[2]].xyzw[swizzle].u[2];
1161         chan->u[3] = mach->Inputs[index->i[3]].xyzw[swizzle].u[3];
1162         break;
1163
1164      case TGSI_FILE_TEMPORARY:
1165         assert(index->i[0] < TGSI_EXEC_NUM_TEMPS);
1166         chan->u[0] = mach->Temps[index->i[0]].xyzw[swizzle].u[0];
1167         chan->u[1] = mach->Temps[index->i[1]].xyzw[swizzle].u[1];
1168         chan->u[2] = mach->Temps[index->i[2]].xyzw[swizzle].u[2];
1169         chan->u[3] = mach->Temps[index->i[3]].xyzw[swizzle].u[3];
1170         break;
1171
1172      case TGSI_FILE_IMMEDIATE:
1173         assert( index->i[0] < (int) mach->ImmLimit );
1174         chan->f[0] = mach->Imms[index->i[0]][swizzle];
1175         assert( index->i[1] < (int) mach->ImmLimit );
1176         chan->f[1] = mach->Imms[index->i[1]][swizzle];
1177         assert( index->i[2] < (int) mach->ImmLimit );
1178         chan->f[2] = mach->Imms[index->i[2]][swizzle];
1179         assert( index->i[3] < (int) mach->ImmLimit );
1180         chan->f[3] = mach->Imms[index->i[3]][swizzle];
1181         break;
1182
1183      case TGSI_FILE_ADDRESS:
1184         chan->u[0] = mach->Addrs[index->i[0]].xyzw[swizzle].u[0];
1185         chan->u[1] = mach->Addrs[index->i[1]].xyzw[swizzle].u[1];
1186         chan->u[2] = mach->Addrs[index->i[2]].xyzw[swizzle].u[2];
1187         chan->u[3] = mach->Addrs[index->i[3]].xyzw[swizzle].u[3];
1188         break;
1189
1190      case TGSI_FILE_OUTPUT:
1191         /* vertex/fragment output vars can be read too */
1192         chan->u[0] = mach->Outputs[index->i[0]].xyzw[swizzle].u[0];
1193         chan->u[1] = mach->Outputs[index->i[1]].xyzw[swizzle].u[1];
1194         chan->u[2] = mach->Outputs[index->i[2]].xyzw[swizzle].u[2];
1195         chan->u[3] = mach->Outputs[index->i[3]].xyzw[swizzle].u[3];
1196         break;
1197
1198      default:
1199         assert( 0 );
1200      }
1201      break;
1202
1203   default:
1204      assert( 0 );
1205   }
1206}
1207
1208static void
1209fetch_source(
1210   const struct tgsi_exec_machine *mach,
1211   union tgsi_exec_channel *chan,
1212   const struct tgsi_full_src_register *reg,
1213   const uint chan_index )
1214{
1215   union tgsi_exec_channel index;
1216   uint swizzle;
1217
1218   /* We start with a direct index into a register file.
1219    *
1220    *    file[1],
1221    *    where:
1222    *       file = SrcRegister.File
1223    *       [1] = SrcRegister.Index
1224    */
1225   index.i[0] =
1226   index.i[1] =
1227   index.i[2] =
1228   index.i[3] = reg->SrcRegister.Index;
1229
1230   /* There is an extra source register that indirectly subscripts
1231    * a register file. The direct index now becomes an offset
1232    * that is being added to the indirect register.
1233    *
1234    *    file[ind[2].x+1],
1235    *    where:
1236    *       ind = SrcRegisterInd.File
1237    *       [2] = SrcRegisterInd.Index
1238    *       .x = SrcRegisterInd.SwizzleX
1239    */
1240   if (reg->SrcRegister.Indirect) {
1241      union tgsi_exec_channel index2;
1242      union tgsi_exec_channel indir_index;
1243      const uint execmask = mach->ExecMask;
1244      uint i;
1245
1246      /* which address register (always zero now) */
1247      index2.i[0] =
1248      index2.i[1] =
1249      index2.i[2] =
1250      index2.i[3] = reg->SrcRegisterInd.Index;
1251
1252      /* get current value of address register[swizzle] */
1253      swizzle = tgsi_util_get_src_register_swizzle( &reg->SrcRegisterInd, CHAN_X );
1254      fetch_src_file_channel(
1255         mach,
1256         reg->SrcRegisterInd.File,
1257         swizzle,
1258         &index2,
1259         &indir_index );
1260
1261      /* add value of address register to the offset */
1262      index.i[0] += (int) indir_index.f[0];
1263      index.i[1] += (int) indir_index.f[1];
1264      index.i[2] += (int) indir_index.f[2];
1265      index.i[3] += (int) indir_index.f[3];
1266
1267      /* for disabled execution channels, zero-out the index to
1268       * avoid using a potential garbage value.
1269       */
1270      for (i = 0; i < QUAD_SIZE; i++) {
1271         if ((execmask & (1 << i)) == 0)
1272            index.i[i] = 0;
1273      }
1274   }
1275
1276   /* There is an extra source register that is a second
1277    * subscript to a register file. Effectively it means that
1278    * the register file is actually a 2D array of registers.
1279    *
1280    *    file[1][3] == file[1*sizeof(file[1])+3],
1281    *    where:
1282    *       [3] = SrcRegisterDim.Index
1283    */
1284   if (reg->SrcRegister.Dimension) {
1285      /* The size of the first-order array depends on the register file type.
1286       * We need to multiply the index to the first array to get an effective,
1287       * "flat" index that points to the beginning of the second-order array.
1288       */
1289      switch (reg->SrcRegister.File) {
1290      case TGSI_FILE_INPUT:
1291         index.i[0] *= TGSI_EXEC_MAX_INPUT_ATTRIBS;
1292         index.i[1] *= TGSI_EXEC_MAX_INPUT_ATTRIBS;
1293         index.i[2] *= TGSI_EXEC_MAX_INPUT_ATTRIBS;
1294         index.i[3] *= TGSI_EXEC_MAX_INPUT_ATTRIBS;
1295         break;
1296      case TGSI_FILE_CONSTANT:
1297         index.i[0] *= TGSI_EXEC_MAX_CONST_BUFFER;
1298         index.i[1] *= TGSI_EXEC_MAX_CONST_BUFFER;
1299         index.i[2] *= TGSI_EXEC_MAX_CONST_BUFFER;
1300         index.i[3] *= TGSI_EXEC_MAX_CONST_BUFFER;
1301         break;
1302      default:
1303         assert( 0 );
1304      }
1305
1306      index.i[0] += reg->SrcRegisterDim.Index;
1307      index.i[1] += reg->SrcRegisterDim.Index;
1308      index.i[2] += reg->SrcRegisterDim.Index;
1309      index.i[3] += reg->SrcRegisterDim.Index;
1310
1311      /* Again, the second subscript index can be addressed indirectly
1312       * identically to the first one.
1313       * Nothing stops us from indirectly addressing the indirect register,
1314       * but there is no need for that, so we won't exercise it.
1315       *
1316       *    file[1][ind[4].y+3],
1317       *    where:
1318       *       ind = SrcRegisterDimInd.File
1319       *       [4] = SrcRegisterDimInd.Index
1320       *       .y = SrcRegisterDimInd.SwizzleX
1321       */
1322      if (reg->SrcRegisterDim.Indirect) {
1323         union tgsi_exec_channel index2;
1324         union tgsi_exec_channel indir_index;
1325         const uint execmask = mach->ExecMask;
1326         uint i;
1327
1328         index2.i[0] =
1329         index2.i[1] =
1330         index2.i[2] =
1331         index2.i[3] = reg->SrcRegisterDimInd.Index;
1332
1333         swizzle = tgsi_util_get_src_register_swizzle( &reg->SrcRegisterDimInd, CHAN_X );
1334         fetch_src_file_channel(
1335            mach,
1336            reg->SrcRegisterDimInd.File,
1337            swizzle,
1338            &index2,
1339            &indir_index );
1340
1341         index.i[0] += (int) indir_index.f[0];
1342         index.i[1] += (int) indir_index.f[1];
1343         index.i[2] += (int) indir_index.f[2];
1344         index.i[3] += (int) indir_index.f[3];
1345
1346         /* for disabled execution channels, zero-out the index to
1347          * avoid using a potential garbage value.
1348          */
1349         for (i = 0; i < QUAD_SIZE; i++) {
1350            if ((execmask & (1 << i)) == 0)
1351               index.i[i] = 0;
1352         }
1353      }
1354
1355      /* If by any chance there was a need for a 3D array of register
1356       * files, we would have to check whether SrcRegisterDim is followed
1357       * by a dimension register and continue the saga.
1358       */
1359   }
1360
1361   swizzle = tgsi_util_get_full_src_register_swizzle( reg, chan_index );
1362   fetch_src_file_channel(
1363      mach,
1364      reg->SrcRegister.File,
1365      swizzle,
1366      &index,
1367      chan );
1368
1369   switch (tgsi_util_get_full_src_register_sign_mode( reg, chan_index )) {
1370   case TGSI_UTIL_SIGN_CLEAR:
1371      micro_abs( chan, chan );
1372      break;
1373
1374   case TGSI_UTIL_SIGN_SET:
1375      micro_abs( chan, chan );
1376      micro_neg( chan, chan );
1377      break;
1378
1379   case TGSI_UTIL_SIGN_TOGGLE:
1380      micro_neg( chan, chan );
1381      break;
1382
1383   case TGSI_UTIL_SIGN_KEEP:
1384      break;
1385   }
1386
1387   if (reg->SrcRegisterExtMod.Complement) {
1388      micro_sub( chan, &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], chan );
1389   }
1390}
1391
1392static void
1393store_dest(
1394   struct tgsi_exec_machine *mach,
1395   const union tgsi_exec_channel *chan,
1396   const struct tgsi_full_dst_register *reg,
1397   const struct tgsi_full_instruction *inst,
1398   uint chan_index )
1399{
1400   uint i;
1401   union tgsi_exec_channel null;
1402   union tgsi_exec_channel *dst;
1403   uint execmask = mach->ExecMask;
1404   int offset = 0;  /* indirection offset */
1405   int index;
1406
1407#ifdef DEBUG
1408   check_inf_or_nan(chan);
1409#endif
1410
1411   /* There is an extra source register that indirectly subscripts
1412    * a register file. The direct index now becomes an offset
1413    * that is being added to the indirect register.
1414    *
1415    *    file[ind[2].x+1],
1416    *    where:
1417    *       ind = DstRegisterInd.File
1418    *       [2] = DstRegisterInd.Index
1419    *       .x = DstRegisterInd.SwizzleX
1420    */
1421   if (reg->DstRegister.Indirect) {
1422      union tgsi_exec_channel index;
1423      union tgsi_exec_channel indir_index;
1424      uint swizzle;
1425
1426      /* which address register (always zero for now) */
1427      index.i[0] =
1428      index.i[1] =
1429      index.i[2] =
1430      index.i[3] = reg->DstRegisterInd.Index;
1431
1432      /* get current value of address register[swizzle] */
1433      swizzle = tgsi_util_get_src_register_swizzle( &reg->DstRegisterInd, CHAN_X );
1434
1435      /* fetch values from the address/indirection register */
1436      fetch_src_file_channel(
1437         mach,
1438         reg->DstRegisterInd.File,
1439         swizzle,
1440         &index,
1441         &indir_index );
1442
1443      /* save indirection offset */
1444      offset = (int) indir_index.f[0];
1445   }
1446
1447   switch (reg->DstRegister.File) {
1448   case TGSI_FILE_NULL:
1449      dst = &null;
1450      break;
1451
1452   case TGSI_FILE_OUTPUT:
1453      index = mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0]
1454         + reg->DstRegister.Index;
1455      dst = &mach->Outputs[offset + index].xyzw[chan_index];
1456      break;
1457
1458   case TGSI_FILE_TEMPORARY:
1459      index = reg->DstRegister.Index;
1460      assert( index < TGSI_EXEC_NUM_TEMPS );
1461      dst = &mach->Temps[offset + index].xyzw[chan_index];
1462      break;
1463
1464   case TGSI_FILE_ADDRESS:
1465      index = reg->DstRegister.Index;
1466      dst = &mach->Addrs[index].xyzw[chan_index];
1467      break;
1468
1469   default:
1470      assert( 0 );
1471      return;
1472   }
1473
1474   if (inst->InstructionExtNv.CondFlowEnable) {
1475      union tgsi_exec_channel *cc = &mach->Temps[TEMP_CC_I].xyzw[TEMP_CC_C];
1476      uint swizzle;
1477      uint shift;
1478      uint mask;
1479      uint test;
1480
1481      /* Only CC0 supported.
1482       */
1483      assert( inst->InstructionExtNv.CondFlowIndex < 1 );
1484
1485      switch (chan_index) {
1486      case CHAN_X:
1487         swizzle = inst->InstructionExtNv.CondSwizzleX;
1488         break;
1489      case CHAN_Y:
1490         swizzle = inst->InstructionExtNv.CondSwizzleY;
1491         break;
1492      case CHAN_Z:
1493         swizzle = inst->InstructionExtNv.CondSwizzleZ;
1494         break;
1495      case CHAN_W:
1496         swizzle = inst->InstructionExtNv.CondSwizzleW;
1497         break;
1498      default:
1499         assert( 0 );
1500         return;
1501      }
1502
1503      switch (swizzle) {
1504      case TGSI_SWIZZLE_X:
1505         shift = TGSI_EXEC_CC_X_SHIFT;
1506         mask = TGSI_EXEC_CC_X_MASK;
1507         break;
1508      case TGSI_SWIZZLE_Y:
1509         shift = TGSI_EXEC_CC_Y_SHIFT;
1510         mask = TGSI_EXEC_CC_Y_MASK;
1511         break;
1512      case TGSI_SWIZZLE_Z:
1513         shift = TGSI_EXEC_CC_Z_SHIFT;
1514         mask = TGSI_EXEC_CC_Z_MASK;
1515         break;
1516      case TGSI_SWIZZLE_W:
1517         shift = TGSI_EXEC_CC_W_SHIFT;
1518         mask = TGSI_EXEC_CC_W_MASK;
1519         break;
1520      default:
1521         assert( 0 );
1522         return;
1523      }
1524
1525      switch (inst->InstructionExtNv.CondMask) {
1526      case TGSI_CC_GT:
1527         test = ~(TGSI_EXEC_CC_GT << shift) & mask;
1528         for (i = 0; i < QUAD_SIZE; i++)
1529            if (cc->u[i] & test)
1530               execmask &= ~(1 << i);
1531         break;
1532
1533      case TGSI_CC_EQ:
1534         test = ~(TGSI_EXEC_CC_EQ << shift) & mask;
1535         for (i = 0; i < QUAD_SIZE; i++)
1536            if (cc->u[i] & test)
1537               execmask &= ~(1 << i);
1538         break;
1539
1540      case TGSI_CC_LT:
1541         test = ~(TGSI_EXEC_CC_LT << shift) & mask;
1542         for (i = 0; i < QUAD_SIZE; i++)
1543            if (cc->u[i] & test)
1544               execmask &= ~(1 << i);
1545         break;
1546
1547      case TGSI_CC_GE:
1548         test = ~((TGSI_EXEC_CC_GT | TGSI_EXEC_CC_EQ) << shift) & mask;
1549         for (i = 0; i < QUAD_SIZE; i++)
1550            if (cc->u[i] & test)
1551               execmask &= ~(1 << i);
1552         break;
1553
1554      case TGSI_CC_LE:
1555         test = ~((TGSI_EXEC_CC_LT | TGSI_EXEC_CC_EQ) << shift) & mask;
1556         for (i = 0; i < QUAD_SIZE; i++)
1557            if (cc->u[i] & test)
1558               execmask &= ~(1 << i);
1559         break;
1560
1561      case TGSI_CC_NE:
1562         test = ~((TGSI_EXEC_CC_GT | TGSI_EXEC_CC_LT | TGSI_EXEC_CC_UN) << shift) & mask;
1563         for (i = 0; i < QUAD_SIZE; i++)
1564            if (cc->u[i] & test)
1565               execmask &= ~(1 << i);
1566         break;
1567
1568      case TGSI_CC_TR:
1569         break;
1570
1571      case TGSI_CC_FL:
1572         for (i = 0; i < QUAD_SIZE; i++)
1573            execmask &= ~(1 << i);
1574         break;
1575
1576      default:
1577         assert( 0 );
1578         return;
1579      }
1580   }
1581
1582   switch (inst->Instruction.Saturate) {
1583   case TGSI_SAT_NONE:
1584      for (i = 0; i < QUAD_SIZE; i++)
1585         if (execmask & (1 << i))
1586            dst->i[i] = chan->i[i];
1587      break;
1588
1589   case TGSI_SAT_ZERO_ONE:
1590      for (i = 0; i < QUAD_SIZE; i++)
1591         if (execmask & (1 << i)) {
1592            if (chan->f[i] < 0.0f)
1593               dst->f[i] = 0.0f;
1594            else if (chan->f[i] > 1.0f)
1595               dst->f[i] = 1.0f;
1596            else
1597               dst->i[i] = chan->i[i];
1598         }
1599      break;
1600
1601   case TGSI_SAT_MINUS_PLUS_ONE:
1602      for (i = 0; i < QUAD_SIZE; i++)
1603         if (execmask & (1 << i)) {
1604            if (chan->f[i] < -1.0f)
1605               dst->f[i] = -1.0f;
1606            else if (chan->f[i] > 1.0f)
1607               dst->f[i] = 1.0f;
1608            else
1609               dst->i[i] = chan->i[i];
1610         }
1611      break;
1612
1613   default:
1614      assert( 0 );
1615   }
1616
1617   if (inst->InstructionExtNv.CondDstUpdate) {
1618      union tgsi_exec_channel *cc = &mach->Temps[TEMP_CC_I].xyzw[TEMP_CC_C];
1619      uint shift;
1620      uint mask;
1621
1622      /* Only CC0 supported.
1623       */
1624      assert( inst->InstructionExtNv.CondDstIndex < 1 );
1625
1626      switch (chan_index) {
1627      case CHAN_X:
1628         shift = TGSI_EXEC_CC_X_SHIFT;
1629         mask = ~TGSI_EXEC_CC_X_MASK;
1630         break;
1631      case CHAN_Y:
1632         shift = TGSI_EXEC_CC_Y_SHIFT;
1633         mask = ~TGSI_EXEC_CC_Y_MASK;
1634         break;
1635      case CHAN_Z:
1636         shift = TGSI_EXEC_CC_Z_SHIFT;
1637         mask = ~TGSI_EXEC_CC_Z_MASK;
1638         break;
1639      case CHAN_W:
1640         shift = TGSI_EXEC_CC_W_SHIFT;
1641         mask = ~TGSI_EXEC_CC_W_MASK;
1642         break;
1643      default:
1644         assert( 0 );
1645         return;
1646      }
1647
1648      for (i = 0; i < QUAD_SIZE; i++)
1649         if (execmask & (1 << i)) {
1650            cc->u[i] &= mask;
1651            if (dst->f[i] < 0.0f)
1652               cc->u[i] |= TGSI_EXEC_CC_LT << shift;
1653            else if (dst->f[i] > 0.0f)
1654               cc->u[i] |= TGSI_EXEC_CC_GT << shift;
1655            else if (dst->f[i] == 0.0f)
1656               cc->u[i] |= TGSI_EXEC_CC_EQ << shift;
1657            else
1658               cc->u[i] |= TGSI_EXEC_CC_UN << shift;
1659         }
1660   }
1661}
1662
/*
 * Shorthands for fetching a source operand channel and storing a
 * destination channel of the current instruction.  Both expect variables
 * named `mach` and `inst` to be in scope at the point of use.
 */
#define FETCH(VAL,INDEX,CHAN) \
   fetch_source(mach, VAL, &inst->FullSrcRegisters[INDEX], CHAN)

#define STORE(VAL,INDEX,CHAN) \
   store_dest(mach, VAL, &inst->FullDstRegisters[INDEX], inst, CHAN)
1668
1669
1670/**
1671 * Execute ARB-style KIL which is predicated by a src register.
1672 * Kill fragment if any of the four values is less than zero.
1673 */
1674static void
1675exec_kil(struct tgsi_exec_machine *mach,
1676         const struct tgsi_full_instruction *inst)
1677{
1678   uint uniquemask;
1679   uint chan_index;
1680   uint kilmask = 0; /* bit 0 = pixel 0, bit 1 = pixel 1, etc */
1681   union tgsi_exec_channel r[1];
1682
1683   /* This mask stores component bits that were already tested. */
1684   uniquemask = 0;
1685
1686   for (chan_index = 0; chan_index < 4; chan_index++)
1687   {
1688      uint swizzle;
1689      uint i;
1690
1691      /* unswizzle channel */
1692      swizzle = tgsi_util_get_full_src_register_swizzle (
1693                        &inst->FullSrcRegisters[0],
1694                        chan_index);
1695
1696      /* check if the component has not been already tested */
1697      if (uniquemask & (1 << swizzle))
1698         continue;
1699      uniquemask |= 1 << swizzle;
1700
1701      FETCH(&r[0], 0, chan_index);
1702      for (i = 0; i < 4; i++)
1703         if (r[0].f[i] < 0.0f)
1704            kilmask |= 1 << i;
1705   }
1706
1707   mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] |= kilmask;
1708}
1709
1710/**
1711 * Execute NVIDIA-style KIL which is predicated by a condition code.
1712 * Kill fragment if the condition code is TRUE.
1713 */
1714static void
1715exec_kilp(struct tgsi_exec_machine *mach,
1716          const struct tgsi_full_instruction *inst)
1717{
1718   uint kilmask; /* bit 0 = pixel 0, bit 1 = pixel 1, etc */
1719
1720   if (inst->InstructionExtNv.CondFlowEnable) {
1721      uint swizzle[4];
1722      uint chan_index;
1723
1724      kilmask = 0x0;
1725
1726      swizzle[0] = inst->InstructionExtNv.CondSwizzleX;
1727      swizzle[1] = inst->InstructionExtNv.CondSwizzleY;
1728      swizzle[2] = inst->InstructionExtNv.CondSwizzleZ;
1729      swizzle[3] = inst->InstructionExtNv.CondSwizzleW;
1730
1731      for (chan_index = 0; chan_index < 4; chan_index++)
1732      {
1733         uint i;
1734
1735         for (i = 0; i < 4; i++) {
1736            /* TODO: evaluate the condition code */
1737            if (0)
1738               kilmask |= 1 << i;
1739         }
1740      }
1741   }
1742   else {
1743      /* "unconditional" kil */
1744      kilmask = mach->ExecMask;
1745   }
1746   mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] |= kilmask;
1747}
1748
1749
1750/*
1751 * Fetch a four texture samples using STR texture coordinates.
1752 */
1753static void
1754fetch_texel( struct tgsi_sampler *sampler,
1755             const union tgsi_exec_channel *s,
1756             const union tgsi_exec_channel *t,
1757             const union tgsi_exec_channel *p,
1758             float lodbias,  /* XXX should be float[4] */
1759             union tgsi_exec_channel *r,
1760             union tgsi_exec_channel *g,
1761             union tgsi_exec_channel *b,
1762             union tgsi_exec_channel *a )
1763{
1764   uint j;
1765   float rgba[NUM_CHANNELS][QUAD_SIZE];
1766
1767   sampler->get_samples(sampler, s->f, t->f, p->f, lodbias, rgba);
1768
1769   for (j = 0; j < 4; j++) {
1770      r->f[j] = rgba[0][j];
1771      g->f[j] = rgba[1][j];
1772      b->f[j] = rgba[2][j];
1773      a->f[j] = rgba[3][j];
1774   }
1775}
1776
1777
1778static void
1779exec_tex(struct tgsi_exec_machine *mach,
1780         const struct tgsi_full_instruction *inst,
1781         boolean biasLod,
1782         boolean projected)
1783{
1784   const uint unit = inst->FullSrcRegisters[1].SrcRegister.Index;
1785   union tgsi_exec_channel r[4];
1786   uint chan_index;
1787   float lodBias;
1788
1789   /*   debug_printf("Sampler %u unit %u\n", sampler, unit); */
1790
1791   switch (inst->InstructionExtTexture.Texture) {
1792   case TGSI_TEXTURE_1D:
1793   case TGSI_TEXTURE_SHADOW1D:
1794
1795      FETCH(&r[0], 0, CHAN_X);
1796
1797      if (projected) {
1798         FETCH(&r[1], 0, CHAN_W);
1799         micro_div( &r[0], &r[0], &r[1] );
1800      }
1801
1802      if (biasLod) {
1803         FETCH(&r[1], 0, CHAN_W);
1804         lodBias = r[2].f[0];
1805      }
1806      else
1807         lodBias = 0.0;
1808
1809      fetch_texel(mach->Samplers[unit],
1810                  &r[0], &ZeroVec, &ZeroVec, lodBias,  /* S, T, P, BIAS */
1811                  &r[0], &r[1], &r[2], &r[3]); /* R, G, B, A */
1812      break;
1813
1814   case TGSI_TEXTURE_2D:
1815   case TGSI_TEXTURE_RECT:
1816   case TGSI_TEXTURE_SHADOW2D:
1817   case TGSI_TEXTURE_SHADOWRECT:
1818
1819      FETCH(&r[0], 0, CHAN_X);
1820      FETCH(&r[1], 0, CHAN_Y);
1821      FETCH(&r[2], 0, CHAN_Z);
1822
1823      if (projected) {
1824         FETCH(&r[3], 0, CHAN_W);
1825         micro_div( &r[0], &r[0], &r[3] );
1826         micro_div( &r[1], &r[1], &r[3] );
1827         micro_div( &r[2], &r[2], &r[3] );
1828      }
1829
1830      if (biasLod) {
1831         FETCH(&r[3], 0, CHAN_W);
1832         lodBias = r[3].f[0];
1833      }
1834      else
1835         lodBias = 0.0;
1836
1837      fetch_texel(mach->Samplers[unit],
1838                  &r[0], &r[1], &r[2], lodBias,  /* inputs */
1839                  &r[0], &r[1], &r[2], &r[3]);  /* outputs */
1840      break;
1841
1842   case TGSI_TEXTURE_3D:
1843   case TGSI_TEXTURE_CUBE:
1844
1845      FETCH(&r[0], 0, CHAN_X);
1846      FETCH(&r[1], 0, CHAN_Y);
1847      FETCH(&r[2], 0, CHAN_Z);
1848
1849      if (projected) {
1850         FETCH(&r[3], 0, CHAN_W);
1851         micro_div( &r[0], &r[0], &r[3] );
1852         micro_div( &r[1], &r[1], &r[3] );
1853         micro_div( &r[2], &r[2], &r[3] );
1854      }
1855
1856      if (biasLod) {
1857         FETCH(&r[3], 0, CHAN_W);
1858         lodBias = r[3].f[0];
1859      }
1860      else
1861         lodBias = 0.0;
1862
1863      fetch_texel(mach->Samplers[unit],
1864                  &r[0], &r[1], &r[2], lodBias,
1865                  &r[0], &r[1], &r[2], &r[3]);
1866      break;
1867
1868   default:
1869      assert (0);
1870   }
1871
1872   FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
1873      STORE( &r[chan_index], 0, chan_index );
1874   }
1875}
1876
1877
1878/**
1879 * Evaluate a constant-valued coefficient at the position of the
1880 * current quad.
1881 */
1882static void
1883eval_constant_coef(
1884   struct tgsi_exec_machine *mach,
1885   unsigned attrib,
1886   unsigned chan )
1887{
1888   unsigned i;
1889
1890   for( i = 0; i < QUAD_SIZE; i++ ) {
1891      mach->Inputs[attrib].xyzw[chan].f[i] = mach->InterpCoefs[attrib].a0[chan];
1892   }
1893}
1894
1895/**
1896 * Evaluate a linear-valued coefficient at the position of the
1897 * current quad.
1898 */
1899static void
1900eval_linear_coef(
1901   struct tgsi_exec_machine *mach,
1902   unsigned attrib,
1903   unsigned chan )
1904{
1905   const float x = mach->QuadPos.xyzw[0].f[0];
1906   const float y = mach->QuadPos.xyzw[1].f[0];
1907   const float dadx = mach->InterpCoefs[attrib].dadx[chan];
1908   const float dady = mach->InterpCoefs[attrib].dady[chan];
1909   const float a0 = mach->InterpCoefs[attrib].a0[chan] + dadx * x + dady * y;
1910   mach->Inputs[attrib].xyzw[chan].f[0] = a0;
1911   mach->Inputs[attrib].xyzw[chan].f[1] = a0 + dadx;
1912   mach->Inputs[attrib].xyzw[chan].f[2] = a0 + dady;
1913   mach->Inputs[attrib].xyzw[chan].f[3] = a0 + dadx + dady;
1914}
1915
1916/**
1917 * Evaluate a perspective-valued coefficient at the position of the
1918 * current quad.
1919 */
1920static void
1921eval_perspective_coef(
1922   struct tgsi_exec_machine *mach,
1923   unsigned attrib,
1924   unsigned chan )
1925{
1926   const float x = mach->QuadPos.xyzw[0].f[0];
1927   const float y = mach->QuadPos.xyzw[1].f[0];
1928   const float dadx = mach->InterpCoefs[attrib].dadx[chan];
1929   const float dady = mach->InterpCoefs[attrib].dady[chan];
1930   const float a0 = mach->InterpCoefs[attrib].a0[chan] + dadx * x + dady * y;
1931   const float *w = mach->QuadPos.xyzw[3].f;
1932   /* divide by W here */
1933   mach->Inputs[attrib].xyzw[chan].f[0] = a0 / w[0];
1934   mach->Inputs[attrib].xyzw[chan].f[1] = (a0 + dadx) / w[1];
1935   mach->Inputs[attrib].xyzw[chan].f[2] = (a0 + dady) / w[2];
1936   mach->Inputs[attrib].xyzw[chan].f[3] = (a0 + dadx + dady) / w[3];
1937}
1938
1939
/**
 * Signature shared by the eval_*_coef() functions: evaluate channel
 * \c chan of input attribute \c attrib for the current quad.
 */
typedef void (*eval_coef_func)( struct tgsi_exec_machine *mach,
                                unsigned attrib,
                                unsigned chan );
1944
1945static void
1946exec_declaration(
1947   struct tgsi_exec_machine *mach,
1948   const struct tgsi_full_declaration *decl )
1949{
1950   if( mach->Processor == TGSI_PROCESSOR_FRAGMENT ) {
1951      if( decl->Declaration.File == TGSI_FILE_INPUT ) {
1952         unsigned first, last, mask;
1953         eval_coef_func eval;
1954
1955         first = decl->DeclarationRange.First;
1956         last = decl->DeclarationRange.Last;
1957         mask = decl->Declaration.UsageMask;
1958
1959         switch( decl->Declaration.Interpolate ) {
1960         case TGSI_INTERPOLATE_CONSTANT:
1961            eval = eval_constant_coef;
1962            break;
1963
1964         case TGSI_INTERPOLATE_LINEAR:
1965            eval = eval_linear_coef;
1966            break;
1967
1968         case TGSI_INTERPOLATE_PERSPECTIVE:
1969            eval = eval_perspective_coef;
1970            break;
1971
1972         default:
1973            eval = NULL;
1974            assert( 0 );
1975         }
1976
1977         if( mask == TGSI_WRITEMASK_XYZW ) {
1978            unsigned i, j;
1979
1980            for( i = first; i <= last; i++ ) {
1981               for( j = 0; j < NUM_CHANNELS; j++ ) {
1982                  eval( mach, i, j );
1983               }
1984            }
1985         }
1986         else {
1987            unsigned i, j;
1988
1989            for( j = 0; j < NUM_CHANNELS; j++ ) {
1990               if( mask & (1 << j) ) {
1991                  for( i = first; i <= last; i++ ) {
1992                     eval( mach, i, j );
1993                  }
1994               }
1995            }
1996         }
1997      }
1998   }
1999}
2000
2001static void
2002exec_instruction(
2003   struct tgsi_exec_machine *mach,
2004   const struct tgsi_full_instruction *inst,
2005   int *pc )
2006{
2007   uint chan_index;
2008   union tgsi_exec_channel r[10];
2009
2010   (*pc)++;
2011
2012   switch (inst->Instruction.Opcode) {
2013   case TGSI_OPCODE_ARL:
2014   case TGSI_OPCODE_FLR:
2015      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2016         FETCH( &r[0], 0, chan_index );
2017         micro_flr( &r[0], &r[0] );
2018         STORE( &r[0], 0, chan_index );
2019      }
2020      break;
2021
2022   case TGSI_OPCODE_MOV:
2023      if (inst->Flags & SOA_DEPENDENCY_FLAG) {
2024         /* Do all fetches into temp regs, then do all stores to avoid
2025          * intermediate/accidental clobbering.  This could be done all the
2026          * time for MOV but for other instructions we'll need more temps...
2027          */
2028         FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2029            FETCH( &r[chan_index], 0, chan_index );
2030         }
2031         FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2032            STORE( &r[chan_index], 0, chan_index );
2033         }
2034      }
2035      else {
2036         FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2037            FETCH( &r[0], 0, chan_index );
2038            STORE( &r[0], 0, chan_index );
2039         }
2040      }
2041      break;
2042
2043   case TGSI_OPCODE_LIT:
2044      if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) {
2045         STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_X );
2046      }
2047
2048      if (IS_CHANNEL_ENABLED( *inst, CHAN_Y ) || IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
2049         FETCH( &r[0], 0, CHAN_X );
2050         if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
2051            micro_max( &r[0], &r[0], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] );
2052            STORE( &r[0], 0, CHAN_Y );
2053         }
2054
2055         if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
2056            FETCH( &r[1], 0, CHAN_Y );
2057            micro_max( &r[1], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] );
2058
2059            FETCH( &r[2], 0, CHAN_W );
2060            micro_min( &r[2], &r[2], &mach->Temps[TEMP_128_I].xyzw[TEMP_128_C] );
2061            micro_max( &r[2], &r[2], &mach->Temps[TEMP_M128_I].xyzw[TEMP_M128_C] );
2062            micro_pow( &r[1], &r[1], &r[2] );
2063            micro_lt( &r[0], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &r[0], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] );
2064            STORE( &r[0], 0, CHAN_Z );
2065         }
2066      }
2067
2068      if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
2069         STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
2070      }
2071      break;
2072
2073   case TGSI_OPCODE_RCP:
2074   /* TGSI_OPCODE_RECIP */
2075      FETCH( &r[0], 0, CHAN_X );
2076      micro_div( &r[0], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &r[0] );
2077      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2078         STORE( &r[0], 0, chan_index );
2079      }
2080      break;
2081
2082   case TGSI_OPCODE_RSQ:
2083   /* TGSI_OPCODE_RECIPSQRT */
2084      FETCH( &r[0], 0, CHAN_X );
2085      micro_abs( &r[0], &r[0] );
2086      micro_sqrt( &r[0], &r[0] );
2087      micro_div( &r[0], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &r[0] );
2088      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2089         STORE( &r[0], 0, chan_index );
2090      }
2091      break;
2092
2093   case TGSI_OPCODE_EXP:
2094      FETCH( &r[0], 0, CHAN_X );
2095      micro_flr( &r[1], &r[0] );  /* r1 = floor(r0) */
2096      if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) {
2097         micro_exp2( &r[2], &r[1] );       /* r2 = 2 ^ r1 */
2098         STORE( &r[2], 0, CHAN_X );        /* store r2 */
2099      }
2100      if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
2101         micro_sub( &r[2], &r[0], &r[1] ); /* r2 = r0 - r1 */
2102         STORE( &r[2], 0, CHAN_Y );        /* store r2 */
2103      }
2104      if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
2105         micro_exp2( &r[2], &r[0] );       /* r2 = 2 ^ r0 */
2106         STORE( &r[2], 0, CHAN_Z );        /* store r2 */
2107      }
2108      if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
2109         STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
2110      }
2111      break;
2112
2113   case TGSI_OPCODE_LOG:
2114      FETCH( &r[0], 0, CHAN_X );
2115      micro_abs( &r[2], &r[0] );  /* r2 = abs(r0) */
2116      micro_lg2( &r[1], &r[2] );  /* r1 = lg2(r2) */
2117      micro_flr( &r[0], &r[1] );  /* r0 = floor(r1) */
2118      if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) {
2119         STORE( &r[0], 0, CHAN_X );
2120      }
2121      if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
2122         micro_exp2( &r[0], &r[0] );       /* r0 = 2 ^ r0 */
2123         micro_div( &r[0], &r[2], &r[0] ); /* r0 = r2 / r0 */
2124         STORE( &r[0], 0, CHAN_Y );
2125      }
2126      if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
2127         STORE( &r[1], 0, CHAN_Z );
2128      }
2129      if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
2130         STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
2131      }
2132      break;
2133
2134   case TGSI_OPCODE_MUL:
2135      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index )
2136      {
2137         FETCH(&r[0], 0, chan_index);
2138         FETCH(&r[1], 1, chan_index);
2139
2140         micro_mul( &r[0], &r[0], &r[1] );
2141
2142         STORE(&r[0], 0, chan_index);
2143      }
2144      break;
2145
2146   case TGSI_OPCODE_ADD:
2147      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2148         FETCH( &r[0], 0, chan_index );
2149         FETCH( &r[1], 1, chan_index );
2150         micro_add( &r[0], &r[0], &r[1] );
2151         STORE( &r[0], 0, chan_index );
2152      }
2153      break;
2154
2155   case TGSI_OPCODE_DP3:
2156   /* TGSI_OPCODE_DOT3 */
2157      FETCH( &r[0], 0, CHAN_X );
2158      FETCH( &r[1], 1, CHAN_X );
2159      micro_mul( &r[0], &r[0], &r[1] );
2160
2161      FETCH( &r[1], 0, CHAN_Y );
2162      FETCH( &r[2], 1, CHAN_Y );
2163      micro_mul( &r[1], &r[1], &r[2] );
2164      micro_add( &r[0], &r[0], &r[1] );
2165
2166      FETCH( &r[1], 0, CHAN_Z );
2167      FETCH( &r[2], 1, CHAN_Z );
2168      micro_mul( &r[1], &r[1], &r[2] );
2169      micro_add( &r[0], &r[0], &r[1] );
2170
2171      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2172         STORE( &r[0], 0, chan_index );
2173      }
2174      break;
2175
2176    case TGSI_OPCODE_DP4:
2177    /* TGSI_OPCODE_DOT4 */
2178       FETCH(&r[0], 0, CHAN_X);
2179       FETCH(&r[1], 1, CHAN_X);
2180
2181       micro_mul( &r[0], &r[0], &r[1] );
2182
2183       FETCH(&r[1], 0, CHAN_Y);
2184       FETCH(&r[2], 1, CHAN_Y);
2185
2186       micro_mul( &r[1], &r[1], &r[2] );
2187       micro_add( &r[0], &r[0], &r[1] );
2188
2189       FETCH(&r[1], 0, CHAN_Z);
2190       FETCH(&r[2], 1, CHAN_Z);
2191
2192       micro_mul( &r[1], &r[1], &r[2] );
2193       micro_add( &r[0], &r[0], &r[1] );
2194
2195       FETCH(&r[1], 0, CHAN_W);
2196       FETCH(&r[2], 1, CHAN_W);
2197
2198       micro_mul( &r[1], &r[1], &r[2] );
2199       micro_add( &r[0], &r[0], &r[1] );
2200
2201      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2202         STORE( &r[0], 0, chan_index );
2203      }
2204      break;
2205
2206   case TGSI_OPCODE_DST:
2207      if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) {
2208         STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_X );
2209      }
2210
2211      if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
2212         FETCH( &r[0], 0, CHAN_Y );
2213         FETCH( &r[1], 1, CHAN_Y);
2214         micro_mul( &r[0], &r[0], &r[1] );
2215         STORE( &r[0], 0, CHAN_Y );
2216      }
2217
2218      if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
2219         FETCH( &r[0], 0, CHAN_Z );
2220         STORE( &r[0], 0, CHAN_Z );
2221      }
2222
2223      if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
2224         FETCH( &r[0], 1, CHAN_W );
2225         STORE( &r[0], 0, CHAN_W );
2226      }
2227      break;
2228
2229   case TGSI_OPCODE_MIN:
2230      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2231         FETCH(&r[0], 0, chan_index);
2232         FETCH(&r[1], 1, chan_index);
2233
2234         /* XXX use micro_min()?? */
2235         micro_lt( &r[0], &r[0], &r[1], &r[0], &r[1] );
2236
2237         STORE(&r[0], 0, chan_index);
2238      }
2239      break;
2240
2241   case TGSI_OPCODE_MAX:
2242      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2243         FETCH(&r[0], 0, chan_index);
2244         FETCH(&r[1], 1, chan_index);
2245
2246         /* XXX use micro_max()?? */
2247         micro_lt( &r[0], &r[0], &r[1], &r[1], &r[0] );
2248
2249         STORE(&r[0], 0, chan_index );
2250      }
2251      break;
2252
2253   case TGSI_OPCODE_SLT:
2254   /* TGSI_OPCODE_SETLT */
2255      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2256         FETCH( &r[0], 0, chan_index );
2257         FETCH( &r[1], 1, chan_index );
2258         micro_lt( &r[0], &r[0], &r[1], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] );
2259         STORE( &r[0], 0, chan_index );
2260      }
2261      break;
2262
2263   case TGSI_OPCODE_SGE:
2264   /* TGSI_OPCODE_SETGE */
2265      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2266         FETCH( &r[0], 0, chan_index );
2267         FETCH( &r[1], 1, chan_index );
2268         micro_le( &r[0], &r[1], &r[0], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] );
2269         STORE( &r[0], 0, chan_index );
2270      }
2271      break;
2272
2273   case TGSI_OPCODE_MAD:
2274   /* TGSI_OPCODE_MADD */
2275      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2276         FETCH( &r[0], 0, chan_index );
2277         FETCH( &r[1], 1, chan_index );
2278         micro_mul( &r[0], &r[0], &r[1] );
2279         FETCH( &r[1], 2, chan_index );
2280         micro_add( &r[0], &r[0], &r[1] );
2281         STORE( &r[0], 0, chan_index );
2282      }
2283      break;
2284
2285   case TGSI_OPCODE_SUB:
2286      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2287         FETCH(&r[0], 0, chan_index);
2288         FETCH(&r[1], 1, chan_index);
2289
2290         micro_sub( &r[0], &r[0], &r[1] );
2291
2292         STORE(&r[0], 0, chan_index);
2293      }
2294      break;
2295
2296   case TGSI_OPCODE_LRP:
2297      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2298         FETCH(&r[0], 0, chan_index);
2299         FETCH(&r[1], 1, chan_index);
2300         FETCH(&r[2], 2, chan_index);
2301
2302         micro_sub( &r[1], &r[1], &r[2] );
2303         micro_mul( &r[0], &r[0], &r[1] );
2304         micro_add( &r[0], &r[0], &r[2] );
2305
2306         STORE(&r[0], 0, chan_index);
2307      }
2308      break;
2309
2310   case TGSI_OPCODE_CND:
2311      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2312         FETCH(&r[0], 0, chan_index);
2313         FETCH(&r[1], 1, chan_index);
2314         FETCH(&r[2], 2, chan_index);
2315         micro_lt(&r[0], &mach->Temps[TEMP_HALF_I].xyzw[TEMP_HALF_C], &r[2], &r[0], &r[1]);
2316         STORE(&r[0], 0, chan_index);
2317      }
2318      break;
2319
2320   case TGSI_OPCODE_DP2A:
2321      FETCH( &r[0], 0, CHAN_X );
2322      FETCH( &r[1], 1, CHAN_X );
2323      micro_mul( &r[0], &r[0], &r[1] );
2324
2325      FETCH( &r[1], 0, CHAN_Y );
2326      FETCH( &r[2], 1, CHAN_Y );
2327      micro_mul( &r[1], &r[1], &r[2] );
2328      micro_add( &r[0], &r[0], &r[1] );
2329
2330      FETCH( &r[2], 2, CHAN_X );
2331      micro_add( &r[0], &r[0], &r[2] );
2332
2333      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2334         STORE( &r[0], 0, chan_index );
2335      }
2336      break;
2337
2338   case TGSI_OPCODE_FRC:
2339      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2340         FETCH( &r[0], 0, chan_index );
2341         micro_frc( &r[0], &r[0] );
2342         STORE( &r[0], 0, chan_index );
2343      }
2344      break;
2345
2346   case TGSI_OPCODE_CLAMP:
2347      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2348         FETCH(&r[0], 0, chan_index);
2349         FETCH(&r[1], 1, chan_index);
2350         micro_max(&r[0], &r[0], &r[1]);
2351         FETCH(&r[1], 2, chan_index);
2352         micro_min(&r[0], &r[0], &r[1]);
2353         STORE(&r[0], 0, chan_index);
2354      }
2355      break;
2356
2357   case TGSI_OPCODE_ROUND:
2358   case TGSI_OPCODE_ARR:
2359      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2360         FETCH( &r[0], 0, chan_index );
2361         micro_rnd( &r[0], &r[0] );
2362         STORE( &r[0], 0, chan_index );
2363      }
2364      break;
2365
2366   case TGSI_OPCODE_EX2:
2367      FETCH(&r[0], 0, CHAN_X);
2368
2369#if FAST_MATH
2370      micro_exp2( &r[0], &r[0] );
2371#else
2372      micro_pow( &r[0], &mach->Temps[TEMP_2_I].xyzw[TEMP_2_C], &r[0] );
2373#endif
2374
2375      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2376         STORE( &r[0], 0, chan_index );
2377      }
2378      break;
2379
2380   case TGSI_OPCODE_LG2:
2381      FETCH( &r[0], 0, CHAN_X );
2382      micro_lg2( &r[0], &r[0] );
2383      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2384         STORE( &r[0], 0, chan_index );
2385      }
2386      break;
2387
2388   case TGSI_OPCODE_POW:
2389      FETCH(&r[0], 0, CHAN_X);
2390      FETCH(&r[1], 1, CHAN_X);
2391
2392      micro_pow( &r[0], &r[0], &r[1] );
2393
2394      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2395         STORE( &r[0], 0, chan_index );
2396      }
2397      break;
2398
2399   case TGSI_OPCODE_XPD:
2400      FETCH(&r[0], 0, CHAN_Y);
2401      FETCH(&r[1], 1, CHAN_Z);
2402
2403      micro_mul( &r[2], &r[0], &r[1] );
2404
2405      FETCH(&r[3], 0, CHAN_Z);
2406      FETCH(&r[4], 1, CHAN_Y);
2407
2408      micro_mul( &r[5], &r[3], &r[4] );
2409      micro_sub( &r[2], &r[2], &r[5] );
2410
2411      if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) {
2412         STORE( &r[2], 0, CHAN_X );
2413      }
2414
2415      FETCH(&r[2], 1, CHAN_X);
2416
2417      micro_mul( &r[3], &r[3], &r[2] );
2418
2419      FETCH(&r[5], 0, CHAN_X);
2420
2421      micro_mul( &r[1], &r[1], &r[5] );
2422      micro_sub( &r[3], &r[3], &r[1] );
2423
2424      if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
2425         STORE( &r[3], 0, CHAN_Y );
2426      }
2427
2428      micro_mul( &r[5], &r[5], &r[4] );
2429      micro_mul( &r[0], &r[0], &r[2] );
2430      micro_sub( &r[5], &r[5], &r[0] );
2431
2432      if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
2433         STORE( &r[5], 0, CHAN_Z );
2434      }
2435
2436      if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
2437         STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
2438      }
2439      break;
2440
2441    case TGSI_OPCODE_ABS:
2442       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2443          FETCH(&r[0], 0, chan_index);
2444
2445          micro_abs( &r[0], &r[0] );
2446
2447          STORE(&r[0], 0, chan_index);
2448       }
2449       break;
2450
2451   case TGSI_OPCODE_RCC:
2452      FETCH(&r[0], 0, CHAN_X);
2453      micro_div(&r[0], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &r[0]);
2454      micro_float_clamp(&r[0], &r[0]);
2455      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2456         STORE(&r[0], 0, chan_index);
2457      }
2458      break;
2459
2460   case TGSI_OPCODE_DPH:
2461      FETCH(&r[0], 0, CHAN_X);
2462      FETCH(&r[1], 1, CHAN_X);
2463
2464      micro_mul( &r[0], &r[0], &r[1] );
2465
2466      FETCH(&r[1], 0, CHAN_Y);
2467      FETCH(&r[2], 1, CHAN_Y);
2468
2469      micro_mul( &r[1], &r[1], &r[2] );
2470      micro_add( &r[0], &r[0], &r[1] );
2471
2472      FETCH(&r[1], 0, CHAN_Z);
2473      FETCH(&r[2], 1, CHAN_Z);
2474
2475      micro_mul( &r[1], &r[1], &r[2] );
2476      micro_add( &r[0], &r[0], &r[1] );
2477
2478      FETCH(&r[1], 1, CHAN_W);
2479
2480      micro_add( &r[0], &r[0], &r[1] );
2481
2482      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2483         STORE( &r[0], 0, chan_index );
2484      }
2485      break;
2486
2487   case TGSI_OPCODE_COS:
2488      FETCH(&r[0], 0, CHAN_X);
2489
2490      micro_cos( &r[0], &r[0] );
2491
2492      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2493         STORE( &r[0], 0, chan_index );
2494      }
2495      break;
2496
2497   case TGSI_OPCODE_DDX:
2498      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2499         FETCH( &r[0], 0, chan_index );
2500         micro_ddx( &r[0], &r[0] );
2501         STORE( &r[0], 0, chan_index );
2502      }
2503      break;
2504
2505   case TGSI_OPCODE_DDY:
2506      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2507         FETCH( &r[0], 0, chan_index );
2508         micro_ddy( &r[0], &r[0] );
2509         STORE( &r[0], 0, chan_index );
2510      }
2511      break;
2512
2513   case TGSI_OPCODE_KILP:
2514      exec_kilp (mach, inst);
2515      break;
2516
2517   case TGSI_OPCODE_KIL:
2518      exec_kil (mach, inst);
2519      break;
2520
2521   case TGSI_OPCODE_PK2H:
2522      assert (0);
2523      break;
2524
2525   case TGSI_OPCODE_PK2US:
2526      assert (0);
2527      break;
2528
2529   case TGSI_OPCODE_PK4B:
2530      assert (0);
2531      break;
2532
2533   case TGSI_OPCODE_PK4UB:
2534      assert (0);
2535      break;
2536
2537   case TGSI_OPCODE_RFL:
2538      if (IS_CHANNEL_ENABLED(*inst, CHAN_X) ||
2539          IS_CHANNEL_ENABLED(*inst, CHAN_Y) ||
2540          IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2541         /* r0 = dp3(src0, src0) */
2542         FETCH(&r[2], 0, CHAN_X);
2543         micro_mul(&r[0], &r[2], &r[2]);
2544         FETCH(&r[4], 0, CHAN_Y);
2545         micro_mul(&r[8], &r[4], &r[4]);
2546         micro_add(&r[0], &r[0], &r[8]);
2547         FETCH(&r[6], 0, CHAN_Z);
2548         micro_mul(&r[8], &r[6], &r[6]);
2549         micro_add(&r[0], &r[0], &r[8]);
2550
2551         /* r1 = dp3(src0, src1) */
2552         FETCH(&r[3], 1, CHAN_X);
2553         micro_mul(&r[1], &r[2], &r[3]);
2554         FETCH(&r[5], 1, CHAN_Y);
2555         micro_mul(&r[8], &r[4], &r[5]);
2556         micro_add(&r[1], &r[1], &r[8]);
2557         FETCH(&r[7], 1, CHAN_Z);
2558         micro_mul(&r[8], &r[6], &r[7]);
2559         micro_add(&r[1], &r[1], &r[8]);
2560
2561         /* r1 = 2 * r1 / r0 */
2562         micro_add(&r[1], &r[1], &r[1]);
2563         micro_div(&r[1], &r[1], &r[0]);
2564
2565         if (IS_CHANNEL_ENABLED(*inst, CHAN_X)) {
2566            micro_mul(&r[2], &r[2], &r[1]);
2567            micro_sub(&r[2], &r[2], &r[3]);
2568            STORE(&r[2], 0, CHAN_X);
2569         }
2570         if (IS_CHANNEL_ENABLED(*inst, CHAN_Y)) {
2571            micro_mul(&r[4], &r[4], &r[1]);
2572            micro_sub(&r[4], &r[4], &r[5]);
2573            STORE(&r[4], 0, CHAN_Y);
2574         }
2575         if (IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2576            micro_mul(&r[6], &r[6], &r[1]);
2577            micro_sub(&r[6], &r[6], &r[7]);
2578            STORE(&r[6], 0, CHAN_Z);
2579         }
2580      }
2581      if (IS_CHANNEL_ENABLED(*inst, CHAN_W)) {
2582         STORE(&mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W);
2583      }
2584      break;
2585
2586   case TGSI_OPCODE_SEQ:
2587      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2588         FETCH( &r[0], 0, chan_index );
2589         FETCH( &r[1], 1, chan_index );
2590         micro_eq( &r[0], &r[0], &r[1],
2591                   &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C],
2592                   &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] );
2593         STORE( &r[0], 0, chan_index );
2594      }
2595      break;
2596
2597   case TGSI_OPCODE_SFL:
2598      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2599         STORE(&mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], 0, chan_index);
2600      }
2601      break;
2602
2603   case TGSI_OPCODE_SGT:
2604      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2605         FETCH( &r[0], 0, chan_index );
2606         FETCH( &r[1], 1, chan_index );
2607         micro_le( &r[0], &r[0], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C] );
2608         STORE( &r[0], 0, chan_index );
2609      }
2610      break;
2611
2612   case TGSI_OPCODE_SIN:
2613      FETCH( &r[0], 0, CHAN_X );
2614      micro_sin( &r[0], &r[0] );
2615      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2616         STORE( &r[0], 0, chan_index );
2617      }
2618      break;
2619
2620   case TGSI_OPCODE_SLE:
2621      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2622         FETCH( &r[0], 0, chan_index );
2623         FETCH( &r[1], 1, chan_index );
2624         micro_le( &r[0], &r[0], &r[1], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] );
2625         STORE( &r[0], 0, chan_index );
2626      }
2627      break;
2628
2629   case TGSI_OPCODE_SNE:
2630      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2631         FETCH( &r[0], 0, chan_index );
2632         FETCH( &r[1], 1, chan_index );
2633         micro_eq( &r[0], &r[0], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C] );
2634         STORE( &r[0], 0, chan_index );
2635      }
2636      break;
2637
2638   case TGSI_OPCODE_STR:
2639      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2640         STORE(&mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, chan_index);
2641      }
2642      break;
2643
2644   case TGSI_OPCODE_TEX:
2645      /* simple texture lookup */
2646      /* src[0] = texcoord */
2647      /* src[1] = sampler unit */
2648      exec_tex(mach, inst, FALSE, FALSE);
2649      break;
2650
2651   case TGSI_OPCODE_TXB:
2652      /* Texture lookup with lod bias */
2653      /* src[0] = texcoord (src[0].w = LOD bias) */
2654      /* src[1] = sampler unit */
2655      exec_tex(mach, inst, TRUE, FALSE);
2656      break;
2657
2658   case TGSI_OPCODE_TXD:
2659      /* Texture lookup with explicit partial derivatives */
2660      /* src[0] = texcoord */
2661      /* src[1] = d[strq]/dx */
2662      /* src[2] = d[strq]/dy */
2663      /* src[3] = sampler unit */
2664      assert (0);
2665      break;
2666
2667   case TGSI_OPCODE_TXL:
2668      /* Texture lookup with explicit LOD */
2669      /* src[0] = texcoord (src[0].w = LOD) */
2670      /* src[1] = sampler unit */
2671      exec_tex(mach, inst, TRUE, FALSE);
2672      break;
2673
2674   case TGSI_OPCODE_TXP:
2675      /* Texture lookup with projection */
2676      /* src[0] = texcoord (src[0].w = projection) */
2677      /* src[1] = sampler unit */
2678      exec_tex(mach, inst, FALSE, TRUE);
2679      break;
2680
2681   case TGSI_OPCODE_UP2H:
2682      assert (0);
2683      break;
2684
2685   case TGSI_OPCODE_UP2US:
2686      assert (0);
2687      break;
2688
2689   case TGSI_OPCODE_UP4B:
2690      assert (0);
2691      break;
2692
2693   case TGSI_OPCODE_UP4UB:
2694      assert (0);
2695      break;
2696
2697   case TGSI_OPCODE_X2D:
2698      FETCH(&r[0], 1, CHAN_X);
2699      FETCH(&r[1], 1, CHAN_Y);
2700      if (IS_CHANNEL_ENABLED(*inst, CHAN_X) ||
2701          IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2702         FETCH(&r[2], 2, CHAN_X);
2703         micro_mul(&r[2], &r[2], &r[0]);
2704         FETCH(&r[3], 2, CHAN_Y);
2705         micro_mul(&r[3], &r[3], &r[1]);
2706         micro_add(&r[2], &r[2], &r[3]);
2707         FETCH(&r[3], 0, CHAN_X);
2708         micro_add(&r[2], &r[2], &r[3]);
2709         if (IS_CHANNEL_ENABLED(*inst, CHAN_X)) {
2710            STORE(&r[2], 0, CHAN_X);
2711         }
2712         if (IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2713            STORE(&r[2], 0, CHAN_Z);
2714         }
2715      }
2716      if (IS_CHANNEL_ENABLED(*inst, CHAN_Y) ||
2717          IS_CHANNEL_ENABLED(*inst, CHAN_W)) {
2718         FETCH(&r[2], 2, CHAN_Z);
2719         micro_mul(&r[2], &r[2], &r[0]);
2720         FETCH(&r[3], 2, CHAN_W);
2721         micro_mul(&r[3], &r[3], &r[1]);
2722         micro_add(&r[2], &r[2], &r[3]);
2723         FETCH(&r[3], 0, CHAN_Y);
2724         micro_add(&r[2], &r[2], &r[3]);
2725         if (IS_CHANNEL_ENABLED(*inst, CHAN_Y)) {
2726            STORE(&r[2], 0, CHAN_Y);
2727         }
2728         if (IS_CHANNEL_ENABLED(*inst, CHAN_W)) {
2729            STORE(&r[2], 0, CHAN_W);
2730         }
2731      }
2732      break;
2733
2734   case TGSI_OPCODE_ARA:
2735      assert (0);
2736      break;
2737
2738   case TGSI_OPCODE_BRA:
2739      assert (0);
2740      break;
2741
2742   case TGSI_OPCODE_CAL:
2743      /* skip the call if no execution channels are enabled */
2744      if (mach->ExecMask) {
2745         /* do the call */
2746
2747         /* First, record the depths of the execution stacks.
2748          * This is important for deeply nested/looped return statements.
2749          * We have to unwind the stacks by the correct amount.  For a
2750          * real code generator, we could determine the number of entries
2751          * to pop off each stack with simple static analysis and avoid
2752          * implementing this data structure at run time.
2753          */
2754         mach->CallStack[mach->CallStackTop].CondStackTop = mach->CondStackTop;
2755         mach->CallStack[mach->CallStackTop].LoopStackTop = mach->LoopStackTop;
2756         mach->CallStack[mach->CallStackTop].ContStackTop = mach->ContStackTop;
2757         /* note that PC was already incremented above */
2758         mach->CallStack[mach->CallStackTop].ReturnAddr = *pc;
2759
2760         mach->CallStackTop++;
2761
2762         /* Second, push the Cond, Loop, Cont, Func stacks */
2763         assert(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING);
2764         mach->CondStack[mach->CondStackTop++] = mach->CondMask;
2765         assert(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
2766         mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask;
2767         assert(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
2768         mach->ContStack[mach->ContStackTop++] = mach->ContMask;
2769         assert(mach->FuncStackTop < TGSI_EXEC_MAX_CALL_NESTING);
2770         mach->FuncStack[mach->FuncStackTop++] = mach->FuncMask;
2771
2772         /* Finally, jump to the subroutine */
2773         *pc = inst->InstructionExtLabel.Label;
2774      }
2775      break;
2776
2777   case TGSI_OPCODE_RET:
2778      mach->FuncMask &= ~mach->ExecMask;
2779      UPDATE_EXEC_MASK(mach);
2780
2781      if (mach->FuncMask == 0x0) {
2782         /* really return now (otherwise, keep executing) */
2783
2784         if (mach->CallStackTop == 0) {
2785            /* returning from main() */
2786            *pc = -1;
2787            return;
2788         }
2789
2790         assert(mach->CallStackTop > 0);
2791         mach->CallStackTop--;
2792
2793         mach->CondStackTop = mach->CallStack[mach->CallStackTop].CondStackTop;
2794         mach->CondMask = mach->CondStack[mach->CondStackTop];
2795
2796         mach->LoopStackTop = mach->CallStack[mach->CallStackTop].LoopStackTop;
2797         mach->LoopMask = mach->LoopStack[mach->LoopStackTop];
2798
2799         mach->ContStackTop = mach->CallStack[mach->CallStackTop].ContStackTop;
2800         mach->ContMask = mach->ContStack[mach->ContStackTop];
2801
2802         assert(mach->FuncStackTop > 0);
2803         mach->FuncMask = mach->FuncStack[--mach->FuncStackTop];
2804
2805         *pc = mach->CallStack[mach->CallStackTop].ReturnAddr;
2806
2807         UPDATE_EXEC_MASK(mach);
2808      }
2809      break;
2810
2811   case TGSI_OPCODE_SSG:
2812   /* TGSI_OPCODE_SGN */
2813      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2814         FETCH( &r[0], 0, chan_index );
2815         micro_sgn( &r[0], &r[0] );
2816         STORE( &r[0], 0, chan_index );
2817      }
2818      break;
2819
2820   case TGSI_OPCODE_CMP:
2821      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2822         FETCH(&r[0], 0, chan_index);
2823         FETCH(&r[1], 1, chan_index);
2824         FETCH(&r[2], 2, chan_index);
2825
2826         micro_lt( &r[0], &r[0], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &r[1], &r[2] );
2827
2828         STORE(&r[0], 0, chan_index);
2829      }
2830      break;
2831
2832   case TGSI_OPCODE_SCS:
2833      if( IS_CHANNEL_ENABLED( *inst, CHAN_X ) || IS_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
2834         FETCH( &r[0], 0, CHAN_X );
2835         if (IS_CHANNEL_ENABLED(*inst, CHAN_X)) {
2836            micro_cos(&r[1], &r[0]);
2837            STORE(&r[1], 0, CHAN_X);
2838         }
2839         if (IS_CHANNEL_ENABLED(*inst, CHAN_Y)) {
2840            micro_sin(&r[1], &r[0]);
2841            STORE(&r[1], 0, CHAN_Y);
2842         }
2843      }
2844      if( IS_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
2845         STORE( &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], 0, CHAN_Z );
2846      }
2847      if( IS_CHANNEL_ENABLED( *inst, CHAN_W ) ) {
2848         STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
2849      }
2850      break;
2851
2852   case TGSI_OPCODE_NRM:
2853      /* 3-component vector normalize */
2854      if(IS_CHANNEL_ENABLED(*inst, CHAN_X) ||
2855         IS_CHANNEL_ENABLED(*inst, CHAN_Y) ||
2856         IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2857         /* r3 = sqrt(dp3(src0, src0)) */
2858         FETCH(&r[0], 0, CHAN_X);
2859         micro_mul(&r[3], &r[0], &r[0]);
2860         FETCH(&r[1], 0, CHAN_Y);
2861         micro_mul(&r[4], &r[1], &r[1]);
2862         micro_add(&r[3], &r[3], &r[4]);
2863         FETCH(&r[2], 0, CHAN_Z);
2864         micro_mul(&r[4], &r[2], &r[2]);
2865         micro_add(&r[3], &r[3], &r[4]);
2866         micro_sqrt(&r[3], &r[3]);
2867
2868         if (IS_CHANNEL_ENABLED(*inst, CHAN_X)) {
2869            micro_div(&r[0], &r[0], &r[3]);
2870            STORE(&r[0], 0, CHAN_X);
2871         }
2872         if (IS_CHANNEL_ENABLED(*inst, CHAN_Y)) {
2873            micro_div(&r[1], &r[1], &r[3]);
2874            STORE(&r[1], 0, CHAN_Y);
2875         }
2876         if (IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2877            micro_div(&r[2], &r[2], &r[3]);
2878            STORE(&r[2], 0, CHAN_Z);
2879         }
2880      }
2881      if (IS_CHANNEL_ENABLED(*inst, CHAN_W)) {
2882         STORE(&mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W);
2883      }
2884      break;
2885
2886   case TGSI_OPCODE_NRM4:
2887      /* 4-component vector normalize */
2888      {
2889         union tgsi_exec_channel tmp, dot;
2890
2891         /* tmp = dp4(src0, src0): */
2892         FETCH( &r[0], 0, CHAN_X );
2893         micro_mul( &tmp, &r[0], &r[0] );
2894
2895         FETCH( &r[1], 0, CHAN_Y );
2896         micro_mul( &dot, &r[1], &r[1] );
2897         micro_add( &tmp, &tmp, &dot );
2898
2899         FETCH( &r[2], 0, CHAN_Z );
2900         micro_mul( &dot, &r[2], &r[2] );
2901         micro_add( &tmp, &tmp, &dot );
2902
2903         FETCH( &r[3], 0, CHAN_W );
2904         micro_mul( &dot, &r[3], &r[3] );
2905         micro_add( &tmp, &tmp, &dot );
2906
2907         /* tmp = 1 / sqrt(tmp) */
2908         micro_sqrt( &tmp, &tmp );
2909         micro_div( &tmp, &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &tmp );
2910
2911         FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2912            /* chan = chan * tmp */
2913            micro_mul( &r[chan_index], &tmp, &r[chan_index] );
2914            STORE( &r[chan_index], 0, chan_index );
2915         }
2916      }
2917      break;
2918
2919   case TGSI_OPCODE_DIV:
2920      assert( 0 );
2921      break;
2922
2923   case TGSI_OPCODE_DP2:
2924      FETCH( &r[0], 0, CHAN_X );
2925      FETCH( &r[1], 1, CHAN_X );
2926      micro_mul( &r[0], &r[0], &r[1] );
2927
2928      FETCH( &r[1], 0, CHAN_Y );
2929      FETCH( &r[2], 1, CHAN_Y );
2930      micro_mul( &r[1], &r[1], &r[2] );
2931      micro_add( &r[0], &r[0], &r[1] );
2932
2933      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2934         STORE( &r[0], 0, chan_index );
2935      }
2936      break;
2937
2938   case TGSI_OPCODE_IF:
2939      /* push CondMask */
2940      assert(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING);
2941      mach->CondStack[mach->CondStackTop++] = mach->CondMask;
2942      FETCH( &r[0], 0, CHAN_X );
2943      /* update CondMask */
2944      if( ! r[0].u[0] ) {
2945         mach->CondMask &= ~0x1;
2946      }
2947      if( ! r[0].u[1] ) {
2948         mach->CondMask &= ~0x2;
2949      }
2950      if( ! r[0].u[2] ) {
2951         mach->CondMask &= ~0x4;
2952      }
2953      if( ! r[0].u[3] ) {
2954         mach->CondMask &= ~0x8;
2955      }
2956      UPDATE_EXEC_MASK(mach);
2957      /* Todo: If CondMask==0, jump to ELSE */
2958      break;
2959
2960   case TGSI_OPCODE_ELSE:
2961      /* invert CondMask wrt previous mask */
2962      {
2963         uint prevMask;
2964         assert(mach->CondStackTop > 0);
2965         prevMask = mach->CondStack[mach->CondStackTop - 1];
2966         mach->CondMask = ~mach->CondMask & prevMask;
2967         UPDATE_EXEC_MASK(mach);
2968         /* Todo: If CondMask==0, jump to ENDIF */
2969      }
2970      break;
2971
2972   case TGSI_OPCODE_ENDIF:
2973      /* pop CondMask */
2974      assert(mach->CondStackTop > 0);
2975      mach->CondMask = mach->CondStack[--mach->CondStackTop];
2976      UPDATE_EXEC_MASK(mach);
2977      break;
2978
2979   case TGSI_OPCODE_END:
2980      /* halt execution */
2981      *pc = -1;
2982      break;
2983
2984   case TGSI_OPCODE_REP:
2985      assert (0);
2986      break;
2987
2988   case TGSI_OPCODE_ENDREP:
2989       assert (0);
2990       break;
2991
2992   case TGSI_OPCODE_PUSHA:
2993      assert (0);
2994      break;
2995
2996   case TGSI_OPCODE_POPA:
2997      assert (0);
2998      break;
2999
3000   case TGSI_OPCODE_CEIL:
3001      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
3002         FETCH( &r[0], 0, chan_index );
3003         micro_ceil( &r[0], &r[0] );
3004         STORE( &r[0], 0, chan_index );
3005      }
3006      break;
3007
3008   case TGSI_OPCODE_I2F:
3009      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
3010         FETCH( &r[0], 0, chan_index );
3011         micro_i2f( &r[0], &r[0] );
3012         STORE( &r[0], 0, chan_index );
3013      }
3014      break;
3015
3016   case TGSI_OPCODE_NOT:
3017      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
3018         FETCH( &r[0], 0, chan_index );
3019         micro_not( &r[0], &r[0] );
3020         STORE( &r[0], 0, chan_index );
3021      }
3022      break;
3023
3024   case TGSI_OPCODE_TRUNC:
3025      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
3026         FETCH( &r[0], 0, chan_index );
3027         micro_trunc( &r[0], &r[0] );
3028         STORE( &r[0], 0, chan_index );
3029      }
3030      break;
3031
3032   case TGSI_OPCODE_SHL:
3033      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
3034         FETCH( &r[0], 0, chan_index );
3035         FETCH( &r[1], 1, chan_index );
3036         micro_shl( &r[0], &r[0], &r[1] );
3037         STORE( &r[0], 0, chan_index );
3038      }
3039      break;
3040
3041   case TGSI_OPCODE_SHR:
3042      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
3043         FETCH( &r[0], 0, chan_index );
3044         FETCH( &r[1], 1, chan_index );
3045         micro_ishr( &r[0], &r[0], &r[1] );
3046         STORE( &r[0], 0, chan_index );
3047      }
3048      break;
3049
3050   case TGSI_OPCODE_AND:
3051      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
3052         FETCH( &r[0], 0, chan_index );
3053         FETCH( &r[1], 1, chan_index );
3054         micro_and( &r[0], &r[0], &r[1] );
3055         STORE( &r[0], 0, chan_index );
3056      }
3057      break;
3058
3059   case TGSI_OPCODE_OR:
3060      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
3061         FETCH( &r[0], 0, chan_index );
3062         FETCH( &r[1], 1, chan_index );
3063         micro_or( &r[0], &r[0], &r[1] );
3064         STORE( &r[0], 0, chan_index );
3065      }
3066      break;
3067
3068   case TGSI_OPCODE_MOD:
3069      assert (0);
3070      break;
3071
3072   case TGSI_OPCODE_XOR:
3073      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
3074         FETCH( &r[0], 0, chan_index );
3075         FETCH( &r[1], 1, chan_index );
3076         micro_xor( &r[0], &r[0], &r[1] );
3077         STORE( &r[0], 0, chan_index );
3078      }
3079      break;
3080
3081   case TGSI_OPCODE_SAD:
3082      assert (0);
3083      break;
3084
3085   case TGSI_OPCODE_TXF:
3086      assert (0);
3087      break;
3088
3089   case TGSI_OPCODE_TXQ:
3090      assert (0);
3091      break;
3092
3093   case TGSI_OPCODE_EMIT:
3094      mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] += 16;
3095      mach->Primitives[mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]]++;
3096      break;
3097
3098   case TGSI_OPCODE_ENDPRIM:
3099      mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]++;
3100      mach->Primitives[mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]] = 0;
3101      break;
3102
3103   case TGSI_OPCODE_BGNFOR:
3104      assert(mach->LoopCounterStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
3105      for (chan_index = 0; chan_index < 3; chan_index++) {
3106         FETCH( &mach->LoopCounterStack[mach->LoopCounterStackTop].xyzw[chan_index], 0, chan_index );
3107      }
3108      STORE( &mach->LoopCounterStack[mach->LoopCounterStackTop].xyzw[CHAN_Y], 0, CHAN_X );
3109      ++mach->LoopCounterStackTop;
3110      /* fall-through (for now) */
3111   case TGSI_OPCODE_BGNLOOP:
3112      /* push LoopMask and ContMasks */
3113      assert(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
3114      mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask;
3115      assert(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
3116      mach->ContStack[mach->ContStackTop++] = mach->ContMask;
3117      assert(mach->LoopLabelStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
3118      mach->LoopLabelStack[mach->LoopLabelStackTop++] = *pc - 1;
3119      break;
3120
3121   case TGSI_OPCODE_ENDFOR:
3122      assert(mach->LoopCounterStackTop > 0);
3123      micro_sub( &mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_X],
3124                 &mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_X],
3125                 &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C] );
3126      /* update LoopMask */
3127      if( mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_X].f[0] <= 0) {
3128         mach->LoopMask &= ~0x1;
3129      }
3130      if( mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_X].f[1] <= 0 ) {
3131         mach->LoopMask &= ~0x2;
3132      }
3133      if( mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_X].f[2] <= 0 ) {
3134         mach->LoopMask &= ~0x4;
3135      }
3136      if( mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_X].f[3] <= 0 ) {
3137         mach->LoopMask &= ~0x8;
3138      }
3139      micro_add( &mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y],
3140                 &mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y],
3141                 &mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Z]);
3142      assert(mach->LoopLabelStackTop > 0);
3143      inst = mach->Instructions + mach->LoopLabelStack[mach->LoopLabelStackTop - 1];
3144      STORE( &mach->LoopCounterStack[mach->LoopCounterStackTop].xyzw[CHAN_Y], 0, CHAN_X );
3145      /* Restore ContMask, but don't pop */
3146      assert(mach->ContStackTop > 0);
3147      mach->ContMask = mach->ContStack[mach->ContStackTop - 1];
3148      UPDATE_EXEC_MASK(mach);
3149      if (mach->ExecMask) {
3150         /* repeat loop: jump to instruction just past BGNLOOP */
3151         assert(mach->LoopLabelStackTop > 0);
3152         *pc = mach->LoopLabelStack[mach->LoopLabelStackTop - 1] + 1;
3153      }
3154      else {
3155         /* exit loop: pop LoopMask */
3156         assert(mach->LoopStackTop > 0);
3157         mach->LoopMask = mach->LoopStack[--mach->LoopStackTop];
3158         /* pop ContMask */
3159         assert(mach->ContStackTop > 0);
3160         mach->ContMask = mach->ContStack[--mach->ContStackTop];
3161         assert(mach->LoopLabelStackTop > 0);
3162         --mach->LoopLabelStackTop;
3163         assert(mach->LoopCounterStackTop > 0);
3164         --mach->LoopCounterStackTop;
3165      }
3166      UPDATE_EXEC_MASK(mach);
3167      break;
3168
3169   case TGSI_OPCODE_ENDLOOP:
3170      /* Restore ContMask, but don't pop */
3171      assert(mach->ContStackTop > 0);
3172      mach->ContMask = mach->ContStack[mach->ContStackTop - 1];
3173      UPDATE_EXEC_MASK(mach);
3174      if (mach->ExecMask) {
3175         /* repeat loop: jump to instruction just past BGNLOOP */
3176         assert(mach->LoopLabelStackTop > 0);
3177         *pc = mach->LoopLabelStack[mach->LoopLabelStackTop - 1] + 1;
3178      }
3179      else {
3180         /* exit loop: pop LoopMask */
3181         assert(mach->LoopStackTop > 0);
3182         mach->LoopMask = mach->LoopStack[--mach->LoopStackTop];
3183         /* pop ContMask */
3184         assert(mach->ContStackTop > 0);
3185         mach->ContMask = mach->ContStack[--mach->ContStackTop];
3186         assert(mach->LoopLabelStackTop > 0);
3187         --mach->LoopLabelStackTop;
3188      }
3189      UPDATE_EXEC_MASK(mach);
3190      break;
3191
3192   case TGSI_OPCODE_BRK:
3193      /* turn off loop channels for each enabled exec channel */
3194      mach->LoopMask &= ~mach->ExecMask;
3195      /* Todo: if mach->LoopMask == 0, jump to end of loop */
3196      UPDATE_EXEC_MASK(mach);
3197      break;
3198
3199   case TGSI_OPCODE_CONT:
3200      /* turn off cont channels for each enabled exec channel */
3201      mach->ContMask &= ~mach->ExecMask;
3202      /* Todo: if mach->LoopMask == 0, jump to end of loop */
3203      UPDATE_EXEC_MASK(mach);
3204      break;
3205
3206   case TGSI_OPCODE_BGNSUB:
3207      /* no-op */
3208      break;
3209
3210   case TGSI_OPCODE_ENDSUB:
3211      /* no-op */
3212      break;
3213
3214   case TGSI_OPCODE_NOP:
3215      break;
3216
3217   default:
3218      assert( 0 );
3219   }
3220}
3221
3222
3223/**
3224 * Run TGSI interpreter.
3225 * \return bitmask of "alive" quad components
3226 */
3227uint
3228tgsi_exec_machine_run( struct tgsi_exec_machine *mach )
3229{
3230   uint i;
3231   int pc = 0;
3232
3233   mach->CondMask = 0xf;
3234   mach->LoopMask = 0xf;
3235   mach->ContMask = 0xf;
3236   mach->FuncMask = 0xf;
3237   mach->ExecMask = 0xf;
3238
3239   assert(mach->CondStackTop == 0);
3240   assert(mach->LoopStackTop == 0);
3241   assert(mach->ContStackTop == 0);
3242   assert(mach->CallStackTop == 0);
3243
3244   mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] = 0;
3245   mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] = 0;
3246
3247   if( mach->Processor == TGSI_PROCESSOR_GEOMETRY ) {
3248      mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0] = 0;
3249      mach->Primitives[0] = 0;
3250   }
3251
3252   for (i = 0; i < QUAD_SIZE; i++) {
3253      mach->Temps[TEMP_CC_I].xyzw[TEMP_CC_C].u[i] =
3254         (TGSI_EXEC_CC_EQ << TGSI_EXEC_CC_X_SHIFT) |
3255         (TGSI_EXEC_CC_EQ << TGSI_EXEC_CC_Y_SHIFT) |
3256         (TGSI_EXEC_CC_EQ << TGSI_EXEC_CC_Z_SHIFT) |
3257         (TGSI_EXEC_CC_EQ << TGSI_EXEC_CC_W_SHIFT);
3258   }
3259
3260   /* execute declarations (interpolants) */
3261   for (i = 0; i < mach->NumDeclarations; i++) {
3262      exec_declaration( mach, mach->Declarations+i );
3263   }
3264
3265   /* execute instructions, until pc is set to -1 */
3266   while (pc != -1) {
3267      assert(pc < (int) mach->NumInstructions);
3268      exec_instruction( mach, mach->Instructions + pc, &pc );
3269   }
3270
3271#if 0
3272   /* we scale from floats in [0,1] to Zbuffer ints in sp_quad_depth_test.c */
3273   if (mach->Processor == TGSI_PROCESSOR_FRAGMENT) {
3274      /*
3275       * Scale back depth component.
3276       */
3277      for (i = 0; i < 4; i++)
3278         mach->Outputs[0].xyzw[2].f[i] *= ctx->DrawBuffer->_DepthMaxF;
3279   }
3280#endif
3281
3282   return ~mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
3283}
3284