tgsi_exec.c revision 823aac36d5580ea46f76ccec3fd31c91f168274e
1/**************************************************************************
2 *
3 * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28/**
29 * TGSI interpreter/executor.
30 *
31 * Flow control information:
32 *
33 * Since we operate on 'quads' (4 pixels or 4 vertices in parallel)
34 * flow control statements (IF/ELSE/ENDIF, LOOP/ENDLOOP) require special
35 * care since a condition may be true for some quad components but false
36 * for other components.
37 *
38 * We basically execute all statements (even if they're in the part of
39 * an IF/ELSE clause that's "not taken") and use a special mask to
40 * control writing to destination registers.  This is the ExecMask.
41 * See store_dest().
42 *
 * The ExecMask is computed from four other masks (CondMask, LoopMask,
 * ContMask and FuncMask; see UPDATE_EXEC_MASK) which are controlled by the
 * flow control instructions (including IF/ELSE/ENDIF, LOOP/ENDLOOP and CONT).
46 *
47 *
48 * Authors:
49 *   Michal Krol
50 *   Brian Paul
51 */
52
53#include "pipe/p_compiler.h"
54#include "pipe/p_state.h"
55#include "pipe/p_shader_tokens.h"
56#include "tgsi/tgsi_parse.h"
57#include "tgsi/tgsi_util.h"
58#include "tgsi_exec.h"
59#include "util/u_memory.h"
60#include "util/u_math.h"
61
/*
 * When non-zero, micro_exp2(), micro_lg2() and micro_pow() call the
 * util_fast_*() helpers instead of powf()/logf().
 */
#define FAST_MATH 1

/*
 * Lane indices into a channel's 4-element arrays, named by tile position.
 * micro_ddx() and micro_ddy() use them to pick the lanes they subtract.
 */
#define TILE_TOP_LEFT     0
#define TILE_TOP_RIGHT    1
#define TILE_BOTTOM_LEFT  2
#define TILE_BOTTOM_RIGHT 3

/* Indices of the X, Y, Z and W channels. */
#define CHAN_X  0
#define CHAN_Y  1
#define CHAN_Z  2
#define CHAN_W  3
73
/*
 * Shorthand locations of various utility registers (_I = Index, _C = Channel).
 *
 * Each name is an alias for the corresponding TGSI_EXEC_TEMP_* definition.
 * As used in tgsi_exec_machine_init(), the _I value indexes mach->Temps and
 * the _C value selects the entry of that register's xyzw array.
 */
#define TEMP_0_I           TGSI_EXEC_TEMP_00000000_I
#define TEMP_0_C           TGSI_EXEC_TEMP_00000000_C
#define TEMP_7F_I          TGSI_EXEC_TEMP_7FFFFFFF_I
#define TEMP_7F_C          TGSI_EXEC_TEMP_7FFFFFFF_C
#define TEMP_80_I          TGSI_EXEC_TEMP_80000000_I
#define TEMP_80_C          TGSI_EXEC_TEMP_80000000_C
#define TEMP_FF_I          TGSI_EXEC_TEMP_FFFFFFFF_I
#define TEMP_FF_C          TGSI_EXEC_TEMP_FFFFFFFF_C
#define TEMP_1_I           TGSI_EXEC_TEMP_ONE_I
#define TEMP_1_C           TGSI_EXEC_TEMP_ONE_C
#define TEMP_2_I           TGSI_EXEC_TEMP_TWO_I
#define TEMP_2_C           TGSI_EXEC_TEMP_TWO_C
#define TEMP_128_I         TGSI_EXEC_TEMP_128_I
#define TEMP_128_C         TGSI_EXEC_TEMP_128_C
#define TEMP_M128_I        TGSI_EXEC_TEMP_MINUS_128_I
#define TEMP_M128_C        TGSI_EXEC_TEMP_MINUS_128_C
#define TEMP_KILMASK_I     TGSI_EXEC_TEMP_KILMASK_I
#define TEMP_KILMASK_C     TGSI_EXEC_TEMP_KILMASK_C
#define TEMP_OUTPUT_I      TGSI_EXEC_TEMP_OUTPUT_I
#define TEMP_OUTPUT_C      TGSI_EXEC_TEMP_OUTPUT_C
#define TEMP_PRIMITIVE_I   TGSI_EXEC_TEMP_PRIMITIVE_I
#define TEMP_PRIMITIVE_C   TGSI_EXEC_TEMP_PRIMITIVE_C
#define TEMP_CC_I          TGSI_EXEC_TEMP_CC_I
#define TEMP_CC_C          TGSI_EXEC_TEMP_CC_C
#define TEMP_3_I           TGSI_EXEC_TEMP_THREE_I
#define TEMP_3_C           TGSI_EXEC_TEMP_THREE_C
#define TEMP_HALF_I        TGSI_EXEC_TEMP_HALF_I
#define TEMP_HALF_C        TGSI_EXEC_TEMP_HALF_C
#define TEMP_R0            TGSI_EXEC_TEMP_R0
106
/**
 * Non-zero when bit CHAN is set in the write mask of INST's first
 * destination register (FullDstRegisters[0]).
 */
#define IS_CHANNEL_ENABLED(INST, CHAN)\
   ((INST).FullDstRegisters[0].DstRegister.WriteMask & (1 << (CHAN)))

/**
 * Same test as IS_CHANNEL_ENABLED, but against the second destination
 * register (FullDstRegisters[1]).
 */
#define IS_CHANNEL_ENABLED2(INST, CHAN)\
   ((INST).FullDstRegisters[1].DstRegister.WriteMask & (1 << (CHAN)))

/**
 * Loop header: runs the following statement once for every CHAN in
 * [0, NUM_CHANNELS) that is enabled in the first destination's write mask.
 * CHAN must be an assignable variable.  The expansion ends in a bare "if",
 * so an "else" written directly after the loop body would bind to it.
 */
#define FOR_EACH_ENABLED_CHANNEL(INST, CHAN)\
   for (CHAN = 0; CHAN < NUM_CHANNELS; CHAN++)\
      if (IS_CHANNEL_ENABLED( INST, CHAN ))

/**
 * Same as FOR_EACH_ENABLED_CHANNEL, but filtered by the second
 * destination's write mask.
 */
#define FOR_EACH_ENABLED_CHANNEL2(INST, CHAN)\
   for (CHAN = 0; CHAN < NUM_CHANNELS; CHAN++)\
      if (IS_CHANNEL_ENABLED2( INST, CHAN ))
120
121
/**
 * Recompute MACH->ExecMask as the bitwise AND of the condition, loop,
 * continue and function masks (CondMask, LoopMask, ContMask, FuncMask).
 * MACH is a pointer expression; it is parenthesized at every use so that
 * arguments such as "p + 1" expand correctly.  Note that MACH is evaluated
 * more than once, so it must not have side effects.
 */
#define UPDATE_EXEC_MASK(MACH) \
      ((MACH)->ExecMask = (MACH)->CondMask & (MACH)->LoopMask & \
                          (MACH)->ContMask & (MACH)->FuncMask)
125
126/**
127 * Initialize machine state by expanding tokens to full instructions,
128 * allocating temporary storage, setting up constants, etc.
129 * After this, we can call tgsi_exec_machine_run() many times.
130 */
131void
132tgsi_exec_machine_bind_shader(
133   struct tgsi_exec_machine *mach,
134   const struct tgsi_token *tokens,
135   uint numSamplers,
136   struct tgsi_sampler **samplers)
137{
138   uint k;
139   struct tgsi_parse_context parse;
140   struct tgsi_exec_labels *labels = &mach->Labels;
141   struct tgsi_full_instruction *instructions;
142   struct tgsi_full_declaration *declarations;
143   uint maxInstructions = 10, numInstructions = 0;
144   uint maxDeclarations = 10, numDeclarations = 0;
145   uint instno = 0;
146
147#if 0
148   tgsi_dump(tokens, 0);
149#endif
150
151   util_init_math();
152
153   mach->Tokens = tokens;
154   mach->Samplers = samplers;
155
156   k = tgsi_parse_init (&parse, mach->Tokens);
157   if (k != TGSI_PARSE_OK) {
158      debug_printf( "Problem parsing!\n" );
159      return;
160   }
161
162   mach->Processor = parse.FullHeader.Processor.Processor;
163   mach->ImmLimit = 0;
164   labels->count = 0;
165
166   declarations = (struct tgsi_full_declaration *)
167      MALLOC( maxDeclarations * sizeof(struct tgsi_full_declaration) );
168
169   if (!declarations) {
170      return;
171   }
172
173   instructions = (struct tgsi_full_instruction *)
174      MALLOC( maxInstructions * sizeof(struct tgsi_full_instruction) );
175
176   if (!instructions) {
177      FREE( declarations );
178      return;
179   }
180
181   while( !tgsi_parse_end_of_tokens( &parse ) ) {
182      uint pointer = parse.Position;
183      uint i;
184
185      tgsi_parse_token( &parse );
186      switch( parse.FullToken.Token.Type ) {
187      case TGSI_TOKEN_TYPE_DECLARATION:
188         /* save expanded declaration */
189         if (numDeclarations == maxDeclarations) {
190            declarations = REALLOC(declarations,
191                                   maxDeclarations
192                                   * sizeof(struct tgsi_full_declaration),
193                                   (maxDeclarations + 10)
194                                   * sizeof(struct tgsi_full_declaration));
195            maxDeclarations += 10;
196         }
197         memcpy(declarations + numDeclarations,
198                &parse.FullToken.FullDeclaration,
199                sizeof(declarations[0]));
200         numDeclarations++;
201         break;
202
203      case TGSI_TOKEN_TYPE_IMMEDIATE:
204         {
205            uint size = parse.FullToken.FullImmediate.Immediate.Size - 1;
206            assert( size % 4 == 0 );
207            assert( mach->ImmLimit + size / 4 <= TGSI_EXEC_NUM_IMMEDIATES );
208
209            for( i = 0; i < size; i++ ) {
210               mach->Imms[mach->ImmLimit + i / 4][i % 4] =
211		  parse.FullToken.FullImmediate.u.ImmediateFloat32[i].Float;
212            }
213            mach->ImmLimit += size / 4;
214         }
215         break;
216
217      case TGSI_TOKEN_TYPE_INSTRUCTION:
218         assert( labels->count < MAX_LABELS );
219
220         labels->labels[labels->count][0] = instno;
221         labels->labels[labels->count][1] = pointer;
222         labels->count++;
223
224         /* save expanded instruction */
225         if (numInstructions == maxInstructions) {
226            instructions = REALLOC(instructions,
227                                   maxInstructions
228                                   * sizeof(struct tgsi_full_instruction),
229                                   (maxInstructions + 10)
230                                   * sizeof(struct tgsi_full_instruction));
231            maxInstructions += 10;
232         }
233         memcpy(instructions + numInstructions,
234                &parse.FullToken.FullInstruction,
235                sizeof(instructions[0]));
236         numInstructions++;
237         break;
238
239      default:
240         assert( 0 );
241      }
242   }
243   tgsi_parse_free (&parse);
244
245   if (mach->Declarations) {
246      FREE( mach->Declarations );
247   }
248   mach->Declarations = declarations;
249   mach->NumDeclarations = numDeclarations;
250
251   if (mach->Instructions) {
252      FREE( mach->Instructions );
253   }
254   mach->Instructions = instructions;
255   mach->NumInstructions = numInstructions;
256}
257
258
259void
260tgsi_exec_machine_init(
261   struct tgsi_exec_machine *mach )
262{
263   uint i;
264
265   mach->Temps = (struct tgsi_exec_vector *) tgsi_align_128bit( mach->_Temps);
266   mach->Addrs = &mach->Temps[TGSI_EXEC_TEMP_ADDR];
267
268   /* Setup constants. */
269   for( i = 0; i < 4; i++ ) {
270      mach->Temps[TEMP_0_I].xyzw[TEMP_0_C].u[i] = 0x00000000;
271      mach->Temps[TEMP_7F_I].xyzw[TEMP_7F_C].u[i] = 0x7FFFFFFF;
272      mach->Temps[TEMP_80_I].xyzw[TEMP_80_C].u[i] = 0x80000000;
273      mach->Temps[TEMP_FF_I].xyzw[TEMP_FF_C].u[i] = 0xFFFFFFFF;
274      mach->Temps[TEMP_1_I].xyzw[TEMP_1_C].f[i] = 1.0f;
275      mach->Temps[TEMP_2_I].xyzw[TEMP_2_C].f[i] = 2.0f;
276      mach->Temps[TEMP_128_I].xyzw[TEMP_128_C].f[i] = 128.0f;
277      mach->Temps[TEMP_M128_I].xyzw[TEMP_M128_C].f[i] = -128.0f;
278      mach->Temps[TEMP_3_I].xyzw[TEMP_3_C].f[i] = 3.0f;
279      mach->Temps[TEMP_HALF_I].xyzw[TEMP_HALF_C].f[i] = 0.5f;
280   }
281}
282
283
284void
285tgsi_exec_machine_free_data(struct tgsi_exec_machine *mach)
286{
287   if (mach->Instructions) {
288      FREE(mach->Instructions);
289      mach->Instructions = NULL;
290      mach->NumInstructions = 0;
291   }
292   if (mach->Declarations) {
293      FREE(mach->Declarations);
294      mach->Declarations = NULL;
295      mach->NumDeclarations = 0;
296   }
297}
298
299
300static void
301micro_abs(
302   union tgsi_exec_channel *dst,
303   const union tgsi_exec_channel *src )
304{
305   dst->f[0] = fabsf( src->f[0] );
306   dst->f[1] = fabsf( src->f[1] );
307   dst->f[2] = fabsf( src->f[2] );
308   dst->f[3] = fabsf( src->f[3] );
309}
310
311static void
312micro_add(
313   union tgsi_exec_channel *dst,
314   const union tgsi_exec_channel *src0,
315   const union tgsi_exec_channel *src1 )
316{
317   dst->f[0] = src0->f[0] + src1->f[0];
318   dst->f[1] = src0->f[1] + src1->f[1];
319   dst->f[2] = src0->f[2] + src1->f[2];
320   dst->f[3] = src0->f[3] + src1->f[3];
321}
322
323static void
324micro_iadd(
325   union tgsi_exec_channel *dst,
326   const union tgsi_exec_channel *src0,
327   const union tgsi_exec_channel *src1 )
328{
329   dst->i[0] = src0->i[0] + src1->i[0];
330   dst->i[1] = src0->i[1] + src1->i[1];
331   dst->i[2] = src0->i[2] + src1->i[2];
332   dst->i[3] = src0->i[3] + src1->i[3];
333}
334
335static void
336micro_and(
337   union tgsi_exec_channel *dst,
338   const union tgsi_exec_channel *src0,
339   const union tgsi_exec_channel *src1 )
340{
341   dst->u[0] = src0->u[0] & src1->u[0];
342   dst->u[1] = src0->u[1] & src1->u[1];
343   dst->u[2] = src0->u[2] & src1->u[2];
344   dst->u[3] = src0->u[3] & src1->u[3];
345}
346
347static void
348micro_ceil(
349   union tgsi_exec_channel *dst,
350   const union tgsi_exec_channel *src )
351{
352   dst->f[0] = ceilf( src->f[0] );
353   dst->f[1] = ceilf( src->f[1] );
354   dst->f[2] = ceilf( src->f[2] );
355   dst->f[3] = ceilf( src->f[3] );
356}
357
358static void
359micro_cos(
360   union tgsi_exec_channel *dst,
361   const union tgsi_exec_channel *src )
362{
363   dst->f[0] = cosf( src->f[0] );
364   dst->f[1] = cosf( src->f[1] );
365   dst->f[2] = cosf( src->f[2] );
366   dst->f[3] = cosf( src->f[3] );
367}
368
369static void
370micro_ddx(
371   union tgsi_exec_channel *dst,
372   const union tgsi_exec_channel *src )
373{
374   dst->f[0] =
375   dst->f[1] =
376   dst->f[2] =
377   dst->f[3] = src->f[TILE_BOTTOM_RIGHT] - src->f[TILE_BOTTOM_LEFT];
378}
379
380static void
381micro_ddy(
382   union tgsi_exec_channel *dst,
383   const union tgsi_exec_channel *src )
384{
385   dst->f[0] =
386   dst->f[1] =
387   dst->f[2] =
388   dst->f[3] = src->f[TILE_TOP_LEFT] - src->f[TILE_BOTTOM_LEFT];
389}
390
391static void
392micro_div(
393   union tgsi_exec_channel *dst,
394   const union tgsi_exec_channel *src0,
395   const union tgsi_exec_channel *src1 )
396{
397   if (src1->f[0] != 0) {
398      dst->f[0] = src0->f[0] / src1->f[0];
399   }
400   if (src1->f[1] != 0) {
401      dst->f[1] = src0->f[1] / src1->f[1];
402   }
403   if (src1->f[2] != 0) {
404      dst->f[2] = src0->f[2] / src1->f[2];
405   }
406   if (src1->f[3] != 0) {
407      dst->f[3] = src0->f[3] / src1->f[3];
408   }
409}
410
411static void
412micro_udiv(
413   union tgsi_exec_channel *dst,
414   const union tgsi_exec_channel *src0,
415   const union tgsi_exec_channel *src1 )
416{
417   dst->u[0] = src0->u[0] / src1->u[0];
418   dst->u[1] = src0->u[1] / src1->u[1];
419   dst->u[2] = src0->u[2] / src1->u[2];
420   dst->u[3] = src0->u[3] / src1->u[3];
421}
422
423static void
424micro_eq(
425   union tgsi_exec_channel *dst,
426   const union tgsi_exec_channel *src0,
427   const union tgsi_exec_channel *src1,
428   const union tgsi_exec_channel *src2,
429   const union tgsi_exec_channel *src3 )
430{
431   dst->f[0] = src0->f[0] == src1->f[0] ? src2->f[0] : src3->f[0];
432   dst->f[1] = src0->f[1] == src1->f[1] ? src2->f[1] : src3->f[1];
433   dst->f[2] = src0->f[2] == src1->f[2] ? src2->f[2] : src3->f[2];
434   dst->f[3] = src0->f[3] == src1->f[3] ? src2->f[3] : src3->f[3];
435}
436
437static void
438micro_ieq(
439   union tgsi_exec_channel *dst,
440   const union tgsi_exec_channel *src0,
441   const union tgsi_exec_channel *src1,
442   const union tgsi_exec_channel *src2,
443   const union tgsi_exec_channel *src3 )
444{
445   dst->i[0] = src0->i[0] == src1->i[0] ? src2->i[0] : src3->i[0];
446   dst->i[1] = src0->i[1] == src1->i[1] ? src2->i[1] : src3->i[1];
447   dst->i[2] = src0->i[2] == src1->i[2] ? src2->i[2] : src3->i[2];
448   dst->i[3] = src0->i[3] == src1->i[3] ? src2->i[3] : src3->i[3];
449}
450
451static void
452micro_exp2(
453   union tgsi_exec_channel *dst,
454   const union tgsi_exec_channel *src)
455{
456#if FAST_MATH
457   dst->f[0] = util_fast_exp2( src->f[0] );
458   dst->f[1] = util_fast_exp2( src->f[1] );
459   dst->f[2] = util_fast_exp2( src->f[2] );
460   dst->f[3] = util_fast_exp2( src->f[3] );
461#else
462   dst->f[0] = powf( 2.0f, src->f[0] );
463   dst->f[1] = powf( 2.0f, src->f[1] );
464   dst->f[2] = powf( 2.0f, src->f[2] );
465   dst->f[3] = powf( 2.0f, src->f[3] );
466#endif
467}
468
469static void
470micro_f2ut(
471   union tgsi_exec_channel *dst,
472   const union tgsi_exec_channel *src )
473{
474   dst->u[0] = (uint) src->f[0];
475   dst->u[1] = (uint) src->f[1];
476   dst->u[2] = (uint) src->f[2];
477   dst->u[3] = (uint) src->f[3];
478}
479
480static void
481micro_flr(
482   union tgsi_exec_channel *dst,
483   const union tgsi_exec_channel *src )
484{
485   dst->f[0] = floorf( src->f[0] );
486   dst->f[1] = floorf( src->f[1] );
487   dst->f[2] = floorf( src->f[2] );
488   dst->f[3] = floorf( src->f[3] );
489}
490
491static void
492micro_frc(
493   union tgsi_exec_channel *dst,
494   const union tgsi_exec_channel *src )
495{
496   dst->f[0] = src->f[0] - floorf( src->f[0] );
497   dst->f[1] = src->f[1] - floorf( src->f[1] );
498   dst->f[2] = src->f[2] - floorf( src->f[2] );
499   dst->f[3] = src->f[3] - floorf( src->f[3] );
500}
501
502static void
503micro_ge(
504   union tgsi_exec_channel *dst,
505   const union tgsi_exec_channel *src0,
506   const union tgsi_exec_channel *src1,
507   const union tgsi_exec_channel *src2,
508   const union tgsi_exec_channel *src3 )
509{
510   dst->f[0] = src0->f[0] >= src1->f[0] ? src2->f[0] : src3->f[0];
511   dst->f[1] = src0->f[1] >= src1->f[1] ? src2->f[1] : src3->f[1];
512   dst->f[2] = src0->f[2] >= src1->f[2] ? src2->f[2] : src3->f[2];
513   dst->f[3] = src0->f[3] >= src1->f[3] ? src2->f[3] : src3->f[3];
514}
515
516static void
517micro_i2f(
518   union tgsi_exec_channel *dst,
519   const union tgsi_exec_channel *src )
520{
521   dst->f[0] = (float) src->i[0];
522   dst->f[1] = (float) src->i[1];
523   dst->f[2] = (float) src->i[2];
524   dst->f[3] = (float) src->i[3];
525}
526
527static void
528micro_lg2(
529   union tgsi_exec_channel *dst,
530   const union tgsi_exec_channel *src )
531{
532#if FAST_MATH
533   dst->f[0] = util_fast_log2( src->f[0] );
534   dst->f[1] = util_fast_log2( src->f[1] );
535   dst->f[2] = util_fast_log2( src->f[2] );
536   dst->f[3] = util_fast_log2( src->f[3] );
537#else
538   dst->f[0] = logf( src->f[0] ) * 1.442695f;
539   dst->f[1] = logf( src->f[1] ) * 1.442695f;
540   dst->f[2] = logf( src->f[2] ) * 1.442695f;
541   dst->f[3] = logf( src->f[3] ) * 1.442695f;
542#endif
543}
544
545static void
546micro_le(
547   union tgsi_exec_channel *dst,
548   const union tgsi_exec_channel *src0,
549   const union tgsi_exec_channel *src1,
550   const union tgsi_exec_channel *src2,
551   const union tgsi_exec_channel *src3 )
552{
553   dst->f[0] = src0->f[0] <= src1->f[0] ? src2->f[0] : src3->f[0];
554   dst->f[1] = src0->f[1] <= src1->f[1] ? src2->f[1] : src3->f[1];
555   dst->f[2] = src0->f[2] <= src1->f[2] ? src2->f[2] : src3->f[2];
556   dst->f[3] = src0->f[3] <= src1->f[3] ? src2->f[3] : src3->f[3];
557}
558
559static void
560micro_lt(
561   union tgsi_exec_channel *dst,
562   const union tgsi_exec_channel *src0,
563   const union tgsi_exec_channel *src1,
564   const union tgsi_exec_channel *src2,
565   const union tgsi_exec_channel *src3 )
566{
567   dst->f[0] = src0->f[0] < src1->f[0] ? src2->f[0] : src3->f[0];
568   dst->f[1] = src0->f[1] < src1->f[1] ? src2->f[1] : src3->f[1];
569   dst->f[2] = src0->f[2] < src1->f[2] ? src2->f[2] : src3->f[2];
570   dst->f[3] = src0->f[3] < src1->f[3] ? src2->f[3] : src3->f[3];
571}
572
573static void
574micro_ilt(
575   union tgsi_exec_channel *dst,
576   const union tgsi_exec_channel *src0,
577   const union tgsi_exec_channel *src1,
578   const union tgsi_exec_channel *src2,
579   const union tgsi_exec_channel *src3 )
580{
581   dst->i[0] = src0->i[0] < src1->i[0] ? src2->i[0] : src3->i[0];
582   dst->i[1] = src0->i[1] < src1->i[1] ? src2->i[1] : src3->i[1];
583   dst->i[2] = src0->i[2] < src1->i[2] ? src2->i[2] : src3->i[2];
584   dst->i[3] = src0->i[3] < src1->i[3] ? src2->i[3] : src3->i[3];
585}
586
587static void
588micro_ult(
589   union tgsi_exec_channel *dst,
590   const union tgsi_exec_channel *src0,
591   const union tgsi_exec_channel *src1,
592   const union tgsi_exec_channel *src2,
593   const union tgsi_exec_channel *src3 )
594{
595   dst->u[0] = src0->u[0] < src1->u[0] ? src2->u[0] : src3->u[0];
596   dst->u[1] = src0->u[1] < src1->u[1] ? src2->u[1] : src3->u[1];
597   dst->u[2] = src0->u[2] < src1->u[2] ? src2->u[2] : src3->u[2];
598   dst->u[3] = src0->u[3] < src1->u[3] ? src2->u[3] : src3->u[3];
599}
600
601static void
602micro_max(
603   union tgsi_exec_channel *dst,
604   const union tgsi_exec_channel *src0,
605   const union tgsi_exec_channel *src1 )
606{
607   dst->f[0] = src0->f[0] > src1->f[0] ? src0->f[0] : src1->f[0];
608   dst->f[1] = src0->f[1] > src1->f[1] ? src0->f[1] : src1->f[1];
609   dst->f[2] = src0->f[2] > src1->f[2] ? src0->f[2] : src1->f[2];
610   dst->f[3] = src0->f[3] > src1->f[3] ? src0->f[3] : src1->f[3];
611}
612
613static void
614micro_imax(
615   union tgsi_exec_channel *dst,
616   const union tgsi_exec_channel *src0,
617   const union tgsi_exec_channel *src1 )
618{
619   dst->i[0] = src0->i[0] > src1->i[0] ? src0->i[0] : src1->i[0];
620   dst->i[1] = src0->i[1] > src1->i[1] ? src0->i[1] : src1->i[1];
621   dst->i[2] = src0->i[2] > src1->i[2] ? src0->i[2] : src1->i[2];
622   dst->i[3] = src0->i[3] > src1->i[3] ? src0->i[3] : src1->i[3];
623}
624
625static void
626micro_umax(
627   union tgsi_exec_channel *dst,
628   const union tgsi_exec_channel *src0,
629   const union tgsi_exec_channel *src1 )
630{
631   dst->u[0] = src0->u[0] > src1->u[0] ? src0->u[0] : src1->u[0];
632   dst->u[1] = src0->u[1] > src1->u[1] ? src0->u[1] : src1->u[1];
633   dst->u[2] = src0->u[2] > src1->u[2] ? src0->u[2] : src1->u[2];
634   dst->u[3] = src0->u[3] > src1->u[3] ? src0->u[3] : src1->u[3];
635}
636
637static void
638micro_min(
639   union tgsi_exec_channel *dst,
640   const union tgsi_exec_channel *src0,
641   const union tgsi_exec_channel *src1 )
642{
643   dst->f[0] = src0->f[0] < src1->f[0] ? src0->f[0] : src1->f[0];
644   dst->f[1] = src0->f[1] < src1->f[1] ? src0->f[1] : src1->f[1];
645   dst->f[2] = src0->f[2] < src1->f[2] ? src0->f[2] : src1->f[2];
646   dst->f[3] = src0->f[3] < src1->f[3] ? src0->f[3] : src1->f[3];
647}
648
649static void
650micro_imin(
651   union tgsi_exec_channel *dst,
652   const union tgsi_exec_channel *src0,
653   const union tgsi_exec_channel *src1 )
654{
655   dst->i[0] = src0->i[0] < src1->i[0] ? src0->i[0] : src1->i[0];
656   dst->i[1] = src0->i[1] < src1->i[1] ? src0->i[1] : src1->i[1];
657   dst->i[2] = src0->i[2] < src1->i[2] ? src0->i[2] : src1->i[2];
658   dst->i[3] = src0->i[3] < src1->i[3] ? src0->i[3] : src1->i[3];
659}
660
661static void
662micro_umin(
663   union tgsi_exec_channel *dst,
664   const union tgsi_exec_channel *src0,
665   const union tgsi_exec_channel *src1 )
666{
667   dst->u[0] = src0->u[0] < src1->u[0] ? src0->u[0] : src1->u[0];
668   dst->u[1] = src0->u[1] < src1->u[1] ? src0->u[1] : src1->u[1];
669   dst->u[2] = src0->u[2] < src1->u[2] ? src0->u[2] : src1->u[2];
670   dst->u[3] = src0->u[3] < src1->u[3] ? src0->u[3] : src1->u[3];
671}
672
673static void
674micro_umod(
675   union tgsi_exec_channel *dst,
676   const union tgsi_exec_channel *src0,
677   const union tgsi_exec_channel *src1 )
678{
679   dst->u[0] = src0->u[0] % src1->u[0];
680   dst->u[1] = src0->u[1] % src1->u[1];
681   dst->u[2] = src0->u[2] % src1->u[2];
682   dst->u[3] = src0->u[3] % src1->u[3];
683}
684
685static void
686micro_mul(
687   union tgsi_exec_channel *dst,
688   const union tgsi_exec_channel *src0,
689   const union tgsi_exec_channel *src1 )
690{
691   dst->f[0] = src0->f[0] * src1->f[0];
692   dst->f[1] = src0->f[1] * src1->f[1];
693   dst->f[2] = src0->f[2] * src1->f[2];
694   dst->f[3] = src0->f[3] * src1->f[3];
695}
696
697static void
698micro_imul(
699   union tgsi_exec_channel *dst,
700   const union tgsi_exec_channel *src0,
701   const union tgsi_exec_channel *src1 )
702{
703   dst->i[0] = src0->i[0] * src1->i[0];
704   dst->i[1] = src0->i[1] * src1->i[1];
705   dst->i[2] = src0->i[2] * src1->i[2];
706   dst->i[3] = src0->i[3] * src1->i[3];
707}
708
709static void
710micro_imul64(
711   union tgsi_exec_channel *dst0,
712   union tgsi_exec_channel *dst1,
713   const union tgsi_exec_channel *src0,
714   const union tgsi_exec_channel *src1 )
715{
716   dst1->i[0] = src0->i[0] * src1->i[0];
717   dst1->i[1] = src0->i[1] * src1->i[1];
718   dst1->i[2] = src0->i[2] * src1->i[2];
719   dst1->i[3] = src0->i[3] * src1->i[3];
720   dst0->i[0] = 0;
721   dst0->i[1] = 0;
722   dst0->i[2] = 0;
723   dst0->i[3] = 0;
724}
725
726static void
727micro_umul64(
728   union tgsi_exec_channel *dst0,
729   union tgsi_exec_channel *dst1,
730   const union tgsi_exec_channel *src0,
731   const union tgsi_exec_channel *src1 )
732{
733   dst1->u[0] = src0->u[0] * src1->u[0];
734   dst1->u[1] = src0->u[1] * src1->u[1];
735   dst1->u[2] = src0->u[2] * src1->u[2];
736   dst1->u[3] = src0->u[3] * src1->u[3];
737   dst0->u[0] = 0;
738   dst0->u[1] = 0;
739   dst0->u[2] = 0;
740   dst0->u[3] = 0;
741}
742
743static void
744micro_movc(
745   union tgsi_exec_channel *dst,
746   const union tgsi_exec_channel *src0,
747   const union tgsi_exec_channel *src1,
748   const union tgsi_exec_channel *src2 )
749{
750   dst->u[0] = src0->u[0] ? src1->u[0] : src2->u[0];
751   dst->u[1] = src0->u[1] ? src1->u[1] : src2->u[1];
752   dst->u[2] = src0->u[2] ? src1->u[2] : src2->u[2];
753   dst->u[3] = src0->u[3] ? src1->u[3] : src2->u[3];
754}
755
756static void
757micro_neg(
758   union tgsi_exec_channel *dst,
759   const union tgsi_exec_channel *src )
760{
761   dst->f[0] = -src->f[0];
762   dst->f[1] = -src->f[1];
763   dst->f[2] = -src->f[2];
764   dst->f[3] = -src->f[3];
765}
766
767static void
768micro_ineg(
769   union tgsi_exec_channel *dst,
770   const union tgsi_exec_channel *src )
771{
772   dst->i[0] = -src->i[0];
773   dst->i[1] = -src->i[1];
774   dst->i[2] = -src->i[2];
775   dst->i[3] = -src->i[3];
776}
777
778static void
779micro_not(
780   union tgsi_exec_channel *dst,
781   const union tgsi_exec_channel *src )
782{
783   dst->u[0] = ~src->u[0];
784   dst->u[1] = ~src->u[1];
785   dst->u[2] = ~src->u[2];
786   dst->u[3] = ~src->u[3];
787}
788
789static void
790micro_or(
791   union tgsi_exec_channel *dst,
792   const union tgsi_exec_channel *src0,
793   const union tgsi_exec_channel *src1 )
794{
795   dst->u[0] = src0->u[0] | src1->u[0];
796   dst->u[1] = src0->u[1] | src1->u[1];
797   dst->u[2] = src0->u[2] | src1->u[2];
798   dst->u[3] = src0->u[3] | src1->u[3];
799}
800
801static void
802micro_pow(
803   union tgsi_exec_channel *dst,
804   const union tgsi_exec_channel *src0,
805   const union tgsi_exec_channel *src1 )
806{
807#if FAST_MATH
808   dst->f[0] = util_fast_pow( src0->f[0], src1->f[0] );
809   dst->f[1] = util_fast_pow( src0->f[1], src1->f[1] );
810   dst->f[2] = util_fast_pow( src0->f[2], src1->f[2] );
811   dst->f[3] = util_fast_pow( src0->f[3], src1->f[3] );
812#else
813   dst->f[0] = powf( src0->f[0], src1->f[0] );
814   dst->f[1] = powf( src0->f[1], src1->f[1] );
815   dst->f[2] = powf( src0->f[2], src1->f[2] );
816   dst->f[3] = powf( src0->f[3], src1->f[3] );
817#endif
818}
819
820static void
821micro_rnd(
822   union tgsi_exec_channel *dst,
823   const union tgsi_exec_channel *src )
824{
825   dst->f[0] = floorf( src->f[0] + 0.5f );
826   dst->f[1] = floorf( src->f[1] + 0.5f );
827   dst->f[2] = floorf( src->f[2] + 0.5f );
828   dst->f[3] = floorf( src->f[3] + 0.5f );
829}
830
831static void
832micro_sgn(
833   union tgsi_exec_channel *dst,
834   const union tgsi_exec_channel *src )
835{
836   dst->f[0] = src->f[0] < 0.0f ? -1.0f : src->f[0] > 0.0f ? 1.0f : 0.0f;
837   dst->f[1] = src->f[1] < 0.0f ? -1.0f : src->f[1] > 0.0f ? 1.0f : 0.0f;
838   dst->f[2] = src->f[2] < 0.0f ? -1.0f : src->f[2] > 0.0f ? 1.0f : 0.0f;
839   dst->f[3] = src->f[3] < 0.0f ? -1.0f : src->f[3] > 0.0f ? 1.0f : 0.0f;
840}
841
842static void
843micro_shl(
844   union tgsi_exec_channel *dst,
845   const union tgsi_exec_channel *src0,
846   const union tgsi_exec_channel *src1 )
847{
848   dst->i[0] = src0->i[0] << src1->i[0];
849   dst->i[1] = src0->i[1] << src1->i[1];
850   dst->i[2] = src0->i[2] << src1->i[2];
851   dst->i[3] = src0->i[3] << src1->i[3];
852}
853
854static void
855micro_ishr(
856   union tgsi_exec_channel *dst,
857   const union tgsi_exec_channel *src0,
858   const union tgsi_exec_channel *src1 )
859{
860   dst->i[0] = src0->i[0] >> src1->i[0];
861   dst->i[1] = src0->i[1] >> src1->i[1];
862   dst->i[2] = src0->i[2] >> src1->i[2];
863   dst->i[3] = src0->i[3] >> src1->i[3];
864}
865
866static void
867micro_trunc(
868   union tgsi_exec_channel *dst,
869   const union tgsi_exec_channel *src0 )
870{
871   dst->f[0] = (float) (int) src0->f[0];
872   dst->f[1] = (float) (int) src0->f[1];
873   dst->f[2] = (float) (int) src0->f[2];
874   dst->f[3] = (float) (int) src0->f[3];
875}
876
877static void
878micro_ushr(
879   union tgsi_exec_channel *dst,
880   const union tgsi_exec_channel *src0,
881   const union tgsi_exec_channel *src1 )
882{
883   dst->u[0] = src0->u[0] >> src1->u[0];
884   dst->u[1] = src0->u[1] >> src1->u[1];
885   dst->u[2] = src0->u[2] >> src1->u[2];
886   dst->u[3] = src0->u[3] >> src1->u[3];
887}
888
889static void
890micro_sin(
891   union tgsi_exec_channel *dst,
892   const union tgsi_exec_channel *src )
893{
894   dst->f[0] = sinf( src->f[0] );
895   dst->f[1] = sinf( src->f[1] );
896   dst->f[2] = sinf( src->f[2] );
897   dst->f[3] = sinf( src->f[3] );
898}
899
900static void
901micro_sqrt( union tgsi_exec_channel *dst,
902            const union tgsi_exec_channel *src )
903{
904   dst->f[0] = sqrtf( src->f[0] );
905   dst->f[1] = sqrtf( src->f[1] );
906   dst->f[2] = sqrtf( src->f[2] );
907   dst->f[3] = sqrtf( src->f[3] );
908}
909
910static void
911micro_sub(
912   union tgsi_exec_channel *dst,
913   const union tgsi_exec_channel *src0,
914   const union tgsi_exec_channel *src1 )
915{
916   dst->f[0] = src0->f[0] - src1->f[0];
917   dst->f[1] = src0->f[1] - src1->f[1];
918   dst->f[2] = src0->f[2] - src1->f[2];
919   dst->f[3] = src0->f[3] - src1->f[3];
920}
921
922static void
923micro_u2f(
924   union tgsi_exec_channel *dst,
925   const union tgsi_exec_channel *src )
926{
927   dst->f[0] = (float) src->u[0];
928   dst->f[1] = (float) src->u[1];
929   dst->f[2] = (float) src->u[2];
930   dst->f[3] = (float) src->u[3];
931}
932
933static void
934micro_xor(
935   union tgsi_exec_channel *dst,
936   const union tgsi_exec_channel *src0,
937   const union tgsi_exec_channel *src1 )
938{
939   dst->u[0] = src0->u[0] ^ src1->u[0];
940   dst->u[1] = src0->u[1] ^ src1->u[1];
941   dst->u[2] = src0->u[2] ^ src1->u[2];
942   dst->u[3] = src0->u[3] ^ src1->u[3];
943}
944
945static void
946fetch_src_file_channel(
947   const struct tgsi_exec_machine *mach,
948   const uint file,
949   const uint swizzle,
950   const union tgsi_exec_channel *index,
951   union tgsi_exec_channel *chan )
952{
953   switch( swizzle ) {
954   case TGSI_EXTSWIZZLE_X:
955   case TGSI_EXTSWIZZLE_Y:
956   case TGSI_EXTSWIZZLE_Z:
957   case TGSI_EXTSWIZZLE_W:
958      switch( file ) {
959      case TGSI_FILE_CONSTANT:
960         assert(mach->Consts);
961         if (index->i[0] < 0)
962            chan->f[0] = 0.0f;
963         else
964            chan->f[0] = mach->Consts[index->i[0]][swizzle];
965         if (index->i[1] < 0)
966            chan->f[1] = 0.0f;
967         else
968            chan->f[1] = mach->Consts[index->i[1]][swizzle];
969         if (index->i[2] < 0)
970            chan->f[2] = 0.0f;
971         else
972            chan->f[2] = mach->Consts[index->i[2]][swizzle];
973         if (index->i[3] < 0)
974            chan->f[3] = 0.0f;
975         else
976            chan->f[3] = mach->Consts[index->i[3]][swizzle];
977         break;
978
979      case TGSI_FILE_INPUT:
980         chan->u[0] = mach->Inputs[index->i[0]].xyzw[swizzle].u[0];
981         chan->u[1] = mach->Inputs[index->i[1]].xyzw[swizzle].u[1];
982         chan->u[2] = mach->Inputs[index->i[2]].xyzw[swizzle].u[2];
983         chan->u[3] = mach->Inputs[index->i[3]].xyzw[swizzle].u[3];
984         break;
985
986      case TGSI_FILE_TEMPORARY:
987         assert(index->i[0] < TGSI_EXEC_NUM_TEMPS);
988         chan->u[0] = mach->Temps[index->i[0]].xyzw[swizzle].u[0];
989         chan->u[1] = mach->Temps[index->i[1]].xyzw[swizzle].u[1];
990         chan->u[2] = mach->Temps[index->i[2]].xyzw[swizzle].u[2];
991         chan->u[3] = mach->Temps[index->i[3]].xyzw[swizzle].u[3];
992         break;
993
994      case TGSI_FILE_IMMEDIATE:
995         assert( index->i[0] < (int) mach->ImmLimit );
996         chan->f[0] = mach->Imms[index->i[0]][swizzle];
997         assert( index->i[1] < (int) mach->ImmLimit );
998         chan->f[1] = mach->Imms[index->i[1]][swizzle];
999         assert( index->i[2] < (int) mach->ImmLimit );
1000         chan->f[2] = mach->Imms[index->i[2]][swizzle];
1001         assert( index->i[3] < (int) mach->ImmLimit );
1002         chan->f[3] = mach->Imms[index->i[3]][swizzle];
1003         break;
1004
1005      case TGSI_FILE_ADDRESS:
1006         chan->u[0] = mach->Addrs[index->i[0]].xyzw[swizzle].u[0];
1007         chan->u[1] = mach->Addrs[index->i[1]].xyzw[swizzle].u[1];
1008         chan->u[2] = mach->Addrs[index->i[2]].xyzw[swizzle].u[2];
1009         chan->u[3] = mach->Addrs[index->i[3]].xyzw[swizzle].u[3];
1010         break;
1011
1012      case TGSI_FILE_OUTPUT:
1013         /* vertex/fragment output vars can be read too */
1014         chan->u[0] = mach->Outputs[index->i[0]].xyzw[swizzle].u[0];
1015         chan->u[1] = mach->Outputs[index->i[1]].xyzw[swizzle].u[1];
1016         chan->u[2] = mach->Outputs[index->i[2]].xyzw[swizzle].u[2];
1017         chan->u[3] = mach->Outputs[index->i[3]].xyzw[swizzle].u[3];
1018         break;
1019
1020      default:
1021         assert( 0 );
1022      }
1023      break;
1024
1025   case TGSI_EXTSWIZZLE_ZERO:
1026      *chan = mach->Temps[TEMP_0_I].xyzw[TEMP_0_C];
1027      break;
1028
1029   case TGSI_EXTSWIZZLE_ONE:
1030      *chan = mach->Temps[TEMP_1_I].xyzw[TEMP_1_C];
1031      break;
1032
1033   default:
1034      assert( 0 );
1035   }
1036}
1037
1038static void
1039fetch_source(
1040   const struct tgsi_exec_machine *mach,
1041   union tgsi_exec_channel *chan,
1042   const struct tgsi_full_src_register *reg,
1043   const uint chan_index )
1044{
1045   union tgsi_exec_channel index;
1046   uint swizzle;
1047
1048   /* We start with a direct index into a register file.
1049    *
1050    *    file[1],
1051    *    where:
1052    *       file = SrcRegister.File
1053    *       [1] = SrcRegister.Index
1054    */
1055   index.i[0] =
1056   index.i[1] =
1057   index.i[2] =
1058   index.i[3] = reg->SrcRegister.Index;
1059
1060   /* There is an extra source register that indirectly subscripts
1061    * a register file. The direct index now becomes an offset
1062    * that is being added to the indirect register.
1063    *
1064    *    file[ind[2].x+1],
1065    *    where:
1066    *       ind = SrcRegisterInd.File
1067    *       [2] = SrcRegisterInd.Index
1068    *       .x = SrcRegisterInd.SwizzleX
1069    */
1070   if (reg->SrcRegister.Indirect) {
1071      union tgsi_exec_channel index2;
1072      union tgsi_exec_channel indir_index;
1073      const uint execmask = mach->ExecMask;
1074      uint i;
1075
1076      /* which address register (always zero now) */
1077      index2.i[0] =
1078      index2.i[1] =
1079      index2.i[2] =
1080      index2.i[3] = reg->SrcRegisterInd.Index;
1081
1082      /* get current value of address register[swizzle] */
1083      swizzle = tgsi_util_get_src_register_swizzle( &reg->SrcRegisterInd, CHAN_X );
1084      fetch_src_file_channel(
1085         mach,
1086         reg->SrcRegisterInd.File,
1087         swizzle,
1088         &index2,
1089         &indir_index );
1090
1091      /* add value of address register to the offset */
1092      index.i[0] += (int) indir_index.f[0];
1093      index.i[1] += (int) indir_index.f[1];
1094      index.i[2] += (int) indir_index.f[2];
1095      index.i[3] += (int) indir_index.f[3];
1096
1097      /* for disabled execution channels, zero-out the index to
1098       * avoid using a potential garbage value.
1099       */
1100      for (i = 0; i < QUAD_SIZE; i++) {
1101         if ((execmask & (1 << i)) == 0)
1102            index.i[i] = 0;
1103      }
1104   }
1105
1106   /* There is an extra source register that is a second
1107    * subscript to a register file. Effectively it means that
1108    * the register file is actually a 2D array of registers.
1109    *
1110    *    file[1][3] == file[1*sizeof(file[1])+3],
1111    *    where:
1112    *       [3] = SrcRegisterDim.Index
1113    */
1114   if (reg->SrcRegister.Dimension) {
1115      /* The size of the first-order array depends on the register file type.
1116       * We need to multiply the index to the first array to get an effective,
1117       * "flat" index that points to the beginning of the second-order array.
1118       */
1119      switch (reg->SrcRegister.File) {
1120      case TGSI_FILE_INPUT:
1121         index.i[0] *= TGSI_EXEC_MAX_INPUT_ATTRIBS;
1122         index.i[1] *= TGSI_EXEC_MAX_INPUT_ATTRIBS;
1123         index.i[2] *= TGSI_EXEC_MAX_INPUT_ATTRIBS;
1124         index.i[3] *= TGSI_EXEC_MAX_INPUT_ATTRIBS;
1125         break;
1126      case TGSI_FILE_CONSTANT:
1127         index.i[0] *= TGSI_EXEC_MAX_CONST_BUFFER;
1128         index.i[1] *= TGSI_EXEC_MAX_CONST_BUFFER;
1129         index.i[2] *= TGSI_EXEC_MAX_CONST_BUFFER;
1130         index.i[3] *= TGSI_EXEC_MAX_CONST_BUFFER;
1131         break;
1132      default:
1133         assert( 0 );
1134      }
1135
1136      index.i[0] += reg->SrcRegisterDim.Index;
1137      index.i[1] += reg->SrcRegisterDim.Index;
1138      index.i[2] += reg->SrcRegisterDim.Index;
1139      index.i[3] += reg->SrcRegisterDim.Index;
1140
1141      /* Again, the second subscript index can be addressed indirectly
1142       * identically to the first one.
1143       * Nothing stops us from indirectly addressing the indirect register,
1144       * but there is no need for that, so we won't exercise it.
1145       *
1146       *    file[1][ind[4].y+3],
1147       *    where:
1148       *       ind = SrcRegisterDimInd.File
1149       *       [4] = SrcRegisterDimInd.Index
1150       *       .y = SrcRegisterDimInd.SwizzleX
1151       */
1152      if (reg->SrcRegisterDim.Indirect) {
1153         union tgsi_exec_channel index2;
1154         union tgsi_exec_channel indir_index;
1155         const uint execmask = mach->ExecMask;
1156         uint i;
1157
1158         index2.i[0] =
1159         index2.i[1] =
1160         index2.i[2] =
1161         index2.i[3] = reg->SrcRegisterDimInd.Index;
1162
1163         swizzle = tgsi_util_get_src_register_swizzle( &reg->SrcRegisterDimInd, CHAN_X );
1164         fetch_src_file_channel(
1165            mach,
1166            reg->SrcRegisterDimInd.File,
1167            swizzle,
1168            &index2,
1169            &indir_index );
1170
1171         index.i[0] += (int) indir_index.f[0];
1172         index.i[1] += (int) indir_index.f[1];
1173         index.i[2] += (int) indir_index.f[2];
1174         index.i[3] += (int) indir_index.f[3];
1175
1176         /* for disabled execution channels, zero-out the index to
1177          * avoid using a potential garbage value.
1178          */
1179         for (i = 0; i < QUAD_SIZE; i++) {
1180            if ((execmask & (1 << i)) == 0)
1181               index.i[i] = 0;
1182         }
1183      }
1184
1185      /* If by any chance there was a need for a 3D array of register
1186       * files, we would have to check whether SrcRegisterDim is followed
1187       * by a dimension register and continue the saga.
1188       */
1189   }
1190
1191   swizzle = tgsi_util_get_full_src_register_extswizzle( reg, chan_index );
1192   fetch_src_file_channel(
1193      mach,
1194      reg->SrcRegister.File,
1195      swizzle,
1196      &index,
1197      chan );
1198
1199   switch (tgsi_util_get_full_src_register_sign_mode( reg, chan_index )) {
1200   case TGSI_UTIL_SIGN_CLEAR:
1201      micro_abs( chan, chan );
1202      break;
1203
1204   case TGSI_UTIL_SIGN_SET:
1205      micro_abs( chan, chan );
1206      micro_neg( chan, chan );
1207      break;
1208
1209   case TGSI_UTIL_SIGN_TOGGLE:
1210      micro_neg( chan, chan );
1211      break;
1212
1213   case TGSI_UTIL_SIGN_KEEP:
1214      break;
1215   }
1216
1217   if (reg->SrcRegisterExtMod.Complement) {
1218      micro_sub( chan, &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], chan );
1219   }
1220}
1221
1222static void
1223store_dest(
1224   struct tgsi_exec_machine *mach,
1225   const union tgsi_exec_channel *chan,
1226   const struct tgsi_full_dst_register *reg,
1227   const struct tgsi_full_instruction *inst,
1228   uint chan_index )
1229{
1230   uint i;
1231   union tgsi_exec_channel null;
1232   union tgsi_exec_channel *dst;
1233   uint execmask = mach->ExecMask;
1234
1235   switch (reg->DstRegister.File) {
1236   case TGSI_FILE_NULL:
1237      dst = &null;
1238      break;
1239
1240   case TGSI_FILE_OUTPUT:
1241      dst = &mach->Outputs[mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0]
1242                           + reg->DstRegister.Index].xyzw[chan_index];
1243      break;
1244
1245   case TGSI_FILE_TEMPORARY:
1246      assert( reg->DstRegister.Index < TGSI_EXEC_NUM_TEMPS );
1247      dst = &mach->Temps[reg->DstRegister.Index].xyzw[chan_index];
1248      break;
1249
1250   case TGSI_FILE_ADDRESS:
1251      dst = &mach->Addrs[reg->DstRegister.Index].xyzw[chan_index];
1252      break;
1253
1254   default:
1255      assert( 0 );
1256      return;
1257   }
1258
1259   if (inst->InstructionExtNv.CondFlowEnable) {
1260      union tgsi_exec_channel *cc = &mach->Temps[TEMP_CC_I].xyzw[TEMP_CC_C];
1261      uint swizzle;
1262      uint shift;
1263      uint mask;
1264      uint test;
1265
1266      /* Only CC0 supported.
1267       */
1268      assert( inst->InstructionExtNv.CondFlowIndex < 1 );
1269
1270      switch (chan_index) {
1271      case CHAN_X:
1272         swizzle = inst->InstructionExtNv.CondSwizzleX;
1273         break;
1274      case CHAN_Y:
1275         swizzle = inst->InstructionExtNv.CondSwizzleY;
1276         break;
1277      case CHAN_Z:
1278         swizzle = inst->InstructionExtNv.CondSwizzleZ;
1279         break;
1280      case CHAN_W:
1281         swizzle = inst->InstructionExtNv.CondSwizzleW;
1282         break;
1283      default:
1284         assert( 0 );
1285         return;
1286      }
1287
1288      switch (swizzle) {
1289      case TGSI_SWIZZLE_X:
1290         shift = TGSI_EXEC_CC_X_SHIFT;
1291         mask = TGSI_EXEC_CC_X_MASK;
1292         break;
1293      case TGSI_SWIZZLE_Y:
1294         shift = TGSI_EXEC_CC_Y_SHIFT;
1295         mask = TGSI_EXEC_CC_Y_MASK;
1296         break;
1297      case TGSI_SWIZZLE_Z:
1298         shift = TGSI_EXEC_CC_Z_SHIFT;
1299         mask = TGSI_EXEC_CC_Z_MASK;
1300         break;
1301      case TGSI_SWIZZLE_W:
1302         shift = TGSI_EXEC_CC_W_SHIFT;
1303         mask = TGSI_EXEC_CC_W_MASK;
1304         break;
1305      default:
1306         assert( 0 );
1307         return;
1308      }
1309
1310      switch (inst->InstructionExtNv.CondMask) {
1311      case TGSI_CC_GT:
1312         test = ~(TGSI_EXEC_CC_GT << shift) & mask;
1313         for (i = 0; i < QUAD_SIZE; i++)
1314            if (cc->u[i] & test)
1315               execmask &= ~(1 << i);
1316         break;
1317
1318      case TGSI_CC_EQ:
1319         test = ~(TGSI_EXEC_CC_EQ << shift) & mask;
1320         for (i = 0; i < QUAD_SIZE; i++)
1321            if (cc->u[i] & test)
1322               execmask &= ~(1 << i);
1323         break;
1324
1325      case TGSI_CC_LT:
1326         test = ~(TGSI_EXEC_CC_LT << shift) & mask;
1327         for (i = 0; i < QUAD_SIZE; i++)
1328            if (cc->u[i] & test)
1329               execmask &= ~(1 << i);
1330         break;
1331
1332      case TGSI_CC_GE:
1333         test = ~((TGSI_EXEC_CC_GT | TGSI_EXEC_CC_EQ) << shift) & mask;
1334         for (i = 0; i < QUAD_SIZE; i++)
1335            if (cc->u[i] & test)
1336               execmask &= ~(1 << i);
1337         break;
1338
1339      case TGSI_CC_LE:
1340         test = ~((TGSI_EXEC_CC_LT | TGSI_EXEC_CC_EQ) << shift) & mask;
1341         for (i = 0; i < QUAD_SIZE; i++)
1342            if (cc->u[i] & test)
1343               execmask &= ~(1 << i);
1344         break;
1345
1346      case TGSI_CC_NE:
1347         test = ~((TGSI_EXEC_CC_GT | TGSI_EXEC_CC_LT | TGSI_EXEC_CC_UN) << shift) & mask;
1348         for (i = 0; i < QUAD_SIZE; i++)
1349            if (cc->u[i] & test)
1350               execmask &= ~(1 << i);
1351         break;
1352
1353      case TGSI_CC_TR:
1354         break;
1355
1356      case TGSI_CC_FL:
1357         for (i = 0; i < QUAD_SIZE; i++)
1358            execmask &= ~(1 << i);
1359         break;
1360
1361      default:
1362         assert( 0 );
1363         return;
1364      }
1365   }
1366
1367   switch (inst->Instruction.Saturate) {
1368   case TGSI_SAT_NONE:
1369      for (i = 0; i < QUAD_SIZE; i++)
1370         if (execmask & (1 << i))
1371            dst->i[i] = chan->i[i];
1372      break;
1373
1374   case TGSI_SAT_ZERO_ONE:
1375      for (i = 0; i < QUAD_SIZE; i++)
1376         if (execmask & (1 << i)) {
1377            if (chan->f[i] < 0.0f)
1378               dst->f[i] = 0.0f;
1379            else if (chan->f[i] > 1.0f)
1380               dst->f[i] = 1.0f;
1381            else
1382               dst->i[i] = chan->i[i];
1383         }
1384      break;
1385
1386   case TGSI_SAT_MINUS_PLUS_ONE:
1387      for (i = 0; i < QUAD_SIZE; i++)
1388         if (execmask & (1 << i)) {
1389            if (chan->f[i] < -1.0f)
1390               dst->f[i] = -1.0f;
1391            else if (chan->f[i] > 1.0f)
1392               dst->f[i] = 1.0f;
1393            else
1394               dst->i[i] = chan->i[i];
1395         }
1396      break;
1397
1398   default:
1399      assert( 0 );
1400   }
1401
1402   if (inst->InstructionExtNv.CondDstUpdate) {
1403      union tgsi_exec_channel *cc = &mach->Temps[TEMP_CC_I].xyzw[TEMP_CC_C];
1404      uint shift;
1405      uint mask;
1406
1407      /* Only CC0 supported.
1408       */
1409      assert( inst->InstructionExtNv.CondDstIndex < 1 );
1410
1411      switch (chan_index) {
1412      case CHAN_X:
1413         shift = TGSI_EXEC_CC_X_SHIFT;
1414         mask = ~TGSI_EXEC_CC_X_MASK;
1415         break;
1416      case CHAN_Y:
1417         shift = TGSI_EXEC_CC_Y_SHIFT;
1418         mask = ~TGSI_EXEC_CC_Y_MASK;
1419         break;
1420      case CHAN_Z:
1421         shift = TGSI_EXEC_CC_Z_SHIFT;
1422         mask = ~TGSI_EXEC_CC_Z_MASK;
1423         break;
1424      case CHAN_W:
1425         shift = TGSI_EXEC_CC_W_SHIFT;
1426         mask = ~TGSI_EXEC_CC_W_MASK;
1427         break;
1428      default:
1429         assert( 0 );
1430         return;
1431      }
1432
1433      for (i = 0; i < QUAD_SIZE; i++)
1434         if (execmask & (1 << i)) {
1435            cc->u[i] &= mask;
1436            if (dst->f[i] < 0.0f)
1437               cc->u[i] |= TGSI_EXEC_CC_LT << shift;
1438            else if (dst->f[i] > 0.0f)
1439               cc->u[i] |= TGSI_EXEC_CC_GT << shift;
1440            else if (dst->f[i] == 0.0f)
1441               cc->u[i] |= TGSI_EXEC_CC_EQ << shift;
1442            else
1443               cc->u[i] |= TGSI_EXEC_CC_UN << shift;
1444         }
1445   }
1446}
1447
/* Shorthands for fetching a source channel / storing a destination channel
 * of the current instruction.  Both expand to code that refers to variables
 * named `mach' and `inst', which must be in scope where the macro is used.
 */
#define FETCH(VAL,INDEX,CHAN) \
   fetch_source( mach, (VAL), &inst->FullSrcRegisters[(INDEX)], (CHAN) )

#define STORE(VAL,INDEX,CHAN) \
   store_dest( mach, (VAL), &inst->FullDstRegisters[(INDEX)], inst, (CHAN) )
1453
1454
1455/**
1456 * Execute ARB-style KIL which is predicated by a src register.
1457 * Kill fragment if any of the four values is less than zero.
1458 */
1459static void
1460exec_kil(struct tgsi_exec_machine *mach,
1461         const struct tgsi_full_instruction *inst)
1462{
1463   uint uniquemask;
1464   uint chan_index;
1465   uint kilmask = 0; /* bit 0 = pixel 0, bit 1 = pixel 1, etc */
1466   union tgsi_exec_channel r[1];
1467
1468   /* This mask stores component bits that were already tested. Note that
1469    * we test if the value is less than zero, so 1.0 and 0.0 need not to be
1470    * tested. */
1471   uniquemask = (1 << TGSI_EXTSWIZZLE_ZERO) | (1 << TGSI_EXTSWIZZLE_ONE);
1472
1473   for (chan_index = 0; chan_index < 4; chan_index++)
1474   {
1475      uint swizzle;
1476      uint i;
1477
1478      /* unswizzle channel */
1479      swizzle = tgsi_util_get_full_src_register_extswizzle (
1480                        &inst->FullSrcRegisters[0],
1481                        chan_index);
1482
1483      /* check if the component has not been already tested */
1484      if (uniquemask & (1 << swizzle))
1485         continue;
1486      uniquemask |= 1 << swizzle;
1487
1488      FETCH(&r[0], 0, chan_index);
1489      for (i = 0; i < 4; i++)
1490         if (r[0].f[i] < 0.0f)
1491            kilmask |= 1 << i;
1492   }
1493
1494   mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] |= kilmask;
1495}
1496
1497/**
1498 * Execute NVIDIA-style KIL which is predicated by a condition code.
1499 * Kill fragment if the condition code is TRUE.
1500 */
1501static void
1502exec_kilp(struct tgsi_exec_machine *mach,
1503          const struct tgsi_full_instruction *inst)
1504{
1505   uint kilmask; /* bit 0 = pixel 0, bit 1 = pixel 1, etc */
1506
1507   if (inst->InstructionExtNv.CondFlowEnable) {
1508      uint swizzle[4];
1509      uint chan_index;
1510
1511      kilmask = 0x0;
1512
1513      swizzle[0] = inst->InstructionExtNv.CondSwizzleX;
1514      swizzle[1] = inst->InstructionExtNv.CondSwizzleY;
1515      swizzle[2] = inst->InstructionExtNv.CondSwizzleZ;
1516      swizzle[3] = inst->InstructionExtNv.CondSwizzleW;
1517
1518      for (chan_index = 0; chan_index < 4; chan_index++)
1519      {
1520         uint i;
1521
1522         for (i = 0; i < 4; i++) {
1523            /* TODO: evaluate the condition code */
1524            if (0)
1525               kilmask |= 1 << i;
1526         }
1527      }
1528   }
1529   else {
1530      /* "unconditional" kil */
1531      kilmask = mach->ExecMask;
1532   }
1533   mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] |= kilmask;
1534}
1535
1536
1537/*
1538 * Fetch a texel using STR texture coordinates.
1539 */
1540static void
1541fetch_texel( struct tgsi_sampler *sampler,
1542             const union tgsi_exec_channel *s,
1543             const union tgsi_exec_channel *t,
1544             const union tgsi_exec_channel *p,
1545             float lodbias,  /* XXX should be float[4] */
1546             union tgsi_exec_channel *r,
1547             union tgsi_exec_channel *g,
1548             union tgsi_exec_channel *b,
1549             union tgsi_exec_channel *a )
1550{
1551   uint j;
1552   float rgba[NUM_CHANNELS][QUAD_SIZE];
1553
1554   sampler->get_samples(sampler, s->f, t->f, p->f, lodbias, rgba);
1555
1556   for (j = 0; j < 4; j++) {
1557      r->f[j] = rgba[0][j];
1558      g->f[j] = rgba[1][j];
1559      b->f[j] = rgba[2][j];
1560      a->f[j] = rgba[3][j];
1561   }
1562}
1563
1564
1565static void
1566exec_tex(struct tgsi_exec_machine *mach,
1567         const struct tgsi_full_instruction *inst,
1568         boolean biasLod,
1569         boolean projected)
1570{
1571   const uint unit = inst->FullSrcRegisters[1].SrcRegister.Index;
1572   union tgsi_exec_channel r[8];
1573   uint chan_index;
1574   float lodBias;
1575
1576   /*   debug_printf("Sampler %u unit %u\n", sampler, unit); */
1577
1578   switch (inst->InstructionExtTexture.Texture) {
1579   case TGSI_TEXTURE_1D:
1580
1581      FETCH(&r[0], 0, CHAN_X);
1582
1583      if (projected) {
1584         FETCH(&r[1], 0, CHAN_W);
1585         micro_div( &r[0], &r[0], &r[1] );
1586      }
1587
1588      if (biasLod) {
1589         FETCH(&r[1], 0, CHAN_W);
1590         lodBias = r[2].f[0];
1591      }
1592      else
1593         lodBias = 0.0;
1594
1595      fetch_texel(mach->Samplers[unit],
1596                  &r[0], NULL, NULL, lodBias,  /* S, T, P, BIAS */
1597                  &r[0], &r[1], &r[2], &r[3]); /* R, G, B, A */
1598      break;
1599
1600   case TGSI_TEXTURE_2D:
1601   case TGSI_TEXTURE_RECT:
1602
1603      FETCH(&r[0], 0, CHAN_X);
1604      FETCH(&r[1], 0, CHAN_Y);
1605      FETCH(&r[2], 0, CHAN_Z);
1606
1607      if (projected) {
1608         FETCH(&r[3], 0, CHAN_W);
1609         micro_div( &r[0], &r[0], &r[3] );
1610         micro_div( &r[1], &r[1], &r[3] );
1611         micro_div( &r[2], &r[2], &r[3] );
1612      }
1613
1614      if (biasLod) {
1615         FETCH(&r[3], 0, CHAN_W);
1616         lodBias = r[3].f[0];
1617      }
1618      else
1619         lodBias = 0.0;
1620
1621      fetch_texel(mach->Samplers[unit],
1622                  &r[0], &r[1], &r[2], lodBias,  /* inputs */
1623                  &r[0], &r[1], &r[2], &r[3]);  /* outputs */
1624      break;
1625
1626   case TGSI_TEXTURE_3D:
1627   case TGSI_TEXTURE_CUBE:
1628
1629      FETCH(&r[0], 0, CHAN_X);
1630      FETCH(&r[1], 0, CHAN_Y);
1631      FETCH(&r[2], 0, CHAN_Z);
1632
1633      if (projected) {
1634         FETCH(&r[3], 0, CHAN_W);
1635         micro_div( &r[0], &r[0], &r[3] );
1636         micro_div( &r[1], &r[1], &r[3] );
1637         micro_div( &r[2], &r[2], &r[3] );
1638      }
1639
1640      if (biasLod) {
1641         FETCH(&r[3], 0, CHAN_W);
1642         lodBias = r[3].f[0];
1643      }
1644      else
1645         lodBias = 0.0;
1646
1647      fetch_texel(mach->Samplers[unit],
1648                  &r[0], &r[1], &r[2], lodBias,
1649                  &r[0], &r[1], &r[2], &r[3]);
1650      break;
1651
1652   default:
1653      assert (0);
1654   }
1655
1656   FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
1657      STORE( &r[chan_index], 0, chan_index );
1658   }
1659}
1660
1661
1662/**
1663 * Evaluate a constant-valued coefficient at the position of the
1664 * current quad.
1665 */
1666static void
1667eval_constant_coef(
1668   struct tgsi_exec_machine *mach,
1669   unsigned attrib,
1670   unsigned chan )
1671{
1672   unsigned i;
1673
1674   for( i = 0; i < QUAD_SIZE; i++ ) {
1675      mach->Inputs[attrib].xyzw[chan].f[i] = mach->InterpCoefs[attrib].a0[chan];
1676   }
1677}
1678
1679/**
1680 * Evaluate a linear-valued coefficient at the position of the
1681 * current quad.
1682 */
1683static void
1684eval_linear_coef(
1685   struct tgsi_exec_machine *mach,
1686   unsigned attrib,
1687   unsigned chan )
1688{
1689   const float x = mach->QuadPos.xyzw[0].f[0];
1690   const float y = mach->QuadPos.xyzw[1].f[0];
1691   const float dadx = mach->InterpCoefs[attrib].dadx[chan];
1692   const float dady = mach->InterpCoefs[attrib].dady[chan];
1693   const float a0 = mach->InterpCoefs[attrib].a0[chan] + dadx * x + dady * y;
1694   mach->Inputs[attrib].xyzw[chan].f[0] = a0;
1695   mach->Inputs[attrib].xyzw[chan].f[1] = a0 + dadx;
1696   mach->Inputs[attrib].xyzw[chan].f[2] = a0 + dady;
1697   mach->Inputs[attrib].xyzw[chan].f[3] = a0 + dadx + dady;
1698}
1699
1700/**
1701 * Evaluate a perspective-valued coefficient at the position of the
1702 * current quad.
1703 */
1704static void
1705eval_perspective_coef(
1706   struct tgsi_exec_machine *mach,
1707   unsigned attrib,
1708   unsigned chan )
1709{
1710   const float x = mach->QuadPos.xyzw[0].f[0];
1711   const float y = mach->QuadPos.xyzw[1].f[0];
1712   const float dadx = mach->InterpCoefs[attrib].dadx[chan];
1713   const float dady = mach->InterpCoefs[attrib].dady[chan];
1714   const float a0 = mach->InterpCoefs[attrib].a0[chan] + dadx * x + dady * y;
1715   const float *w = mach->QuadPos.xyzw[3].f;
1716   /* divide by W here */
1717   mach->Inputs[attrib].xyzw[chan].f[0] = a0 / w[0];
1718   mach->Inputs[attrib].xyzw[chan].f[1] = (a0 + dadx) / w[1];
1719   mach->Inputs[attrib].xyzw[chan].f[2] = (a0 + dady) / w[2];
1720   mach->Inputs[attrib].xyzw[chan].f[3] = (a0 + dadx + dady) / w[3];
1721}
1722
1723
/**
 * Signature shared by the eval_*_coef() routines: compute channel 'chan'
 * of input attribute 'attrib' for the current quad.
 */
typedef void (*eval_coef_func)( struct tgsi_exec_machine *mach,
                                unsigned attrib,
                                unsigned chan );
1728
1729static void
1730exec_declaration(
1731   struct tgsi_exec_machine *mach,
1732   const struct tgsi_full_declaration *decl )
1733{
1734   if( mach->Processor == TGSI_PROCESSOR_FRAGMENT ) {
1735      if( decl->Declaration.File == TGSI_FILE_INPUT ) {
1736         unsigned first, last, mask;
1737         eval_coef_func eval;
1738
1739         first = decl->DeclarationRange.First;
1740         last = decl->DeclarationRange.Last;
1741         mask = decl->Declaration.UsageMask;
1742
1743         switch( decl->Declaration.Interpolate ) {
1744         case TGSI_INTERPOLATE_CONSTANT:
1745            eval = eval_constant_coef;
1746            break;
1747
1748         case TGSI_INTERPOLATE_LINEAR:
1749            eval = eval_linear_coef;
1750            break;
1751
1752         case TGSI_INTERPOLATE_PERSPECTIVE:
1753            eval = eval_perspective_coef;
1754            break;
1755
1756         default:
1757            eval = NULL;
1758            assert( 0 );
1759         }
1760
1761         if( mask == TGSI_WRITEMASK_XYZW ) {
1762            unsigned i, j;
1763
1764            for( i = first; i <= last; i++ ) {
1765               for( j = 0; j < NUM_CHANNELS; j++ ) {
1766                  eval( mach, i, j );
1767               }
1768            }
1769         }
1770         else {
1771            unsigned i, j;
1772
1773            for( j = 0; j < NUM_CHANNELS; j++ ) {
1774               if( mask & (1 << j) ) {
1775                  for( i = first; i <= last; i++ ) {
1776                     eval( mach, i, j );
1777                  }
1778               }
1779            }
1780         }
1781      }
1782   }
1783}
1784
1785static void
1786exec_instruction(
1787   struct tgsi_exec_machine *mach,
1788   const struct tgsi_full_instruction *inst,
1789   int *pc )
1790{
1791   uint chan_index;
1792   union tgsi_exec_channel r[8];
1793
1794   (*pc)++;
1795
1796   switch (inst->Instruction.Opcode) {
1797   case TGSI_OPCODE_ARL:
1798      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
1799         FETCH( &r[0], 0, chan_index );
1800         micro_trunc( &r[0], &r[0] );
1801         STORE( &r[0], 0, chan_index );
1802      }
1803      break;
1804
1805   case TGSI_OPCODE_MOV:
1806   case TGSI_OPCODE_SWZ:
1807      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
1808         FETCH( &r[0], 0, chan_index );
1809         STORE( &r[0], 0, chan_index );
1810      }
1811      break;
1812
1813   case TGSI_OPCODE_LIT:
1814      if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) {
1815	 STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_X );
1816      }
1817
1818      if (IS_CHANNEL_ENABLED( *inst, CHAN_Y ) || IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1819	 FETCH( &r[0], 0, CHAN_X );
1820	 if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1821	    micro_max( &r[0], &r[0], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] );
1822	    STORE( &r[0], 0, CHAN_Y );
1823	 }
1824
1825	 if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1826	    FETCH( &r[1], 0, CHAN_Y );
1827	    micro_max( &r[1], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] );
1828
1829	    FETCH( &r[2], 0, CHAN_W );
1830	    micro_min( &r[2], &r[2], &mach->Temps[TEMP_128_I].xyzw[TEMP_128_C] );
1831	    micro_max( &r[2], &r[2], &mach->Temps[TEMP_M128_I].xyzw[TEMP_M128_C] );
1832	    micro_pow( &r[1], &r[1], &r[2] );
1833	    micro_lt( &r[0], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &r[0], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] );
1834	    STORE( &r[0], 0, CHAN_Z );
1835	 }
1836      }
1837
1838      if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
1839	 STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
1840      }
1841      break;
1842
1843   case TGSI_OPCODE_RCP:
1844   /* TGSI_OPCODE_RECIP */
1845      FETCH( &r[0], 0, CHAN_X );
1846      micro_div( &r[0], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &r[0] );
1847      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
1848	 STORE( &r[0], 0, chan_index );
1849      }
1850      break;
1851
1852   case TGSI_OPCODE_RSQ:
1853   /* TGSI_OPCODE_RECIPSQRT */
1854      FETCH( &r[0], 0, CHAN_X );
1855      micro_sqrt( &r[0], &r[0] );
1856      micro_div( &r[0], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &r[0] );
1857      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
1858	 STORE( &r[0], 0, chan_index );
1859      }
1860      break;
1861
1862   case TGSI_OPCODE_EXP:
1863      FETCH( &r[0], 0, CHAN_X );
1864      micro_flr( &r[1], &r[0] );  /* r1 = floor(r0) */
1865      if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) {
1866         micro_exp2( &r[2], &r[1] );       /* r2 = 2 ^ r1 */
1867         STORE( &r[2], 0, CHAN_X );        /* store r2 */
1868      }
1869      if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1870         micro_sub( &r[2], &r[0], &r[1] ); /* r2 = r0 - r1 */
1871         STORE( &r[2], 0, CHAN_Y );        /* store r2 */
1872      }
1873      if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1874         micro_exp2( &r[2], &r[0] );       /* r2 = 2 ^ r0 */
1875         STORE( &r[2], 0, CHAN_Z );        /* store r2 */
1876      }
1877      if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
1878         STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
1879      }
1880      break;
1881
1882   case TGSI_OPCODE_LOG:
1883      FETCH( &r[0], 0, CHAN_X );
1884      micro_abs( &r[2], &r[0] );  /* r2 = abs(r0) */
1885      micro_lg2( &r[1], &r[2] );  /* r1 = lg2(r2) */
1886      micro_flr( &r[0], &r[1] );  /* r0 = floor(r1) */
1887      if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) {
1888         STORE( &r[0], 0, CHAN_X );
1889      }
1890      if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1891         micro_exp2( &r[0], &r[0] );       /* r0 = 2 ^ r0 */
1892         micro_div( &r[0], &r[2], &r[0] ); /* r0 = r2 / r0 */
1893         STORE( &r[0], 0, CHAN_Y );
1894      }
1895      if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1896         STORE( &r[1], 0, CHAN_Z );
1897      }
1898      if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
1899         STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
1900      }
1901      break;
1902
1903   case TGSI_OPCODE_MUL:
1904      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index )
1905      {
1906         FETCH(&r[0], 0, chan_index);
1907         FETCH(&r[1], 1, chan_index);
1908
1909         micro_mul( &r[0], &r[0], &r[1] );
1910
1911         STORE(&r[0], 0, chan_index);
1912      }
1913      break;
1914
1915   case TGSI_OPCODE_ADD:
1916      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
1917         FETCH( &r[0], 0, chan_index );
1918         FETCH( &r[1], 1, chan_index );
1919         micro_add( &r[0], &r[0], &r[1] );
1920         STORE( &r[0], 0, chan_index );
1921      }
1922      break;
1923
1924   case TGSI_OPCODE_DP3:
1925   /* TGSI_OPCODE_DOT3 */
1926      FETCH( &r[0], 0, CHAN_X );
1927      FETCH( &r[1], 1, CHAN_X );
1928      micro_mul( &r[0], &r[0], &r[1] );
1929
1930      FETCH( &r[1], 0, CHAN_Y );
1931      FETCH( &r[2], 1, CHAN_Y );
1932      micro_mul( &r[1], &r[1], &r[2] );
1933      micro_add( &r[0], &r[0], &r[1] );
1934
1935      FETCH( &r[1], 0, CHAN_Z );
1936      FETCH( &r[2], 1, CHAN_Z );
1937      micro_mul( &r[1], &r[1], &r[2] );
1938      micro_add( &r[0], &r[0], &r[1] );
1939
1940      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
1941         STORE( &r[0], 0, chan_index );
1942      }
1943      break;
1944
1945    case TGSI_OPCODE_DP4:
1946    /* TGSI_OPCODE_DOT4 */
1947       FETCH(&r[0], 0, CHAN_X);
1948       FETCH(&r[1], 1, CHAN_X);
1949
1950       micro_mul( &r[0], &r[0], &r[1] );
1951
1952       FETCH(&r[1], 0, CHAN_Y);
1953       FETCH(&r[2], 1, CHAN_Y);
1954
1955       micro_mul( &r[1], &r[1], &r[2] );
1956       micro_add( &r[0], &r[0], &r[1] );
1957
1958       FETCH(&r[1], 0, CHAN_Z);
1959       FETCH(&r[2], 1, CHAN_Z);
1960
1961       micro_mul( &r[1], &r[1], &r[2] );
1962       micro_add( &r[0], &r[0], &r[1] );
1963
1964       FETCH(&r[1], 0, CHAN_W);
1965       FETCH(&r[2], 1, CHAN_W);
1966
1967       micro_mul( &r[1], &r[1], &r[2] );
1968       micro_add( &r[0], &r[0], &r[1] );
1969
1970      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
1971	 STORE( &r[0], 0, chan_index );
1972      }
1973      break;
1974
1975   case TGSI_OPCODE_DST:
1976      if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) {
1977	 STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_X );
1978      }
1979
1980      if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1981	 FETCH( &r[0], 0, CHAN_Y );
1982	 FETCH( &r[1], 1, CHAN_Y);
1983	 micro_mul( &r[0], &r[0], &r[1] );
1984	 STORE( &r[0], 0, CHAN_Y );
1985      }
1986
1987      if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1988	 FETCH( &r[0], 0, CHAN_Z );
1989	 STORE( &r[0], 0, CHAN_Z );
1990      }
1991
1992      if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
1993	 FETCH( &r[0], 1, CHAN_W );
1994	 STORE( &r[0], 0, CHAN_W );
1995      }
1996      break;
1997
1998   case TGSI_OPCODE_MIN:
1999      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2000         FETCH(&r[0], 0, chan_index);
2001         FETCH(&r[1], 1, chan_index);
2002
2003         /* XXX use micro_min()?? */
2004         micro_lt( &r[0], &r[0], &r[1], &r[0], &r[1] );
2005
2006         STORE(&r[0], 0, chan_index);
2007      }
2008      break;
2009
2010   case TGSI_OPCODE_MAX:
2011      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2012         FETCH(&r[0], 0, chan_index);
2013         FETCH(&r[1], 1, chan_index);
2014
2015         /* XXX use micro_max()?? */
2016         micro_lt( &r[0], &r[0], &r[1], &r[1], &r[0] );
2017
2018         STORE(&r[0], 0, chan_index );
2019      }
2020      break;
2021
2022   case TGSI_OPCODE_SLT:
2023   /* TGSI_OPCODE_SETLT */
2024      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2025         FETCH( &r[0], 0, chan_index );
2026         FETCH( &r[1], 1, chan_index );
2027         micro_lt( &r[0], &r[0], &r[1], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] );
2028         STORE( &r[0], 0, chan_index );
2029      }
2030      break;
2031
2032   case TGSI_OPCODE_SGE:
2033   /* TGSI_OPCODE_SETGE */
2034      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2035         FETCH( &r[0], 0, chan_index );
2036         FETCH( &r[1], 1, chan_index );
2037         micro_ge( &r[0], &r[0], &r[1], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] );
2038         STORE( &r[0], 0, chan_index );
2039      }
2040      break;
2041
2042   case TGSI_OPCODE_MAD:
2043   /* TGSI_OPCODE_MADD */
2044      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2045         FETCH( &r[0], 0, chan_index );
2046         FETCH( &r[1], 1, chan_index );
2047         micro_mul( &r[0], &r[0], &r[1] );
2048         FETCH( &r[1], 2, chan_index );
2049         micro_add( &r[0], &r[0], &r[1] );
2050         STORE( &r[0], 0, chan_index );
2051      }
2052      break;
2053
2054   case TGSI_OPCODE_SUB:
2055      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2056         FETCH(&r[0], 0, chan_index);
2057         FETCH(&r[1], 1, chan_index);
2058
2059         micro_sub( &r[0], &r[0], &r[1] );
2060
2061         STORE(&r[0], 0, chan_index);
2062      }
2063      break;
2064
2065   case TGSI_OPCODE_LERP:
2066   /* TGSI_OPCODE_LRP */
2067      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2068         FETCH(&r[0], 0, chan_index);
2069         FETCH(&r[1], 1, chan_index);
2070         FETCH(&r[2], 2, chan_index);
2071
2072         micro_sub( &r[1], &r[1], &r[2] );
2073         micro_mul( &r[0], &r[0], &r[1] );
2074         micro_add( &r[0], &r[0], &r[2] );
2075
2076         STORE(&r[0], 0, chan_index);
2077      }
2078      break;
2079
2080   case TGSI_OPCODE_CND:
2081      assert (0);
2082      break;
2083
2084   case TGSI_OPCODE_CND0:
2085      assert (0);
2086      break;
2087
2088   case TGSI_OPCODE_DOT2ADD:
2089      /* TGSI_OPCODE_DP2A */
2090      FETCH( &r[0], 0, CHAN_X );
2091      FETCH( &r[1], 1, CHAN_X );
2092      micro_mul( &r[0], &r[0], &r[1] );
2093
2094      FETCH( &r[1], 0, CHAN_Y );
2095      FETCH( &r[2], 1, CHAN_Y );
2096      micro_mul( &r[1], &r[1], &r[2] );
2097      micro_add( &r[0], &r[0], &r[1] );
2098
2099      FETCH( &r[2], 2, CHAN_X );
2100      micro_add( &r[0], &r[0], &r[2] );
2101
2102      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2103         STORE( &r[0], 0, chan_index );
2104      }
2105      break;
2106
2107   case TGSI_OPCODE_INDEX:
2108      assert (0);
2109      break;
2110
2111   case TGSI_OPCODE_NEGATE:
2112      assert (0);
2113      break;
2114
2115   case TGSI_OPCODE_FRAC:
2116   /* TGSI_OPCODE_FRC */
2117      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2118         FETCH( &r[0], 0, chan_index );
2119         micro_frc( &r[0], &r[0] );
2120         STORE( &r[0], 0, chan_index );
2121      }
2122      break;
2123
2124   case TGSI_OPCODE_CLAMP:
2125      assert (0);
2126      break;
2127
2128   case TGSI_OPCODE_FLOOR:
2129   /* TGSI_OPCODE_FLR */
2130      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2131         FETCH( &r[0], 0, chan_index );
2132         micro_flr( &r[0], &r[0] );
2133         STORE( &r[0], 0, chan_index );
2134      }
2135      break;
2136
2137   case TGSI_OPCODE_ROUND:
2138   case TGSI_OPCODE_ARR:
2139      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2140         FETCH( &r[0], 0, chan_index );
2141         micro_rnd( &r[0], &r[0] );
2142         STORE( &r[0], 0, chan_index );
2143      }
2144      break;
2145
2146   case TGSI_OPCODE_EXPBASE2:
2147    /* TGSI_OPCODE_EX2 */
2148      FETCH(&r[0], 0, CHAN_X);
2149
2150#if FAST_MATH
2151      micro_exp2( &r[0], &r[0] );
2152#else
2153      micro_pow( &r[0], &mach->Temps[TEMP_2_I].xyzw[TEMP_2_C], &r[0] );
2154#endif
2155
2156      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2157	 STORE( &r[0], 0, chan_index );
2158      }
2159      break;
2160
2161   case TGSI_OPCODE_LOGBASE2:
2162   /* TGSI_OPCODE_LG2 */
2163      FETCH( &r[0], 0, CHAN_X );
2164      micro_lg2( &r[0], &r[0] );
2165      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2166         STORE( &r[0], 0, chan_index );
2167      }
2168      break;
2169
2170   case TGSI_OPCODE_POWER:
2171      /* TGSI_OPCODE_POW */
2172      FETCH(&r[0], 0, CHAN_X);
2173      FETCH(&r[1], 1, CHAN_X);
2174
2175      micro_pow( &r[0], &r[0], &r[1] );
2176
2177      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2178	 STORE( &r[0], 0, chan_index );
2179      }
2180      break;
2181
2182   case TGSI_OPCODE_CROSSPRODUCT:
2183      /* TGSI_OPCODE_XPD */
2184      FETCH(&r[0], 0, CHAN_Y);
2185      FETCH(&r[1], 1, CHAN_Z);
2186
2187      micro_mul( &r[2], &r[0], &r[1] );
2188
2189      FETCH(&r[3], 0, CHAN_Z);
2190      FETCH(&r[4], 1, CHAN_Y);
2191
2192      micro_mul( &r[5], &r[3], &r[4] );
2193      micro_sub( &r[2], &r[2], &r[5] );
2194
2195      if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) {
2196         STORE( &r[2], 0, CHAN_X );
2197      }
2198
2199      FETCH(&r[2], 1, CHAN_X);
2200
2201      micro_mul( &r[3], &r[3], &r[2] );
2202
2203      FETCH(&r[5], 0, CHAN_X);
2204
2205      micro_mul( &r[1], &r[1], &r[5] );
2206      micro_sub( &r[3], &r[3], &r[1] );
2207
2208      if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
2209         STORE( &r[3], 0, CHAN_Y );
2210      }
2211
2212      micro_mul( &r[5], &r[5], &r[4] );
2213      micro_mul( &r[0], &r[0], &r[2] );
2214      micro_sub( &r[5], &r[5], &r[0] );
2215
2216      if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
2217         STORE( &r[5], 0, CHAN_Z );
2218      }
2219
2220      if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
2221         STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
2222      }
2223      break;
2224
2225    case TGSI_OPCODE_MULTIPLYMATRIX:
2226       assert (0);
2227       break;
2228
2229    case TGSI_OPCODE_ABS:
2230       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2231          FETCH(&r[0], 0, chan_index);
2232
2233          micro_abs( &r[0], &r[0] );
2234
2235          STORE(&r[0], 0, chan_index);
2236       }
2237       break;
2238
2239   case TGSI_OPCODE_RCC:
2240      assert (0);
2241      break;
2242
2243   case TGSI_OPCODE_DPH:
2244      FETCH(&r[0], 0, CHAN_X);
2245      FETCH(&r[1], 1, CHAN_X);
2246
2247      micro_mul( &r[0], &r[0], &r[1] );
2248
2249      FETCH(&r[1], 0, CHAN_Y);
2250      FETCH(&r[2], 1, CHAN_Y);
2251
2252      micro_mul( &r[1], &r[1], &r[2] );
2253      micro_add( &r[0], &r[0], &r[1] );
2254
2255      FETCH(&r[1], 0, CHAN_Z);
2256      FETCH(&r[2], 1, CHAN_Z);
2257
2258      micro_mul( &r[1], &r[1], &r[2] );
2259      micro_add( &r[0], &r[0], &r[1] );
2260
2261      FETCH(&r[1], 1, CHAN_W);
2262
2263      micro_add( &r[0], &r[0], &r[1] );
2264
2265      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2266	 STORE( &r[0], 0, chan_index );
2267      }
2268      break;
2269
2270   case TGSI_OPCODE_COS:
2271      FETCH(&r[0], 0, CHAN_X);
2272
2273      micro_cos( &r[0], &r[0] );
2274
2275      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2276	 STORE( &r[0], 0, chan_index );
2277      }
2278      break;
2279
2280   case TGSI_OPCODE_DDX:
2281      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2282         FETCH( &r[0], 0, chan_index );
2283         micro_ddx( &r[0], &r[0] );
2284         STORE( &r[0], 0, chan_index );
2285      }
2286      break;
2287
2288   case TGSI_OPCODE_DDY:
2289      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2290         FETCH( &r[0], 0, chan_index );
2291         micro_ddy( &r[0], &r[0] );
2292         STORE( &r[0], 0, chan_index );
2293      }
2294      break;
2295
2296   case TGSI_OPCODE_KILP:
2297      exec_kilp (mach, inst);
2298      break;
2299
2300   case TGSI_OPCODE_KIL:
2301      exec_kil (mach, inst);
2302      break;
2303
2304   case TGSI_OPCODE_PK2H:
2305      assert (0);
2306      break;
2307
2308   case TGSI_OPCODE_PK2US:
2309      assert (0);
2310      break;
2311
2312   case TGSI_OPCODE_PK4B:
2313      assert (0);
2314      break;
2315
2316   case TGSI_OPCODE_PK4UB:
2317      assert (0);
2318      break;
2319
2320   case TGSI_OPCODE_RFL:
2321      assert (0);
2322      break;
2323
2324   case TGSI_OPCODE_SEQ:
2325      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2326         FETCH( &r[0], 0, chan_index );
2327         FETCH( &r[1], 1, chan_index );
2328         micro_eq( &r[0], &r[0], &r[1],
2329                   &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C],
2330                   &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] );
2331         STORE( &r[0], 0, chan_index );
2332      }
2333      break;
2334
2335   case TGSI_OPCODE_SFL:
2336      assert (0);
2337      break;
2338
2339   case TGSI_OPCODE_SGT:
2340      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2341         FETCH( &r[0], 0, chan_index );
2342         FETCH( &r[1], 1, chan_index );
2343         micro_le( &r[0], &r[0], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C] );
2344         STORE( &r[0], 0, chan_index );
2345      }
2346      break;
2347
2348   case TGSI_OPCODE_SIN:
2349      FETCH( &r[0], 0, CHAN_X );
2350      micro_sin( &r[0], &r[0] );
2351      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2352         STORE( &r[0], 0, chan_index );
2353      }
2354      break;
2355
2356   case TGSI_OPCODE_SLE:
2357      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2358         FETCH( &r[0], 0, chan_index );
2359         FETCH( &r[1], 1, chan_index );
2360         micro_le( &r[0], &r[0], &r[1], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] );
2361         STORE( &r[0], 0, chan_index );
2362      }
2363      break;
2364
2365   case TGSI_OPCODE_SNE:
2366      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2367         FETCH( &r[0], 0, chan_index );
2368         FETCH( &r[1], 1, chan_index );
2369         micro_eq( &r[0], &r[0], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C] );
2370         STORE( &r[0], 0, chan_index );
2371      }
2372      break;
2373
2374   case TGSI_OPCODE_STR:
2375      assert (0);
2376      break;
2377
2378   case TGSI_OPCODE_TEX:
2379      /* simple texture lookup */
2380      /* src[0] = texcoord */
2381      /* src[1] = sampler unit */
2382      exec_tex(mach, inst, FALSE, FALSE);
2383      break;
2384
2385   case TGSI_OPCODE_TXB:
2386      /* Texture lookup with lod bias */
2387      /* src[0] = texcoord (src[0].w = LOD bias) */
2388      /* src[1] = sampler unit */
2389      exec_tex(mach, inst, TRUE, FALSE);
2390      break;
2391
2392   case TGSI_OPCODE_TXD:
2393      /* Texture lookup with explict partial derivatives */
2394      /* src[0] = texcoord */
2395      /* src[1] = d[strq]/dx */
2396      /* src[2] = d[strq]/dy */
2397      /* src[3] = sampler unit */
2398      assert (0);
2399      break;
2400
2401   case TGSI_OPCODE_TXL:
2402      /* Texture lookup with explit LOD */
2403      /* src[0] = texcoord (src[0].w = LOD) */
2404      /* src[1] = sampler unit */
2405      exec_tex(mach, inst, TRUE, FALSE);
2406      break;
2407
2408   case TGSI_OPCODE_TXP:
2409      /* Texture lookup with projection */
2410      /* src[0] = texcoord (src[0].w = projection) */
2411      /* src[1] = sampler unit */
2412      exec_tex(mach, inst, FALSE, TRUE);
2413      break;
2414
2415   case TGSI_OPCODE_UP2H:
2416      assert (0);
2417      break;
2418
2419   case TGSI_OPCODE_UP2US:
2420      assert (0);
2421      break;
2422
2423   case TGSI_OPCODE_UP4B:
2424      assert (0);
2425      break;
2426
2427   case TGSI_OPCODE_UP4UB:
2428      assert (0);
2429      break;
2430
2431   case TGSI_OPCODE_X2D:
2432      assert (0);
2433      break;
2434
2435   case TGSI_OPCODE_ARA:
2436      assert (0);
2437      break;
2438
2439   case TGSI_OPCODE_BRA:
2440      assert (0);
2441      break;
2442
2443   case TGSI_OPCODE_CAL:
2444      /* skip the call if no execution channels are enabled */
2445      if (mach->ExecMask) {
2446         /* do the call */
2447
2448         /* push the Cond, Loop, Cont stacks */
2449         assert(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING);
2450         mach->CondStack[mach->CondStackTop++] = mach->CondMask;
2451         assert(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
2452         mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask;
2453         assert(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
2454         mach->ContStack[mach->ContStackTop++] = mach->ContMask;
2455
2456         assert(mach->FuncStackTop < TGSI_EXEC_MAX_CALL_NESTING);
2457         mach->FuncStack[mach->FuncStackTop++] = mach->FuncMask;
2458
2459         /* note that PC was already incremented above */
2460         mach->CallStack[mach->CallStackTop++] = *pc;
2461         *pc = inst->InstructionExtLabel.Label;
2462      }
2463      break;
2464
2465   case TGSI_OPCODE_RET:
2466      mach->FuncMask &= ~mach->ExecMask;
2467      UPDATE_EXEC_MASK(mach);
2468
2469      if (mach->FuncMask == 0x0) {
2470         /* really return now (otherwise, keep executing */
2471
2472         if (mach->CallStackTop == 0) {
2473            /* returning from main() */
2474            *pc = -1;
2475            return;
2476         }
2477         *pc = mach->CallStack[--mach->CallStackTop];
2478
2479         /* pop the Cond, Loop, Cont stacks */
2480         assert(mach->CondStackTop > 0);
2481         mach->CondMask = mach->CondStack[--mach->CondStackTop];
2482         assert(mach->LoopStackTop > 0);
2483         mach->LoopMask = mach->LoopStack[--mach->LoopStackTop];
2484         assert(mach->ContStackTop > 0);
2485         mach->ContMask = mach->ContStack[--mach->ContStackTop];
2486         assert(mach->FuncStackTop > 0);
2487         mach->FuncMask = mach->FuncStack[--mach->FuncStackTop];
2488
2489         UPDATE_EXEC_MASK(mach);
2490      }
2491      break;
2492
2493   case TGSI_OPCODE_SSG:
2494   /* TGSI_OPCODE_SGN */
2495      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2496         FETCH( &r[0], 0, chan_index );
2497         micro_sgn( &r[0], &r[0] );
2498         STORE( &r[0], 0, chan_index );
2499      }
2500      break;
2501
2502   case TGSI_OPCODE_CMP:
2503      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2504         FETCH(&r[0], 0, chan_index);
2505         FETCH(&r[1], 1, chan_index);
2506         FETCH(&r[2], 2, chan_index);
2507
2508         micro_lt( &r[0], &r[0], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &r[1], &r[2] );
2509
2510         STORE(&r[0], 0, chan_index);
2511      }
2512      break;
2513
2514   case TGSI_OPCODE_SCS:
2515      if( IS_CHANNEL_ENABLED( *inst, CHAN_X ) || IS_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
2516         FETCH( &r[0], 0, CHAN_X );
2517      }
2518      if( IS_CHANNEL_ENABLED( *inst, CHAN_X ) ) {
2519         micro_cos( &r[1], &r[0] );
2520         STORE( &r[1], 0, CHAN_X );
2521      }
2522      if( IS_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
2523         micro_sin( &r[1], &r[0] );
2524         STORE( &r[1], 0, CHAN_Y );
2525      }
2526      if( IS_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
2527         STORE( &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], 0, CHAN_Z );
2528      }
2529      if( IS_CHANNEL_ENABLED( *inst, CHAN_W ) ) {
2530         STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
2531      }
2532      break;
2533
2534   case TGSI_OPCODE_NRM:
2535      /* 3-component vector normalize */
2536      {
2537         union tgsi_exec_channel tmp, dot;
2538
2539         /* tmp = dp3(src0, src0): */
2540         FETCH( &r[0], 0, CHAN_X );
2541         micro_mul( &tmp, &r[0], &r[0] );
2542
2543         FETCH( &r[1], 0, CHAN_Y );
2544         micro_mul( &dot, &r[1], &r[1] );
2545         micro_add( &tmp, &tmp, &dot );
2546
2547         FETCH( &r[2], 0, CHAN_Z );
2548         micro_mul( &dot, &r[2], &r[2] );
2549         micro_add( &tmp, &tmp, &dot );
2550
2551         /* tmp = 1 / sqrt(tmp) */
2552         micro_sqrt( &tmp, &tmp );
2553         micro_div( &tmp, &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &tmp );
2554
2555         /* note: w channel is undefined */
2556         FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2557            /* chan = chan * tmp */
2558            micro_mul( &r[chan_index], &tmp, &r[chan_index] );
2559            STORE( &r[chan_index], 0, chan_index );
2560         }
2561      }
2562      break;
2563
2564   case TGSI_OPCODE_NRM4:
2565      /* 4-component vector normalize */
2566      {
2567         union tgsi_exec_channel tmp, dot;
2568
2569         /* tmp = dp4(src0, src0): */
2570         FETCH( &r[0], 0, CHAN_X );
2571         micro_mul( &tmp, &r[0], &r[0] );
2572
2573         FETCH( &r[1], 0, CHAN_Y );
2574         micro_mul( &dot, &r[1], &r[1] );
2575         micro_add( &tmp, &tmp, &dot );
2576
2577         FETCH( &r[2], 0, CHAN_Z );
2578         micro_mul( &dot, &r[2], &r[2] );
2579         micro_add( &tmp, &tmp, &dot );
2580
2581         FETCH( &r[3], 0, CHAN_W );
2582         micro_mul( &dot, &r[3], &r[3] );
2583         micro_add( &tmp, &tmp, &dot );
2584
2585         /* tmp = 1 / sqrt(tmp) */
2586         micro_sqrt( &tmp, &tmp );
2587         micro_div( &tmp, &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &tmp );
2588
2589         FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2590            /* chan = chan * tmp */
2591            micro_mul( &r[chan_index], &tmp, &r[chan_index] );
2592            STORE( &r[chan_index], 0, chan_index );
2593         }
2594      }
2595      break;
2596
2597   case TGSI_OPCODE_DIV:
2598      assert( 0 );
2599      break;
2600
2601   case TGSI_OPCODE_DP2:
2602      FETCH( &r[0], 0, CHAN_X );
2603      FETCH( &r[1], 1, CHAN_X );
2604      micro_mul( &r[0], &r[0], &r[1] );
2605
2606      FETCH( &r[1], 0, CHAN_Y );
2607      FETCH( &r[2], 1, CHAN_Y );
2608      micro_mul( &r[1], &r[1], &r[2] );
2609      micro_add( &r[0], &r[0], &r[1] );
2610
2611      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2612         STORE( &r[0], 0, chan_index );
2613      }
2614      break;
2615
2616   case TGSI_OPCODE_IF:
2617      /* push CondMask */
2618      assert(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING);
2619      mach->CondStack[mach->CondStackTop++] = mach->CondMask;
2620      FETCH( &r[0], 0, CHAN_X );
2621      /* update CondMask */
2622      if( ! r[0].u[0] ) {
2623         mach->CondMask &= ~0x1;
2624      }
2625      if( ! r[0].u[1] ) {
2626         mach->CondMask &= ~0x2;
2627      }
2628      if( ! r[0].u[2] ) {
2629         mach->CondMask &= ~0x4;
2630      }
2631      if( ! r[0].u[3] ) {
2632         mach->CondMask &= ~0x8;
2633      }
2634      UPDATE_EXEC_MASK(mach);
2635      /* Todo: If CondMask==0, jump to ELSE */
2636      break;
2637
2638   case TGSI_OPCODE_ELSE:
2639      /* invert CondMask wrt previous mask */
2640      {
2641         uint prevMask;
2642         assert(mach->CondStackTop > 0);
2643         prevMask = mach->CondStack[mach->CondStackTop - 1];
2644         mach->CondMask = ~mach->CondMask & prevMask;
2645         UPDATE_EXEC_MASK(mach);
2646         /* Todo: If CondMask==0, jump to ENDIF */
2647      }
2648      break;
2649
2650   case TGSI_OPCODE_ENDIF:
2651      /* pop CondMask */
2652      assert(mach->CondStackTop > 0);
2653      mach->CondMask = mach->CondStack[--mach->CondStackTop];
2654      UPDATE_EXEC_MASK(mach);
2655      break;
2656
2657   case TGSI_OPCODE_END:
2658      /* halt execution */
2659      *pc = -1;
2660      break;
2661
2662   case TGSI_OPCODE_REP:
2663      assert (0);
2664      break;
2665
2666   case TGSI_OPCODE_ENDREP:
2667       assert (0);
2668       break;
2669
2670   case TGSI_OPCODE_PUSHA:
2671      assert (0);
2672      break;
2673
2674   case TGSI_OPCODE_POPA:
2675      assert (0);
2676      break;
2677
2678   case TGSI_OPCODE_CEIL:
2679      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2680         FETCH( &r[0], 0, chan_index );
2681         micro_ceil( &r[0], &r[0] );
2682         STORE( &r[0], 0, chan_index );
2683      }
2684      break;
2685
2686   case TGSI_OPCODE_I2F:
2687      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2688         FETCH( &r[0], 0, chan_index );
2689         micro_i2f( &r[0], &r[0] );
2690         STORE( &r[0], 0, chan_index );
2691      }
2692      break;
2693
2694   case TGSI_OPCODE_NOT:
2695      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2696         FETCH( &r[0], 0, chan_index );
2697         micro_not( &r[0], &r[0] );
2698         STORE( &r[0], 0, chan_index );
2699      }
2700      break;
2701
2702   case TGSI_OPCODE_TRUNC:
2703      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2704         FETCH( &r[0], 0, chan_index );
2705         micro_trunc( &r[0], &r[0] );
2706         STORE( &r[0], 0, chan_index );
2707      }
2708      break;
2709
2710   case TGSI_OPCODE_SHL:
2711      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2712         FETCH( &r[0], 0, chan_index );
2713         FETCH( &r[1], 1, chan_index );
2714         micro_shl( &r[0], &r[0], &r[1] );
2715         STORE( &r[0], 0, chan_index );
2716      }
2717      break;
2718
2719   case TGSI_OPCODE_SHR:
2720      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2721         FETCH( &r[0], 0, chan_index );
2722         FETCH( &r[1], 1, chan_index );
2723         micro_ishr( &r[0], &r[0], &r[1] );
2724         STORE( &r[0], 0, chan_index );
2725      }
2726      break;
2727
2728   case TGSI_OPCODE_AND:
2729      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2730         FETCH( &r[0], 0, chan_index );
2731         FETCH( &r[1], 1, chan_index );
2732         micro_and( &r[0], &r[0], &r[1] );
2733         STORE( &r[0], 0, chan_index );
2734      }
2735      break;
2736
2737   case TGSI_OPCODE_OR:
2738      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2739         FETCH( &r[0], 0, chan_index );
2740         FETCH( &r[1], 1, chan_index );
2741         micro_or( &r[0], &r[0], &r[1] );
2742         STORE( &r[0], 0, chan_index );
2743      }
2744      break;
2745
2746   case TGSI_OPCODE_MOD:
2747      assert (0);
2748      break;
2749
2750   case TGSI_OPCODE_XOR:
2751      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2752         FETCH( &r[0], 0, chan_index );
2753         FETCH( &r[1], 1, chan_index );
2754         micro_xor( &r[0], &r[0], &r[1] );
2755         STORE( &r[0], 0, chan_index );
2756      }
2757      break;
2758
2759   case TGSI_OPCODE_SAD:
2760      assert (0);
2761      break;
2762
2763   case TGSI_OPCODE_TXF:
2764      assert (0);
2765      break;
2766
2767   case TGSI_OPCODE_TXQ:
2768      assert (0);
2769      break;
2770
2771   case TGSI_OPCODE_EMIT:
2772      mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] += 16;
2773      mach->Primitives[mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]]++;
2774      break;
2775
2776   case TGSI_OPCODE_ENDPRIM:
2777      mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]++;
2778      mach->Primitives[mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]] = 0;
2779      break;
2780
2781   case TGSI_OPCODE_LOOP:
2782      /* fall-through (for now) */
2783   case TGSI_OPCODE_BGNLOOP2:
2784      /* push LoopMask and ContMasks */
2785      assert(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
2786      mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask;
2787      assert(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
2788      mach->ContStack[mach->ContStackTop++] = mach->ContMask;
2789      break;
2790
2791   case TGSI_OPCODE_ENDLOOP:
2792      /* fall-through (for now at least) */
2793   case TGSI_OPCODE_ENDLOOP2:
2794      /* Restore ContMask, but don't pop */
2795      assert(mach->ContStackTop > 0);
2796      mach->ContMask = mach->ContStack[mach->ContStackTop - 1];
2797      UPDATE_EXEC_MASK(mach);
2798      if (mach->ExecMask) {
2799         /* repeat loop: jump to instruction just past BGNLOOP */
2800         *pc = inst->InstructionExtLabel.Label + 1;
2801      }
2802      else {
2803         /* exit loop: pop LoopMask */
2804         assert(mach->LoopStackTop > 0);
2805         mach->LoopMask = mach->LoopStack[--mach->LoopStackTop];
2806         /* pop ContMask */
2807         assert(mach->ContStackTop > 0);
2808         mach->ContMask = mach->ContStack[--mach->ContStackTop];
2809      }
2810      UPDATE_EXEC_MASK(mach);
2811      break;
2812
2813   case TGSI_OPCODE_BRK:
2814      /* turn off loop channels for each enabled exec channel */
2815      mach->LoopMask &= ~mach->ExecMask;
2816      /* Todo: if mach->LoopMask == 0, jump to end of loop */
2817      UPDATE_EXEC_MASK(mach);
2818      break;
2819
2820   case TGSI_OPCODE_CONT:
2821      /* turn off cont channels for each enabled exec channel */
2822      mach->ContMask &= ~mach->ExecMask;
2823      /* Todo: if mach->LoopMask == 0, jump to end of loop */
2824      UPDATE_EXEC_MASK(mach);
2825      break;
2826
2827   case TGSI_OPCODE_BGNSUB:
2828      /* no-op */
2829      break;
2830
2831   case TGSI_OPCODE_ENDSUB:
2832      /* no-op */
2833      break;
2834
2835   case TGSI_OPCODE_NOISE1:
2836      assert( 0 );
2837      break;
2838
2839   case TGSI_OPCODE_NOISE2:
2840      assert( 0 );
2841      break;
2842
2843   case TGSI_OPCODE_NOISE3:
2844      assert( 0 );
2845      break;
2846
2847   case TGSI_OPCODE_NOISE4:
2848      assert( 0 );
2849      break;
2850
2851   case TGSI_OPCODE_NOP:
2852      break;
2853
2854   default:
2855      assert( 0 );
2856   }
2857}
2858
2859
2860/**
2861 * Run TGSI interpreter.
2862 * \return bitmask of "alive" quad components
2863 */
2864uint
2865tgsi_exec_machine_run( struct tgsi_exec_machine *mach )
2866{
2867   uint i;
2868   int pc = 0;
2869
2870   mach->CondMask = 0xf;
2871   mach->LoopMask = 0xf;
2872   mach->ContMask = 0xf;
2873   mach->FuncMask = 0xf;
2874   mach->ExecMask = 0xf;
2875
2876   mach->CondStackTop = 0; /* temporarily subvert this assertion */
2877   assert(mach->CondStackTop == 0);
2878   assert(mach->LoopStackTop == 0);
2879   assert(mach->ContStackTop == 0);
2880   assert(mach->CallStackTop == 0);
2881
2882   mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] = 0;
2883   mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] = 0;
2884
2885   if( mach->Processor == TGSI_PROCESSOR_GEOMETRY ) {
2886      mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0] = 0;
2887      mach->Primitives[0] = 0;
2888   }
2889
2890   for (i = 0; i < QUAD_SIZE; i++) {
2891      mach->Temps[TEMP_CC_I].xyzw[TEMP_CC_C].u[i] =
2892         (TGSI_EXEC_CC_EQ << TGSI_EXEC_CC_X_SHIFT) |
2893         (TGSI_EXEC_CC_EQ << TGSI_EXEC_CC_Y_SHIFT) |
2894         (TGSI_EXEC_CC_EQ << TGSI_EXEC_CC_Z_SHIFT) |
2895         (TGSI_EXEC_CC_EQ << TGSI_EXEC_CC_W_SHIFT);
2896   }
2897
2898   /* execute declarations (interpolants) */
2899   for (i = 0; i < mach->NumDeclarations; i++) {
2900      exec_declaration( mach, mach->Declarations+i );
2901   }
2902
2903   /* execute instructions, until pc is set to -1 */
2904   while (pc != -1) {
2905      assert(pc < (int) mach->NumInstructions);
2906      exec_instruction( mach, mach->Instructions + pc, &pc );
2907   }
2908
2909#if 0
2910   /* we scale from floats in [0,1] to Zbuffer ints in sp_quad_depth_test.c */
2911   if (mach->Processor == TGSI_PROCESSOR_FRAGMENT) {
2912      /*
2913       * Scale back depth component.
2914       */
2915      for (i = 0; i < 4; i++)
2916         mach->Outputs[0].xyzw[2].f[i] *= ctx->DrawBuffer->_DepthMaxF;
2917   }
2918#endif
2919
2920   return ~mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
2921}
2922
2923
2924