tgsi_exec.c revision 4d710dd3cf3187e94e5765b46e4dd6899a7a41d6
1/**************************************************************************
2 *
3 * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28/**
29 * TGSI interpreter/executor.
30 *
31 * Flow control information:
32 *
33 * Since we operate on 'quads' (4 pixels or 4 vertices in parallel)
34 * flow control statements (IF/ELSE/ENDIF, LOOP/ENDLOOP) require special
35 * care since a condition may be true for some quad components but false
36 * for other components.
37 *
38 * We basically execute all statements (even if they're in the part of
39 * an IF/ELSE clause that's "not taken") and use a special mask to
40 * control writing to destination registers.  This is the ExecMask.
41 * See store_dest().
42 *
 * The ExecMask is computed from four other masks (CondMask, LoopMask,
 * ContMask and FuncMask; see UPDATE_EXEC_MASK) which are controlled by the
 * flow control instructions (e.g. IF/ELSE/ENDIF, LOOP/ENDLOOP and CONT).
46 *
47 *
48 * Authors:
49 *   Michal Krol
50 *   Brian Paul
51 */
52
53#include "pipe/p_compiler.h"
54#include "pipe/p_state.h"
55#include "pipe/p_shader_tokens.h"
56#include "tgsi/tgsi_parse.h"
57#include "tgsi/tgsi_util.h"
58#include "tgsi_exec.h"
59#include "util/u_memory.h"
60#include "util/u_math.h"
61
/*
 * When non-zero, micro_exp2(), micro_lg2() and micro_pow() use the
 * util_fast_*() routines instead of the C math library.
 */
#define FAST_MATH 1
63
/* Lane index of each position within a quad (used by micro_ddx/micro_ddy). */
#define TILE_TOP_LEFT      0
#define TILE_TOP_RIGHT     1
#define TILE_BOTTOM_LEFT   2
#define TILE_BOTTOM_RIGHT  3

/* Index of each register channel. */
#define CHAN_X   0
#define CHAN_Y   1
#define CHAN_Z   2
#define CHAN_W   3
73
/*
 * Short aliases for the TGSI_EXEC_TEMP_* utility register locations.
 * An _I suffix is a register index, a _C suffix is a channel within it.
 */

/* Bit-pattern constants written by tgsi_exec_machine_init(). */
#define TEMP_0_I           TGSI_EXEC_TEMP_00000000_I
#define TEMP_0_C           TGSI_EXEC_TEMP_00000000_C
#define TEMP_7F_I          TGSI_EXEC_TEMP_7FFFFFFF_I
#define TEMP_7F_C          TGSI_EXEC_TEMP_7FFFFFFF_C
#define TEMP_80_I          TGSI_EXEC_TEMP_80000000_I
#define TEMP_80_C          TGSI_EXEC_TEMP_80000000_C
#define TEMP_FF_I          TGSI_EXEC_TEMP_FFFFFFFF_I
#define TEMP_FF_C          TGSI_EXEC_TEMP_FFFFFFFF_C

/* Floating-point constants written by tgsi_exec_machine_init(). */
#define TEMP_1_I           TGSI_EXEC_TEMP_ONE_I
#define TEMP_1_C           TGSI_EXEC_TEMP_ONE_C
#define TEMP_2_I           TGSI_EXEC_TEMP_TWO_I
#define TEMP_2_C           TGSI_EXEC_TEMP_TWO_C
#define TEMP_128_I         TGSI_EXEC_TEMP_128_I
#define TEMP_128_C         TGSI_EXEC_TEMP_128_C
#define TEMP_M128_I        TGSI_EXEC_TEMP_MINUS_128_I
#define TEMP_M128_C        TGSI_EXEC_TEMP_MINUS_128_C
#define TEMP_3_I           TGSI_EXEC_TEMP_THREE_I
#define TEMP_3_C           TGSI_EXEC_TEMP_THREE_C
#define TEMP_HALF_I        TGSI_EXEC_TEMP_HALF_I
#define TEMP_HALF_C        TGSI_EXEC_TEMP_HALF_C

/* Other utility slots; they are not initialized in this part of the file. */
#define TEMP_KILMASK_I     TGSI_EXEC_TEMP_KILMASK_I
#define TEMP_KILMASK_C     TGSI_EXEC_TEMP_KILMASK_C
#define TEMP_OUTPUT_I      TGSI_EXEC_TEMP_OUTPUT_I
#define TEMP_OUTPUT_C      TGSI_EXEC_TEMP_OUTPUT_C
#define TEMP_PRIMITIVE_I   TGSI_EXEC_TEMP_PRIMITIVE_I
#define TEMP_PRIMITIVE_C   TGSI_EXEC_TEMP_PRIMITIVE_C
#define TEMP_CC_I          TGSI_EXEC_TEMP_CC_I
#define TEMP_CC_C          TGSI_EXEC_TEMP_CC_C
#define TEMP_R0            TGSI_EXEC_TEMP_R0

106
/*
 * Non-zero when channel CHAN is set in the write mask of the first
 * destination register of instruction INST.
 */
#define IS_CHANNEL_ENABLED(INST, CHAN) \
   ((INST).FullDstRegisters[0].DstRegister.WriteMask & (1 << (CHAN)))
109
/*
 * Non-zero when channel CHAN is set in the write mask of the second
 * destination register of instruction INST.
 */
#define IS_CHANNEL_ENABLED2(INST, CHAN) \
   ((INST).FullDstRegisters[1].DstRegister.WriteMask & (1 << (CHAN)))
112
/*
 * Loop header: runs the following statement once for every channel CHAN
 * (0 .. NUM_CHANNELS-1) enabled in the first destination write mask of INST.
 *
 * The test is written as "if (!enabled) {} else" so that the expansion is a
 * complete if/else; an "else" written by the caller after the loop body
 * therefore binds to the caller's own "if", not to the one in this macro.
 */
#define FOR_EACH_ENABLED_CHANNEL(INST, CHAN)\
   for ((CHAN) = 0; (CHAN) < NUM_CHANNELS; (CHAN)++)\
      if (!IS_CHANNEL_ENABLED( INST, CHAN )) {} else
116
/*
 * Loop header: runs the following statement once for every channel CHAN
 * (0 .. NUM_CHANNELS-1) enabled in the second destination write mask of INST.
 *
 * The test is written as "if (!enabled) {} else" so that the expansion is a
 * complete if/else; an "else" written by the caller after the loop body
 * therefore binds to the caller's own "if", not to the one in this macro.
 */
#define FOR_EACH_ENABLED_CHANNEL2(INST, CHAN)\
   for ((CHAN) = 0; (CHAN) < NUM_CHANNELS; (CHAN)++)\
      if (!IS_CHANNEL_ENABLED2( INST, CHAN )) {} else
120
121
/**
 * Recompute the execution mask of MACH as the bitwise AND of its
 * conditional, loop, continue and function masks.
 * MACH is a pointer expression; it is evaluated more than once.
 */
#define UPDATE_EXEC_MASK(MACH) \
      ((MACH)->ExecMask = (MACH)->CondMask & (MACH)->LoopMask & \
                          (MACH)->ContMask & (MACH)->FuncMask)
125
126/**
127 * Initialize machine state by expanding tokens to full instructions,
128 * allocating temporary storage, setting up constants, etc.
129 * After this, we can call tgsi_exec_machine_run() many times.
130 */
131void
132tgsi_exec_machine_bind_shader(
133   struct tgsi_exec_machine *mach,
134   const struct tgsi_token *tokens,
135   uint numSamplers,
136   struct tgsi_sampler **samplers)
137{
138   uint k;
139   struct tgsi_parse_context parse;
140   struct tgsi_exec_labels *labels = &mach->Labels;
141   struct tgsi_full_instruction *instructions;
142   struct tgsi_full_declaration *declarations;
143   uint maxInstructions = 10, numInstructions = 0;
144   uint maxDeclarations = 10, numDeclarations = 0;
145   uint instno = 0;
146
147#if 0
148   tgsi_dump(tokens, 0);
149#endif
150
151   util_init_math();
152
153   mach->Tokens = tokens;
154   mach->Samplers = samplers;
155
156   k = tgsi_parse_init (&parse, mach->Tokens);
157   if (k != TGSI_PARSE_OK) {
158      debug_printf( "Problem parsing!\n" );
159      return;
160   }
161
162   mach->Processor = parse.FullHeader.Processor.Processor;
163   mach->ImmLimit = 0;
164   labels->count = 0;
165
166   declarations = (struct tgsi_full_declaration *)
167      MALLOC( maxDeclarations * sizeof(struct tgsi_full_declaration) );
168
169   if (!declarations) {
170      return;
171   }
172
173   instructions = (struct tgsi_full_instruction *)
174      MALLOC( maxInstructions * sizeof(struct tgsi_full_instruction) );
175
176   if (!instructions) {
177      FREE( declarations );
178      return;
179   }
180
181   while( !tgsi_parse_end_of_tokens( &parse ) ) {
182      uint pointer = parse.Position;
183      uint i;
184
185      tgsi_parse_token( &parse );
186      switch( parse.FullToken.Token.Type ) {
187      case TGSI_TOKEN_TYPE_DECLARATION:
188         /* save expanded declaration */
189         if (numDeclarations == maxDeclarations) {
190            declarations = REALLOC(declarations,
191                                   maxDeclarations
192                                   * sizeof(struct tgsi_full_declaration),
193                                   (maxDeclarations + 10)
194                                   * sizeof(struct tgsi_full_declaration));
195            maxDeclarations += 10;
196         }
197         memcpy(declarations + numDeclarations,
198                &parse.FullToken.FullDeclaration,
199                sizeof(declarations[0]));
200         numDeclarations++;
201         break;
202
203      case TGSI_TOKEN_TYPE_IMMEDIATE:
204         {
205            uint size = parse.FullToken.FullImmediate.Immediate.Size - 1;
206            assert( size % 4 == 0 );
207            assert( mach->ImmLimit + size / 4 <= TGSI_EXEC_NUM_IMMEDIATES );
208
209            for( i = 0; i < size; i++ ) {
210               mach->Imms[mach->ImmLimit + i / 4][i % 4] =
211		  parse.FullToken.FullImmediate.u.ImmediateFloat32[i].Float;
212            }
213            mach->ImmLimit += size / 4;
214         }
215         break;
216
217      case TGSI_TOKEN_TYPE_INSTRUCTION:
218         assert( labels->count < MAX_LABELS );
219
220         labels->labels[labels->count][0] = instno;
221         labels->labels[labels->count][1] = pointer;
222         labels->count++;
223
224         /* save expanded instruction */
225         if (numInstructions == maxInstructions) {
226            instructions = REALLOC(instructions,
227                                   maxInstructions
228                                   * sizeof(struct tgsi_full_instruction),
229                                   (maxInstructions + 10)
230                                   * sizeof(struct tgsi_full_instruction));
231            maxInstructions += 10;
232         }
233         memcpy(instructions + numInstructions,
234                &parse.FullToken.FullInstruction,
235                sizeof(instructions[0]));
236         numInstructions++;
237         break;
238
239      default:
240         assert( 0 );
241      }
242   }
243   tgsi_parse_free (&parse);
244
245   if (mach->Declarations) {
246      FREE( mach->Declarations );
247   }
248   mach->Declarations = declarations;
249   mach->NumDeclarations = numDeclarations;
250
251   if (mach->Instructions) {
252      FREE( mach->Instructions );
253   }
254   mach->Instructions = instructions;
255   mach->NumInstructions = numInstructions;
256}
257
258
259void
260tgsi_exec_machine_init(
261   struct tgsi_exec_machine *mach )
262{
263   uint i;
264
265   mach->Temps = (struct tgsi_exec_vector *) tgsi_align_128bit( mach->_Temps);
266   mach->Addrs = &mach->Temps[TGSI_EXEC_TEMP_ADDR];
267
268   /* Setup constants. */
269   for( i = 0; i < 4; i++ ) {
270      mach->Temps[TEMP_0_I].xyzw[TEMP_0_C].u[i] = 0x00000000;
271      mach->Temps[TEMP_7F_I].xyzw[TEMP_7F_C].u[i] = 0x7FFFFFFF;
272      mach->Temps[TEMP_80_I].xyzw[TEMP_80_C].u[i] = 0x80000000;
273      mach->Temps[TEMP_FF_I].xyzw[TEMP_FF_C].u[i] = 0xFFFFFFFF;
274      mach->Temps[TEMP_1_I].xyzw[TEMP_1_C].f[i] = 1.0f;
275      mach->Temps[TEMP_2_I].xyzw[TEMP_2_C].f[i] = 2.0f;
276      mach->Temps[TEMP_128_I].xyzw[TEMP_128_C].f[i] = 128.0f;
277      mach->Temps[TEMP_M128_I].xyzw[TEMP_M128_C].f[i] = -128.0f;
278      mach->Temps[TEMP_3_I].xyzw[TEMP_3_C].f[i] = 3.0f;
279      mach->Temps[TEMP_HALF_I].xyzw[TEMP_HALF_C].f[i] = 0.5f;
280   }
281}
282
283
284void
285tgsi_exec_machine_free_data(struct tgsi_exec_machine *mach)
286{
287   if (mach->Instructions) {
288      FREE(mach->Instructions);
289      mach->Instructions = NULL;
290      mach->NumInstructions = 0;
291   }
292   if (mach->Declarations) {
293      FREE(mach->Declarations);
294      mach->Declarations = NULL;
295      mach->NumDeclarations = 0;
296   }
297}
298
299
300static void
301micro_abs(
302   union tgsi_exec_channel *dst,
303   const union tgsi_exec_channel *src )
304{
305   dst->f[0] = fabsf( src->f[0] );
306   dst->f[1] = fabsf( src->f[1] );
307   dst->f[2] = fabsf( src->f[2] );
308   dst->f[3] = fabsf( src->f[3] );
309}
310
311static void
312micro_add(
313   union tgsi_exec_channel *dst,
314   const union tgsi_exec_channel *src0,
315   const union tgsi_exec_channel *src1 )
316{
317   dst->f[0] = src0->f[0] + src1->f[0];
318   dst->f[1] = src0->f[1] + src1->f[1];
319   dst->f[2] = src0->f[2] + src1->f[2];
320   dst->f[3] = src0->f[3] + src1->f[3];
321}
322
#if 0
/** Per-lane signed integer addition: dst = src0 + src1 (currently disabled). */
static void
micro_iadd(
   union tgsi_exec_channel *dst,
   const union tgsi_exec_channel *src0,
   const union tgsi_exec_channel *src1 )
{
   unsigned lane;

   for (lane = 0; lane < 4; lane++) {
      dst->i[lane] = src0->i[lane] + src1->i[lane];
   }
}
#endif
336
337static void
338micro_and(
339   union tgsi_exec_channel *dst,
340   const union tgsi_exec_channel *src0,
341   const union tgsi_exec_channel *src1 )
342{
343   dst->u[0] = src0->u[0] & src1->u[0];
344   dst->u[1] = src0->u[1] & src1->u[1];
345   dst->u[2] = src0->u[2] & src1->u[2];
346   dst->u[3] = src0->u[3] & src1->u[3];
347}
348
349static void
350micro_ceil(
351   union tgsi_exec_channel *dst,
352   const union tgsi_exec_channel *src )
353{
354   dst->f[0] = ceilf( src->f[0] );
355   dst->f[1] = ceilf( src->f[1] );
356   dst->f[2] = ceilf( src->f[2] );
357   dst->f[3] = ceilf( src->f[3] );
358}
359
360static void
361micro_cos(
362   union tgsi_exec_channel *dst,
363   const union tgsi_exec_channel *src )
364{
365   dst->f[0] = cosf( src->f[0] );
366   dst->f[1] = cosf( src->f[1] );
367   dst->f[2] = cosf( src->f[2] );
368   dst->f[3] = cosf( src->f[3] );
369}
370
371static void
372micro_ddx(
373   union tgsi_exec_channel *dst,
374   const union tgsi_exec_channel *src )
375{
376   dst->f[0] =
377   dst->f[1] =
378   dst->f[2] =
379   dst->f[3] = src->f[TILE_BOTTOM_RIGHT] - src->f[TILE_BOTTOM_LEFT];
380}
381
382static void
383micro_ddy(
384   union tgsi_exec_channel *dst,
385   const union tgsi_exec_channel *src )
386{
387   dst->f[0] =
388   dst->f[1] =
389   dst->f[2] =
390   dst->f[3] = src->f[TILE_TOP_LEFT] - src->f[TILE_BOTTOM_LEFT];
391}
392
393static void
394micro_div(
395   union tgsi_exec_channel *dst,
396   const union tgsi_exec_channel *src0,
397   const union tgsi_exec_channel *src1 )
398{
399   if (src1->f[0] != 0) {
400      dst->f[0] = src0->f[0] / src1->f[0];
401   }
402   if (src1->f[1] != 0) {
403      dst->f[1] = src0->f[1] / src1->f[1];
404   }
405   if (src1->f[2] != 0) {
406      dst->f[2] = src0->f[2] / src1->f[2];
407   }
408   if (src1->f[3] != 0) {
409      dst->f[3] = src0->f[3] / src1->f[3];
410   }
411}
412
#if 0
/** Per-lane unsigned integer division: dst = src0 / src1 (currently disabled). */
static void
micro_udiv(
   union tgsi_exec_channel *dst,
   const union tgsi_exec_channel *src0,
   const union tgsi_exec_channel *src1 )
{
   unsigned lane;

   for (lane = 0; lane < 4; lane++) {
      dst->u[lane] = src0->u[lane] / src1->u[lane];
   }
}
#endif
426
427static void
428micro_eq(
429   union tgsi_exec_channel *dst,
430   const union tgsi_exec_channel *src0,
431   const union tgsi_exec_channel *src1,
432   const union tgsi_exec_channel *src2,
433   const union tgsi_exec_channel *src3 )
434{
435   dst->f[0] = src0->f[0] == src1->f[0] ? src2->f[0] : src3->f[0];
436   dst->f[1] = src0->f[1] == src1->f[1] ? src2->f[1] : src3->f[1];
437   dst->f[2] = src0->f[2] == src1->f[2] ? src2->f[2] : src3->f[2];
438   dst->f[3] = src0->f[3] == src1->f[3] ? src2->f[3] : src3->f[3];
439}
440
#if 0
/**
 * Per-lane signed integer select on equality (currently disabled):
 * dst = (src0 == src1) ? src2 : src3.
 */
static void
micro_ieq(
   union tgsi_exec_channel *dst,
   const union tgsi_exec_channel *src0,
   const union tgsi_exec_channel *src1,
   const union tgsi_exec_channel *src2,
   const union tgsi_exec_channel *src3 )
{
   unsigned lane;

   for (lane = 0; lane < 4; lane++) {
      if (src0->i[lane] == src1->i[lane]) {
         dst->i[lane] = src2->i[lane];
      }
      else {
         dst->i[lane] = src3->i[lane];
      }
   }
}
#endif
456
457static void
458micro_exp2(
459   union tgsi_exec_channel *dst,
460   const union tgsi_exec_channel *src)
461{
462#if FAST_MATH
463   dst->f[0] = util_fast_exp2( src->f[0] );
464   dst->f[1] = util_fast_exp2( src->f[1] );
465   dst->f[2] = util_fast_exp2( src->f[2] );
466   dst->f[3] = util_fast_exp2( src->f[3] );
467#else
468   dst->f[0] = powf( 2.0f, src->f[0] );
469   dst->f[1] = powf( 2.0f, src->f[1] );
470   dst->f[2] = powf( 2.0f, src->f[2] );
471   dst->f[3] = powf( 2.0f, src->f[3] );
472#endif
473}
474
#if 0
/** Per-lane float to unsigned conversion via a C cast (currently disabled). */
static void
micro_f2ut(
   union tgsi_exec_channel *dst,
   const union tgsi_exec_channel *src )
{
   unsigned lane;

   for (lane = 0; lane < 4; lane++) {
      dst->u[lane] = (uint) src->f[lane];
   }
}
#endif
487
488static void
489micro_flr(
490   union tgsi_exec_channel *dst,
491   const union tgsi_exec_channel *src )
492{
493   dst->f[0] = floorf( src->f[0] );
494   dst->f[1] = floorf( src->f[1] );
495   dst->f[2] = floorf( src->f[2] );
496   dst->f[3] = floorf( src->f[3] );
497}
498
499static void
500micro_frc(
501   union tgsi_exec_channel *dst,
502   const union tgsi_exec_channel *src )
503{
504   dst->f[0] = src->f[0] - floorf( src->f[0] );
505   dst->f[1] = src->f[1] - floorf( src->f[1] );
506   dst->f[2] = src->f[2] - floorf( src->f[2] );
507   dst->f[3] = src->f[3] - floorf( src->f[3] );
508}
509
510static void
511micro_ge(
512   union tgsi_exec_channel *dst,
513   const union tgsi_exec_channel *src0,
514   const union tgsi_exec_channel *src1,
515   const union tgsi_exec_channel *src2,
516   const union tgsi_exec_channel *src3 )
517{
518   dst->f[0] = src0->f[0] >= src1->f[0] ? src2->f[0] : src3->f[0];
519   dst->f[1] = src0->f[1] >= src1->f[1] ? src2->f[1] : src3->f[1];
520   dst->f[2] = src0->f[2] >= src1->f[2] ? src2->f[2] : src3->f[2];
521   dst->f[3] = src0->f[3] >= src1->f[3] ? src2->f[3] : src3->f[3];
522}
523
524static void
525micro_i2f(
526   union tgsi_exec_channel *dst,
527   const union tgsi_exec_channel *src )
528{
529   dst->f[0] = (float) src->i[0];
530   dst->f[1] = (float) src->i[1];
531   dst->f[2] = (float) src->i[2];
532   dst->f[3] = (float) src->i[3];
533}
534
535static void
536micro_lg2(
537   union tgsi_exec_channel *dst,
538   const union tgsi_exec_channel *src )
539{
540#if FAST_MATH
541   dst->f[0] = util_fast_log2( src->f[0] );
542   dst->f[1] = util_fast_log2( src->f[1] );
543   dst->f[2] = util_fast_log2( src->f[2] );
544   dst->f[3] = util_fast_log2( src->f[3] );
545#else
546   dst->f[0] = logf( src->f[0] ) * 1.442695f;
547   dst->f[1] = logf( src->f[1] ) * 1.442695f;
548   dst->f[2] = logf( src->f[2] ) * 1.442695f;
549   dst->f[3] = logf( src->f[3] ) * 1.442695f;
550#endif
551}
552
553static void
554micro_le(
555   union tgsi_exec_channel *dst,
556   const union tgsi_exec_channel *src0,
557   const union tgsi_exec_channel *src1,
558   const union tgsi_exec_channel *src2,
559   const union tgsi_exec_channel *src3 )
560{
561   dst->f[0] = src0->f[0] <= src1->f[0] ? src2->f[0] : src3->f[0];
562   dst->f[1] = src0->f[1] <= src1->f[1] ? src2->f[1] : src3->f[1];
563   dst->f[2] = src0->f[2] <= src1->f[2] ? src2->f[2] : src3->f[2];
564   dst->f[3] = src0->f[3] <= src1->f[3] ? src2->f[3] : src3->f[3];
565}
566
567static void
568micro_lt(
569   union tgsi_exec_channel *dst,
570   const union tgsi_exec_channel *src0,
571   const union tgsi_exec_channel *src1,
572   const union tgsi_exec_channel *src2,
573   const union tgsi_exec_channel *src3 )
574{
575   dst->f[0] = src0->f[0] < src1->f[0] ? src2->f[0] : src3->f[0];
576   dst->f[1] = src0->f[1] < src1->f[1] ? src2->f[1] : src3->f[1];
577   dst->f[2] = src0->f[2] < src1->f[2] ? src2->f[2] : src3->f[2];
578   dst->f[3] = src0->f[3] < src1->f[3] ? src2->f[3] : src3->f[3];
579}
580
#if 0
/**
 * Per-lane signed integer select on less-than (currently disabled):
 * dst = (src0 < src1) ? src2 : src3.
 */
static void
micro_ilt(
   union tgsi_exec_channel *dst,
   const union tgsi_exec_channel *src0,
   const union tgsi_exec_channel *src1,
   const union tgsi_exec_channel *src2,
   const union tgsi_exec_channel *src3 )
{
   unsigned lane;

   for (lane = 0; lane < 4; lane++) {
      if (src0->i[lane] < src1->i[lane]) {
         dst->i[lane] = src2->i[lane];
      }
      else {
         dst->i[lane] = src3->i[lane];
      }
   }
}
#endif
596
#if 0
/**
 * Per-lane unsigned integer select on less-than (currently disabled):
 * dst = (src0 < src1) ? src2 : src3.
 */
static void
micro_ult(
   union tgsi_exec_channel *dst,
   const union tgsi_exec_channel *src0,
   const union tgsi_exec_channel *src1,
   const union tgsi_exec_channel *src2,
   const union tgsi_exec_channel *src3 )
{
   unsigned lane;

   for (lane = 0; lane < 4; lane++) {
      if (src0->u[lane] < src1->u[lane]) {
         dst->u[lane] = src2->u[lane];
      }
      else {
         dst->u[lane] = src3->u[lane];
      }
   }
}
#endif
612
613static void
614micro_max(
615   union tgsi_exec_channel *dst,
616   const union tgsi_exec_channel *src0,
617   const union tgsi_exec_channel *src1 )
618{
619   dst->f[0] = src0->f[0] > src1->f[0] ? src0->f[0] : src1->f[0];
620   dst->f[1] = src0->f[1] > src1->f[1] ? src0->f[1] : src1->f[1];
621   dst->f[2] = src0->f[2] > src1->f[2] ? src0->f[2] : src1->f[2];
622   dst->f[3] = src0->f[3] > src1->f[3] ? src0->f[3] : src1->f[3];
623}
624
#if 0
/** Per-lane signed integer maximum (currently disabled). */
static void
micro_imax(
   union tgsi_exec_channel *dst,
   const union tgsi_exec_channel *src0,
   const union tgsi_exec_channel *src1 )
{
   unsigned lane;

   for (lane = 0; lane < 4; lane++) {
      if (src0->i[lane] > src1->i[lane]) {
         dst->i[lane] = src0->i[lane];
      }
      else {
         dst->i[lane] = src1->i[lane];
      }
   }
}
#endif
638
#if 0
/** Per-lane unsigned integer maximum (currently disabled). */
static void
micro_umax(
   union tgsi_exec_channel *dst,
   const union tgsi_exec_channel *src0,
   const union tgsi_exec_channel *src1 )
{
   unsigned lane;

   for (lane = 0; lane < 4; lane++) {
      if (src0->u[lane] > src1->u[lane]) {
         dst->u[lane] = src0->u[lane];
      }
      else {
         dst->u[lane] = src1->u[lane];
      }
   }
}
#endif
652
653static void
654micro_min(
655   union tgsi_exec_channel *dst,
656   const union tgsi_exec_channel *src0,
657   const union tgsi_exec_channel *src1 )
658{
659   dst->f[0] = src0->f[0] < src1->f[0] ? src0->f[0] : src1->f[0];
660   dst->f[1] = src0->f[1] < src1->f[1] ? src0->f[1] : src1->f[1];
661   dst->f[2] = src0->f[2] < src1->f[2] ? src0->f[2] : src1->f[2];
662   dst->f[3] = src0->f[3] < src1->f[3] ? src0->f[3] : src1->f[3];
663}
664
#if 0
/** Per-lane signed integer minimum (currently disabled). */
static void
micro_imin(
   union tgsi_exec_channel *dst,
   const union tgsi_exec_channel *src0,
   const union tgsi_exec_channel *src1 )
{
   unsigned lane;

   for (lane = 0; lane < 4; lane++) {
      if (src0->i[lane] < src1->i[lane]) {
         dst->i[lane] = src0->i[lane];
      }
      else {
         dst->i[lane] = src1->i[lane];
      }
   }
}
#endif
678
#if 0
/** Per-lane unsigned integer minimum (currently disabled). */
static void
micro_umin(
   union tgsi_exec_channel *dst,
   const union tgsi_exec_channel *src0,
   const union tgsi_exec_channel *src1 )
{
   unsigned lane;

   for (lane = 0; lane < 4; lane++) {
      if (src0->u[lane] < src1->u[lane]) {
         dst->u[lane] = src0->u[lane];
      }
      else {
         dst->u[lane] = src1->u[lane];
      }
   }
}
#endif
692
#if 0
/** Per-lane unsigned integer remainder: dst = src0 % src1 (currently disabled). */
static void
micro_umod(
   union tgsi_exec_channel *dst,
   const union tgsi_exec_channel *src0,
   const union tgsi_exec_channel *src1 )
{
   unsigned lane;

   for (lane = 0; lane < 4; lane++) {
      dst->u[lane] = src0->u[lane] % src1->u[lane];
   }
}
#endif
706
707static void
708micro_mul(
709   union tgsi_exec_channel *dst,
710   const union tgsi_exec_channel *src0,
711   const union tgsi_exec_channel *src1 )
712{
713   dst->f[0] = src0->f[0] * src1->f[0];
714   dst->f[1] = src0->f[1] * src1->f[1];
715   dst->f[2] = src0->f[2] * src1->f[2];
716   dst->f[3] = src0->f[3] * src1->f[3];
717}
718
#if 0
/** Per-lane signed integer multiplication: dst = src0 * src1 (currently disabled). */
static void
micro_imul(
   union tgsi_exec_channel *dst,
   const union tgsi_exec_channel *src0,
   const union tgsi_exec_channel *src1 )
{
   unsigned lane;

   for (lane = 0; lane < 4; lane++) {
      dst->i[lane] = src0->i[lane] * src1->i[lane];
   }
}
#endif
732
#if 0
/**
 * Per-lane signed multiply with two destinations (currently disabled).
 * dst1 receives the int-width product src0 * src1; dst0 is set to zero.
 */
static void
micro_imul64(
   union tgsi_exec_channel *dst0,
   union tgsi_exec_channel *dst1,
   const union tgsi_exec_channel *src0,
   const union tgsi_exec_channel *src1 )
{
   unsigned lane;

   /* all products first, then the zero fill, as two separate passes */
   for (lane = 0; lane < 4; lane++) {
      dst1->i[lane] = src0->i[lane] * src1->i[lane];
   }
   for (lane = 0; lane < 4; lane++) {
      dst0->i[lane] = 0;
   }
}
#endif
751
#if 0
/**
 * Per-lane unsigned multiply with two destinations (currently disabled).
 * dst1 receives the unsigned-width product src0 * src1; dst0 is set to zero.
 */
static void
micro_umul64(
   union tgsi_exec_channel *dst0,
   union tgsi_exec_channel *dst1,
   const union tgsi_exec_channel *src0,
   const union tgsi_exec_channel *src1 )
{
   unsigned lane;

   /* all products first, then the zero fill, as two separate passes */
   for (lane = 0; lane < 4; lane++) {
      dst1->u[lane] = src0->u[lane] * src1->u[lane];
   }
   for (lane = 0; lane < 4; lane++) {
      dst0->u[lane] = 0;
   }
}
#endif
770
771
#if 0
/**
 * Per-lane conditional move on the unsigned views (currently disabled):
 * dst = (src0 != 0) ? src1 : src2.
 */
static void
micro_movc(
   union tgsi_exec_channel *dst,
   const union tgsi_exec_channel *src0,
   const union tgsi_exec_channel *src1,
   const union tgsi_exec_channel *src2 )
{
   unsigned lane;

   for (lane = 0; lane < 4; lane++) {
      if (src0->u[lane]) {
         dst->u[lane] = src1->u[lane];
      }
      else {
         dst->u[lane] = src2->u[lane];
      }
   }
}
#endif
786
787static void
788micro_neg(
789   union tgsi_exec_channel *dst,
790   const union tgsi_exec_channel *src )
791{
792   dst->f[0] = -src->f[0];
793   dst->f[1] = -src->f[1];
794   dst->f[2] = -src->f[2];
795   dst->f[3] = -src->f[3];
796}
797
#if 0
/** Per-lane signed integer negation: dst = -src (currently disabled). */
static void
micro_ineg(
   union tgsi_exec_channel *dst,
   const union tgsi_exec_channel *src )
{
   unsigned lane;

   for (lane = 0; lane < 4; lane++) {
      dst->i[lane] = -src->i[lane];
   }
}
#endif
810
811static void
812micro_not(
813   union tgsi_exec_channel *dst,
814   const union tgsi_exec_channel *src )
815{
816   dst->u[0] = ~src->u[0];
817   dst->u[1] = ~src->u[1];
818   dst->u[2] = ~src->u[2];
819   dst->u[3] = ~src->u[3];
820}
821
822static void
823micro_or(
824   union tgsi_exec_channel *dst,
825   const union tgsi_exec_channel *src0,
826   const union tgsi_exec_channel *src1 )
827{
828   dst->u[0] = src0->u[0] | src1->u[0];
829   dst->u[1] = src0->u[1] | src1->u[1];
830   dst->u[2] = src0->u[2] | src1->u[2];
831   dst->u[3] = src0->u[3] | src1->u[3];
832}
833
834static void
835micro_pow(
836   union tgsi_exec_channel *dst,
837   const union tgsi_exec_channel *src0,
838   const union tgsi_exec_channel *src1 )
839{
840#if FAST_MATH
841   dst->f[0] = util_fast_pow( src0->f[0], src1->f[0] );
842   dst->f[1] = util_fast_pow( src0->f[1], src1->f[1] );
843   dst->f[2] = util_fast_pow( src0->f[2], src1->f[2] );
844   dst->f[3] = util_fast_pow( src0->f[3], src1->f[3] );
845#else
846   dst->f[0] = powf( src0->f[0], src1->f[0] );
847   dst->f[1] = powf( src0->f[1], src1->f[1] );
848   dst->f[2] = powf( src0->f[2], src1->f[2] );
849   dst->f[3] = powf( src0->f[3], src1->f[3] );
850#endif
851}
852
853static void
854micro_rnd(
855   union tgsi_exec_channel *dst,
856   const union tgsi_exec_channel *src )
857{
858   dst->f[0] = floorf( src->f[0] + 0.5f );
859   dst->f[1] = floorf( src->f[1] + 0.5f );
860   dst->f[2] = floorf( src->f[2] + 0.5f );
861   dst->f[3] = floorf( src->f[3] + 0.5f );
862}
863
864static void
865micro_sgn(
866   union tgsi_exec_channel *dst,
867   const union tgsi_exec_channel *src )
868{
869   dst->f[0] = src->f[0] < 0.0f ? -1.0f : src->f[0] > 0.0f ? 1.0f : 0.0f;
870   dst->f[1] = src->f[1] < 0.0f ? -1.0f : src->f[1] > 0.0f ? 1.0f : 0.0f;
871   dst->f[2] = src->f[2] < 0.0f ? -1.0f : src->f[2] > 0.0f ? 1.0f : 0.0f;
872   dst->f[3] = src->f[3] < 0.0f ? -1.0f : src->f[3] > 0.0f ? 1.0f : 0.0f;
873}
874
875static void
876micro_shl(
877   union tgsi_exec_channel *dst,
878   const union tgsi_exec_channel *src0,
879   const union tgsi_exec_channel *src1 )
880{
881   dst->i[0] = src0->i[0] << src1->i[0];
882   dst->i[1] = src0->i[1] << src1->i[1];
883   dst->i[2] = src0->i[2] << src1->i[2];
884   dst->i[3] = src0->i[3] << src1->i[3];
885}
886
887static void
888micro_ishr(
889   union tgsi_exec_channel *dst,
890   const union tgsi_exec_channel *src0,
891   const union tgsi_exec_channel *src1 )
892{
893   dst->i[0] = src0->i[0] >> src1->i[0];
894   dst->i[1] = src0->i[1] >> src1->i[1];
895   dst->i[2] = src0->i[2] >> src1->i[2];
896   dst->i[3] = src0->i[3] >> src1->i[3];
897}
898
899static void
900micro_trunc(
901   union tgsi_exec_channel *dst,
902   const union tgsi_exec_channel *src0 )
903{
904   dst->f[0] = (float) (int) src0->f[0];
905   dst->f[1] = (float) (int) src0->f[1];
906   dst->f[2] = (float) (int) src0->f[2];
907   dst->f[3] = (float) (int) src0->f[3];
908}
909
#if 0
/** Per-lane unsigned right shift: dst = src0 >> src1 (currently disabled). */
static void
micro_ushr(
   union tgsi_exec_channel *dst,
   const union tgsi_exec_channel *src0,
   const union tgsi_exec_channel *src1 )
{
   unsigned lane;

   for (lane = 0; lane < 4; lane++) {
      dst->u[lane] = src0->u[lane] >> src1->u[lane];
   }
}
#endif
923
924static void
925micro_sin(
926   union tgsi_exec_channel *dst,
927   const union tgsi_exec_channel *src )
928{
929   dst->f[0] = sinf( src->f[0] );
930   dst->f[1] = sinf( src->f[1] );
931   dst->f[2] = sinf( src->f[2] );
932   dst->f[3] = sinf( src->f[3] );
933}
934
935static void
936micro_sqrt( union tgsi_exec_channel *dst,
937            const union tgsi_exec_channel *src )
938{
939   dst->f[0] = sqrtf( src->f[0] );
940   dst->f[1] = sqrtf( src->f[1] );
941   dst->f[2] = sqrtf( src->f[2] );
942   dst->f[3] = sqrtf( src->f[3] );
943}
944
945static void
946micro_sub(
947   union tgsi_exec_channel *dst,
948   const union tgsi_exec_channel *src0,
949   const union tgsi_exec_channel *src1 )
950{
951   dst->f[0] = src0->f[0] - src1->f[0];
952   dst->f[1] = src0->f[1] - src1->f[1];
953   dst->f[2] = src0->f[2] - src1->f[2];
954   dst->f[3] = src0->f[3] - src1->f[3];
955}
956
#if 0
/** Per-lane unsigned integer to float conversion (currently disabled). */
static void
micro_u2f(
   union tgsi_exec_channel *dst,
   const union tgsi_exec_channel *src )
{
   unsigned lane;

   for (lane = 0; lane < 4; lane++) {
      dst->f[lane] = (float) src->u[lane];
   }
}
#endif
969
970static void
971micro_xor(
972   union tgsi_exec_channel *dst,
973   const union tgsi_exec_channel *src0,
974   const union tgsi_exec_channel *src1 )
975{
976   dst->u[0] = src0->u[0] ^ src1->u[0];
977   dst->u[1] = src0->u[1] ^ src1->u[1];
978   dst->u[2] = src0->u[2] ^ src1->u[2];
979   dst->u[3] = src0->u[3] ^ src1->u[3];
980}
981
982static void
983fetch_src_file_channel(
984   const struct tgsi_exec_machine *mach,
985   const uint file,
986   const uint swizzle,
987   const union tgsi_exec_channel *index,
988   union tgsi_exec_channel *chan )
989{
990   switch( swizzle ) {
991   case TGSI_EXTSWIZZLE_X:
992   case TGSI_EXTSWIZZLE_Y:
993   case TGSI_EXTSWIZZLE_Z:
994   case TGSI_EXTSWIZZLE_W:
995      switch( file ) {
996      case TGSI_FILE_CONSTANT:
997         assert(mach->Consts);
998         if (index->i[0] < 0)
999            chan->f[0] = 0.0f;
1000         else
1001            chan->f[0] = mach->Consts[index->i[0]][swizzle];
1002         if (index->i[1] < 0)
1003            chan->f[1] = 0.0f;
1004         else
1005            chan->f[1] = mach->Consts[index->i[1]][swizzle];
1006         if (index->i[2] < 0)
1007            chan->f[2] = 0.0f;
1008         else
1009            chan->f[2] = mach->Consts[index->i[2]][swizzle];
1010         if (index->i[3] < 0)
1011            chan->f[3] = 0.0f;
1012         else
1013            chan->f[3] = mach->Consts[index->i[3]][swizzle];
1014         break;
1015
1016      case TGSI_FILE_INPUT:
1017         chan->u[0] = mach->Inputs[index->i[0]].xyzw[swizzle].u[0];
1018         chan->u[1] = mach->Inputs[index->i[1]].xyzw[swizzle].u[1];
1019         chan->u[2] = mach->Inputs[index->i[2]].xyzw[swizzle].u[2];
1020         chan->u[3] = mach->Inputs[index->i[3]].xyzw[swizzle].u[3];
1021         break;
1022
1023      case TGSI_FILE_TEMPORARY:
1024         assert(index->i[0] < TGSI_EXEC_NUM_TEMPS);
1025         chan->u[0] = mach->Temps[index->i[0]].xyzw[swizzle].u[0];
1026         chan->u[1] = mach->Temps[index->i[1]].xyzw[swizzle].u[1];
1027         chan->u[2] = mach->Temps[index->i[2]].xyzw[swizzle].u[2];
1028         chan->u[3] = mach->Temps[index->i[3]].xyzw[swizzle].u[3];
1029         break;
1030
1031      case TGSI_FILE_IMMEDIATE:
1032         assert( index->i[0] < (int) mach->ImmLimit );
1033         chan->f[0] = mach->Imms[index->i[0]][swizzle];
1034         assert( index->i[1] < (int) mach->ImmLimit );
1035         chan->f[1] = mach->Imms[index->i[1]][swizzle];
1036         assert( index->i[2] < (int) mach->ImmLimit );
1037         chan->f[2] = mach->Imms[index->i[2]][swizzle];
1038         assert( index->i[3] < (int) mach->ImmLimit );
1039         chan->f[3] = mach->Imms[index->i[3]][swizzle];
1040         break;
1041
1042      case TGSI_FILE_ADDRESS:
1043         chan->u[0] = mach->Addrs[index->i[0]].xyzw[swizzle].u[0];
1044         chan->u[1] = mach->Addrs[index->i[1]].xyzw[swizzle].u[1];
1045         chan->u[2] = mach->Addrs[index->i[2]].xyzw[swizzle].u[2];
1046         chan->u[3] = mach->Addrs[index->i[3]].xyzw[swizzle].u[3];
1047         break;
1048
1049      case TGSI_FILE_OUTPUT:
1050         /* vertex/fragment output vars can be read too */
1051         chan->u[0] = mach->Outputs[index->i[0]].xyzw[swizzle].u[0];
1052         chan->u[1] = mach->Outputs[index->i[1]].xyzw[swizzle].u[1];
1053         chan->u[2] = mach->Outputs[index->i[2]].xyzw[swizzle].u[2];
1054         chan->u[3] = mach->Outputs[index->i[3]].xyzw[swizzle].u[3];
1055         break;
1056
1057      default:
1058         assert( 0 );
1059      }
1060      break;
1061
1062   case TGSI_EXTSWIZZLE_ZERO:
1063      *chan = mach->Temps[TEMP_0_I].xyzw[TEMP_0_C];
1064      break;
1065
1066   case TGSI_EXTSWIZZLE_ONE:
1067      *chan = mach->Temps[TEMP_1_I].xyzw[TEMP_1_C];
1068      break;
1069
1070   default:
1071      assert( 0 );
1072   }
1073}
1074
1075static void
1076fetch_source(
1077   const struct tgsi_exec_machine *mach,
1078   union tgsi_exec_channel *chan,
1079   const struct tgsi_full_src_register *reg,
1080   const uint chan_index )
1081{
1082   union tgsi_exec_channel index;
1083   uint swizzle;
1084
1085   /* We start with a direct index into a register file.
1086    *
1087    *    file[1],
1088    *    where:
1089    *       file = SrcRegister.File
1090    *       [1] = SrcRegister.Index
1091    */
1092   index.i[0] =
1093   index.i[1] =
1094   index.i[2] =
1095   index.i[3] = reg->SrcRegister.Index;
1096
1097   /* There is an extra source register that indirectly subscripts
1098    * a register file. The direct index now becomes an offset
1099    * that is being added to the indirect register.
1100    *
1101    *    file[ind[2].x+1],
1102    *    where:
1103    *       ind = SrcRegisterInd.File
1104    *       [2] = SrcRegisterInd.Index
1105    *       .x = SrcRegisterInd.SwizzleX
1106    */
1107   if (reg->SrcRegister.Indirect) {
1108      union tgsi_exec_channel index2;
1109      union tgsi_exec_channel indir_index;
1110      const uint execmask = mach->ExecMask;
1111      uint i;
1112
1113      /* which address register (always zero now) */
1114      index2.i[0] =
1115      index2.i[1] =
1116      index2.i[2] =
1117      index2.i[3] = reg->SrcRegisterInd.Index;
1118
1119      /* get current value of address register[swizzle] */
1120      swizzle = tgsi_util_get_src_register_swizzle( &reg->SrcRegisterInd, CHAN_X );
1121      fetch_src_file_channel(
1122         mach,
1123         reg->SrcRegisterInd.File,
1124         swizzle,
1125         &index2,
1126         &indir_index );
1127
1128      /* add value of address register to the offset */
1129      index.i[0] += (int) indir_index.f[0];
1130      index.i[1] += (int) indir_index.f[1];
1131      index.i[2] += (int) indir_index.f[2];
1132      index.i[3] += (int) indir_index.f[3];
1133
1134      /* for disabled execution channels, zero-out the index to
1135       * avoid using a potential garbage value.
1136       */
1137      for (i = 0; i < QUAD_SIZE; i++) {
1138         if ((execmask & (1 << i)) == 0)
1139            index.i[i] = 0;
1140      }
1141   }
1142
1143   /* There is an extra source register that is a second
1144    * subscript to a register file. Effectively it means that
1145    * the register file is actually a 2D array of registers.
1146    *
1147    *    file[1][3] == file[1*sizeof(file[1])+3],
1148    *    where:
1149    *       [3] = SrcRegisterDim.Index
1150    */
1151   if (reg->SrcRegister.Dimension) {
1152      /* The size of the first-order array depends on the register file type.
1153       * We need to multiply the index to the first array to get an effective,
1154       * "flat" index that points to the beginning of the second-order array.
1155       */
1156      switch (reg->SrcRegister.File) {
1157      case TGSI_FILE_INPUT:
1158         index.i[0] *= TGSI_EXEC_MAX_INPUT_ATTRIBS;
1159         index.i[1] *= TGSI_EXEC_MAX_INPUT_ATTRIBS;
1160         index.i[2] *= TGSI_EXEC_MAX_INPUT_ATTRIBS;
1161         index.i[3] *= TGSI_EXEC_MAX_INPUT_ATTRIBS;
1162         break;
1163      case TGSI_FILE_CONSTANT:
1164         index.i[0] *= TGSI_EXEC_MAX_CONST_BUFFER;
1165         index.i[1] *= TGSI_EXEC_MAX_CONST_BUFFER;
1166         index.i[2] *= TGSI_EXEC_MAX_CONST_BUFFER;
1167         index.i[3] *= TGSI_EXEC_MAX_CONST_BUFFER;
1168         break;
1169      default:
1170         assert( 0 );
1171      }
1172
1173      index.i[0] += reg->SrcRegisterDim.Index;
1174      index.i[1] += reg->SrcRegisterDim.Index;
1175      index.i[2] += reg->SrcRegisterDim.Index;
1176      index.i[3] += reg->SrcRegisterDim.Index;
1177
1178      /* Again, the second subscript index can be addressed indirectly
1179       * identically to the first one.
1180       * Nothing stops us from indirectly addressing the indirect register,
1181       * but there is no need for that, so we won't exercise it.
1182       *
1183       *    file[1][ind[4].y+3],
1184       *    where:
1185       *       ind = SrcRegisterDimInd.File
1186       *       [4] = SrcRegisterDimInd.Index
1187       *       .y = SrcRegisterDimInd.SwizzleX
1188       */
1189      if (reg->SrcRegisterDim.Indirect) {
1190         union tgsi_exec_channel index2;
1191         union tgsi_exec_channel indir_index;
1192         const uint execmask = mach->ExecMask;
1193         uint i;
1194
1195         index2.i[0] =
1196         index2.i[1] =
1197         index2.i[2] =
1198         index2.i[3] = reg->SrcRegisterDimInd.Index;
1199
1200         swizzle = tgsi_util_get_src_register_swizzle( &reg->SrcRegisterDimInd, CHAN_X );
1201         fetch_src_file_channel(
1202            mach,
1203            reg->SrcRegisterDimInd.File,
1204            swizzle,
1205            &index2,
1206            &indir_index );
1207
1208         index.i[0] += (int) indir_index.f[0];
1209         index.i[1] += (int) indir_index.f[1];
1210         index.i[2] += (int) indir_index.f[2];
1211         index.i[3] += (int) indir_index.f[3];
1212
1213         /* for disabled execution channels, zero-out the index to
1214          * avoid using a potential garbage value.
1215          */
1216         for (i = 0; i < QUAD_SIZE; i++) {
1217            if ((execmask & (1 << i)) == 0)
1218               index.i[i] = 0;
1219         }
1220      }
1221
1222      /* If by any chance there was a need for a 3D array of register
1223       * files, we would have to check whether SrcRegisterDim is followed
1224       * by a dimension register and continue the saga.
1225       */
1226   }
1227
1228   swizzle = tgsi_util_get_full_src_register_extswizzle( reg, chan_index );
1229   fetch_src_file_channel(
1230      mach,
1231      reg->SrcRegister.File,
1232      swizzle,
1233      &index,
1234      chan );
1235
1236   switch (tgsi_util_get_full_src_register_sign_mode( reg, chan_index )) {
1237   case TGSI_UTIL_SIGN_CLEAR:
1238      micro_abs( chan, chan );
1239      break;
1240
1241   case TGSI_UTIL_SIGN_SET:
1242      micro_abs( chan, chan );
1243      micro_neg( chan, chan );
1244      break;
1245
1246   case TGSI_UTIL_SIGN_TOGGLE:
1247      micro_neg( chan, chan );
1248      break;
1249
1250   case TGSI_UTIL_SIGN_KEEP:
1251      break;
1252   }
1253
1254   if (reg->SrcRegisterExtMod.Complement) {
1255      micro_sub( chan, &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], chan );
1256   }
1257}
1258
1259static void
1260store_dest(
1261   struct tgsi_exec_machine *mach,
1262   const union tgsi_exec_channel *chan,
1263   const struct tgsi_full_dst_register *reg,
1264   const struct tgsi_full_instruction *inst,
1265   uint chan_index )
1266{
1267   uint i;
1268   union tgsi_exec_channel null;
1269   union tgsi_exec_channel *dst;
1270   uint execmask = mach->ExecMask;
1271
1272   switch (reg->DstRegister.File) {
1273   case TGSI_FILE_NULL:
1274      dst = &null;
1275      break;
1276
1277   case TGSI_FILE_OUTPUT:
1278      dst = &mach->Outputs[mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0]
1279                           + reg->DstRegister.Index].xyzw[chan_index];
1280      break;
1281
1282   case TGSI_FILE_TEMPORARY:
1283      assert( reg->DstRegister.Index < TGSI_EXEC_NUM_TEMPS );
1284      dst = &mach->Temps[reg->DstRegister.Index].xyzw[chan_index];
1285      break;
1286
1287   case TGSI_FILE_ADDRESS:
1288      dst = &mach->Addrs[reg->DstRegister.Index].xyzw[chan_index];
1289      break;
1290
1291   default:
1292      assert( 0 );
1293      return;
1294   }
1295
1296   if (inst->InstructionExtNv.CondFlowEnable) {
1297      union tgsi_exec_channel *cc = &mach->Temps[TEMP_CC_I].xyzw[TEMP_CC_C];
1298      uint swizzle;
1299      uint shift;
1300      uint mask;
1301      uint test;
1302
1303      /* Only CC0 supported.
1304       */
1305      assert( inst->InstructionExtNv.CondFlowIndex < 1 );
1306
1307      switch (chan_index) {
1308      case CHAN_X:
1309         swizzle = inst->InstructionExtNv.CondSwizzleX;
1310         break;
1311      case CHAN_Y:
1312         swizzle = inst->InstructionExtNv.CondSwizzleY;
1313         break;
1314      case CHAN_Z:
1315         swizzle = inst->InstructionExtNv.CondSwizzleZ;
1316         break;
1317      case CHAN_W:
1318         swizzle = inst->InstructionExtNv.CondSwizzleW;
1319         break;
1320      default:
1321         assert( 0 );
1322         return;
1323      }
1324
1325      switch (swizzle) {
1326      case TGSI_SWIZZLE_X:
1327         shift = TGSI_EXEC_CC_X_SHIFT;
1328         mask = TGSI_EXEC_CC_X_MASK;
1329         break;
1330      case TGSI_SWIZZLE_Y:
1331         shift = TGSI_EXEC_CC_Y_SHIFT;
1332         mask = TGSI_EXEC_CC_Y_MASK;
1333         break;
1334      case TGSI_SWIZZLE_Z:
1335         shift = TGSI_EXEC_CC_Z_SHIFT;
1336         mask = TGSI_EXEC_CC_Z_MASK;
1337         break;
1338      case TGSI_SWIZZLE_W:
1339         shift = TGSI_EXEC_CC_W_SHIFT;
1340         mask = TGSI_EXEC_CC_W_MASK;
1341         break;
1342      default:
1343         assert( 0 );
1344         return;
1345      }
1346
1347      switch (inst->InstructionExtNv.CondMask) {
1348      case TGSI_CC_GT:
1349         test = ~(TGSI_EXEC_CC_GT << shift) & mask;
1350         for (i = 0; i < QUAD_SIZE; i++)
1351            if (cc->u[i] & test)
1352               execmask &= ~(1 << i);
1353         break;
1354
1355      case TGSI_CC_EQ:
1356         test = ~(TGSI_EXEC_CC_EQ << shift) & mask;
1357         for (i = 0; i < QUAD_SIZE; i++)
1358            if (cc->u[i] & test)
1359               execmask &= ~(1 << i);
1360         break;
1361
1362      case TGSI_CC_LT:
1363         test = ~(TGSI_EXEC_CC_LT << shift) & mask;
1364         for (i = 0; i < QUAD_SIZE; i++)
1365            if (cc->u[i] & test)
1366               execmask &= ~(1 << i);
1367         break;
1368
1369      case TGSI_CC_GE:
1370         test = ~((TGSI_EXEC_CC_GT | TGSI_EXEC_CC_EQ) << shift) & mask;
1371         for (i = 0; i < QUAD_SIZE; i++)
1372            if (cc->u[i] & test)
1373               execmask &= ~(1 << i);
1374         break;
1375
1376      case TGSI_CC_LE:
1377         test = ~((TGSI_EXEC_CC_LT | TGSI_EXEC_CC_EQ) << shift) & mask;
1378         for (i = 0; i < QUAD_SIZE; i++)
1379            if (cc->u[i] & test)
1380               execmask &= ~(1 << i);
1381         break;
1382
1383      case TGSI_CC_NE:
1384         test = ~((TGSI_EXEC_CC_GT | TGSI_EXEC_CC_LT | TGSI_EXEC_CC_UN) << shift) & mask;
1385         for (i = 0; i < QUAD_SIZE; i++)
1386            if (cc->u[i] & test)
1387               execmask &= ~(1 << i);
1388         break;
1389
1390      case TGSI_CC_TR:
1391         break;
1392
1393      case TGSI_CC_FL:
1394         for (i = 0; i < QUAD_SIZE; i++)
1395            execmask &= ~(1 << i);
1396         break;
1397
1398      default:
1399         assert( 0 );
1400         return;
1401      }
1402   }
1403
1404   switch (inst->Instruction.Saturate) {
1405   case TGSI_SAT_NONE:
1406      for (i = 0; i < QUAD_SIZE; i++)
1407         if (execmask & (1 << i))
1408            dst->i[i] = chan->i[i];
1409      break;
1410
1411   case TGSI_SAT_ZERO_ONE:
1412      for (i = 0; i < QUAD_SIZE; i++)
1413         if (execmask & (1 << i)) {
1414            if (chan->f[i] < 0.0f)
1415               dst->f[i] = 0.0f;
1416            else if (chan->f[i] > 1.0f)
1417               dst->f[i] = 1.0f;
1418            else
1419               dst->i[i] = chan->i[i];
1420         }
1421      break;
1422
1423   case TGSI_SAT_MINUS_PLUS_ONE:
1424      for (i = 0; i < QUAD_SIZE; i++)
1425         if (execmask & (1 << i)) {
1426            if (chan->f[i] < -1.0f)
1427               dst->f[i] = -1.0f;
1428            else if (chan->f[i] > 1.0f)
1429               dst->f[i] = 1.0f;
1430            else
1431               dst->i[i] = chan->i[i];
1432         }
1433      break;
1434
1435   default:
1436      assert( 0 );
1437   }
1438
1439   if (inst->InstructionExtNv.CondDstUpdate) {
1440      union tgsi_exec_channel *cc = &mach->Temps[TEMP_CC_I].xyzw[TEMP_CC_C];
1441      uint shift;
1442      uint mask;
1443
1444      /* Only CC0 supported.
1445       */
1446      assert( inst->InstructionExtNv.CondDstIndex < 1 );
1447
1448      switch (chan_index) {
1449      case CHAN_X:
1450         shift = TGSI_EXEC_CC_X_SHIFT;
1451         mask = ~TGSI_EXEC_CC_X_MASK;
1452         break;
1453      case CHAN_Y:
1454         shift = TGSI_EXEC_CC_Y_SHIFT;
1455         mask = ~TGSI_EXEC_CC_Y_MASK;
1456         break;
1457      case CHAN_Z:
1458         shift = TGSI_EXEC_CC_Z_SHIFT;
1459         mask = ~TGSI_EXEC_CC_Z_MASK;
1460         break;
1461      case CHAN_W:
1462         shift = TGSI_EXEC_CC_W_SHIFT;
1463         mask = ~TGSI_EXEC_CC_W_MASK;
1464         break;
1465      default:
1466         assert( 0 );
1467         return;
1468      }
1469
1470      for (i = 0; i < QUAD_SIZE; i++)
1471         if (execmask & (1 << i)) {
1472            cc->u[i] &= mask;
1473            if (dst->f[i] < 0.0f)
1474               cc->u[i] |= TGSI_EXEC_CC_LT << shift;
1475            else if (dst->f[i] > 0.0f)
1476               cc->u[i] |= TGSI_EXEC_CC_GT << shift;
1477            else if (dst->f[i] == 0.0f)
1478               cc->u[i] |= TGSI_EXEC_CC_EQ << shift;
1479            else
1480               cc->u[i] |= TGSI_EXEC_CC_UN << shift;
1481         }
1482   }
1483}
1484
/* Fetch channel CHAN of source operand number INDEX of the current
 * instruction into VAL.  Relies on variables named 'mach' and 'inst'
 * being in scope where the macro is used.
 */
#define FETCH(VAL,INDEX,CHAN)\
    fetch_source (mach, VAL, &inst->FullSrcRegisters[INDEX], CHAN)

/* Store VAL into channel CHAN of destination operand number INDEX of the
 * current instruction.  Relies on variables named 'mach' and 'inst'
 * being in scope where the macro is used.
 */
#define STORE(VAL,INDEX,CHAN)\
    store_dest (mach, VAL, &inst->FullDstRegisters[INDEX], inst, CHAN )
1490
1491
1492/**
1493 * Execute ARB-style KIL which is predicated by a src register.
1494 * Kill fragment if any of the four values is less than zero.
1495 */
1496static void
1497exec_kil(struct tgsi_exec_machine *mach,
1498         const struct tgsi_full_instruction *inst)
1499{
1500   uint uniquemask;
1501   uint chan_index;
1502   uint kilmask = 0; /* bit 0 = pixel 0, bit 1 = pixel 1, etc */
1503   union tgsi_exec_channel r[1];
1504
1505   /* This mask stores component bits that were already tested. Note that
1506    * we test if the value is less than zero, so 1.0 and 0.0 need not to be
1507    * tested. */
1508   uniquemask = (1 << TGSI_EXTSWIZZLE_ZERO) | (1 << TGSI_EXTSWIZZLE_ONE);
1509
1510   for (chan_index = 0; chan_index < 4; chan_index++)
1511   {
1512      uint swizzle;
1513      uint i;
1514
1515      /* unswizzle channel */
1516      swizzle = tgsi_util_get_full_src_register_extswizzle (
1517                        &inst->FullSrcRegisters[0],
1518                        chan_index);
1519
1520      /* check if the component has not been already tested */
1521      if (uniquemask & (1 << swizzle))
1522         continue;
1523      uniquemask |= 1 << swizzle;
1524
1525      FETCH(&r[0], 0, chan_index);
1526      for (i = 0; i < 4; i++)
1527         if (r[0].f[i] < 0.0f)
1528            kilmask |= 1 << i;
1529   }
1530
1531   mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] |= kilmask;
1532}
1533
1534/**
1535 * Execute NVIDIA-style KIL which is predicated by a condition code.
1536 * Kill fragment if the condition code is TRUE.
1537 */
1538static void
1539exec_kilp(struct tgsi_exec_machine *mach,
1540          const struct tgsi_full_instruction *inst)
1541{
1542   uint kilmask; /* bit 0 = pixel 0, bit 1 = pixel 1, etc */
1543
1544   if (inst->InstructionExtNv.CondFlowEnable) {
1545      uint swizzle[4];
1546      uint chan_index;
1547
1548      kilmask = 0x0;
1549
1550      swizzle[0] = inst->InstructionExtNv.CondSwizzleX;
1551      swizzle[1] = inst->InstructionExtNv.CondSwizzleY;
1552      swizzle[2] = inst->InstructionExtNv.CondSwizzleZ;
1553      swizzle[3] = inst->InstructionExtNv.CondSwizzleW;
1554
1555      for (chan_index = 0; chan_index < 4; chan_index++)
1556      {
1557         uint i;
1558
1559         for (i = 0; i < 4; i++) {
1560            /* TODO: evaluate the condition code */
1561            if (0)
1562               kilmask |= 1 << i;
1563         }
1564      }
1565   }
1566   else {
1567      /* "unconditional" kil */
1568      kilmask = mach->ExecMask;
1569   }
1570   mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] |= kilmask;
1571}
1572
1573
1574/*
1575 * Fetch a four texture samples using STR texture coordinates.
1576 */
1577static void
1578fetch_texel( struct tgsi_sampler *sampler,
1579             const union tgsi_exec_channel *s,
1580             const union tgsi_exec_channel *t,
1581             const union tgsi_exec_channel *p,
1582             float lodbias,  /* XXX should be float[4] */
1583             union tgsi_exec_channel *r,
1584             union tgsi_exec_channel *g,
1585             union tgsi_exec_channel *b,
1586             union tgsi_exec_channel *a )
1587{
1588   uint j;
1589   float rgba[NUM_CHANNELS][QUAD_SIZE];
1590
1591   sampler->get_samples(sampler, s->f, t->f, p->f, lodbias, rgba);
1592
1593   for (j = 0; j < 4; j++) {
1594      r->f[j] = rgba[0][j];
1595      g->f[j] = rgba[1][j];
1596      b->f[j] = rgba[2][j];
1597      a->f[j] = rgba[3][j];
1598   }
1599}
1600
1601
1602static void
1603exec_tex(struct tgsi_exec_machine *mach,
1604         const struct tgsi_full_instruction *inst,
1605         boolean biasLod,
1606         boolean projected)
1607{
1608   const uint unit = inst->FullSrcRegisters[1].SrcRegister.Index;
1609   union tgsi_exec_channel r[4];
1610   uint chan_index;
1611   float lodBias;
1612
1613   /*   debug_printf("Sampler %u unit %u\n", sampler, unit); */
1614
1615   switch (inst->InstructionExtTexture.Texture) {
1616   case TGSI_TEXTURE_1D:
1617
1618      FETCH(&r[0], 0, CHAN_X);
1619
1620      if (projected) {
1621         FETCH(&r[1], 0, CHAN_W);
1622         micro_div( &r[0], &r[0], &r[1] );
1623      }
1624
1625      if (biasLod) {
1626         FETCH(&r[1], 0, CHAN_W);
1627         lodBias = r[2].f[0];
1628      }
1629      else
1630         lodBias = 0.0;
1631
1632      fetch_texel(mach->Samplers[unit],
1633                  &r[0], NULL, NULL, lodBias,  /* S, T, P, BIAS */
1634                  &r[0], &r[1], &r[2], &r[3]); /* R, G, B, A */
1635      break;
1636
1637   case TGSI_TEXTURE_2D:
1638   case TGSI_TEXTURE_RECT:
1639
1640      FETCH(&r[0], 0, CHAN_X);
1641      FETCH(&r[1], 0, CHAN_Y);
1642      FETCH(&r[2], 0, CHAN_Z);
1643
1644      if (projected) {
1645         FETCH(&r[3], 0, CHAN_W);
1646         micro_div( &r[0], &r[0], &r[3] );
1647         micro_div( &r[1], &r[1], &r[3] );
1648         micro_div( &r[2], &r[2], &r[3] );
1649      }
1650
1651      if (biasLod) {
1652         FETCH(&r[3], 0, CHAN_W);
1653         lodBias = r[3].f[0];
1654      }
1655      else
1656         lodBias = 0.0;
1657
1658      fetch_texel(mach->Samplers[unit],
1659                  &r[0], &r[1], &r[2], lodBias,  /* inputs */
1660                  &r[0], &r[1], &r[2], &r[3]);  /* outputs */
1661      break;
1662
1663   case TGSI_TEXTURE_3D:
1664   case TGSI_TEXTURE_CUBE:
1665
1666      FETCH(&r[0], 0, CHAN_X);
1667      FETCH(&r[1], 0, CHAN_Y);
1668      FETCH(&r[2], 0, CHAN_Z);
1669
1670      if (projected) {
1671         FETCH(&r[3], 0, CHAN_W);
1672         micro_div( &r[0], &r[0], &r[3] );
1673         micro_div( &r[1], &r[1], &r[3] );
1674         micro_div( &r[2], &r[2], &r[3] );
1675      }
1676
1677      if (biasLod) {
1678         FETCH(&r[3], 0, CHAN_W);
1679         lodBias = r[3].f[0];
1680      }
1681      else
1682         lodBias = 0.0;
1683
1684      fetch_texel(mach->Samplers[unit],
1685                  &r[0], &r[1], &r[2], lodBias,
1686                  &r[0], &r[1], &r[2], &r[3]);
1687      break;
1688
1689   default:
1690      assert (0);
1691   }
1692
1693   FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
1694      STORE( &r[chan_index], 0, chan_index );
1695   }
1696}
1697
1698
1699/**
1700 * Evaluate a constant-valued coefficient at the position of the
1701 * current quad.
1702 */
1703static void
1704eval_constant_coef(
1705   struct tgsi_exec_machine *mach,
1706   unsigned attrib,
1707   unsigned chan )
1708{
1709   unsigned i;
1710
1711   for( i = 0; i < QUAD_SIZE; i++ ) {
1712      mach->Inputs[attrib].xyzw[chan].f[i] = mach->InterpCoefs[attrib].a0[chan];
1713   }
1714}
1715
1716/**
1717 * Evaluate a linear-valued coefficient at the position of the
1718 * current quad.
1719 */
1720static void
1721eval_linear_coef(
1722   struct tgsi_exec_machine *mach,
1723   unsigned attrib,
1724   unsigned chan )
1725{
1726   const float x = mach->QuadPos.xyzw[0].f[0];
1727   const float y = mach->QuadPos.xyzw[1].f[0];
1728   const float dadx = mach->InterpCoefs[attrib].dadx[chan];
1729   const float dady = mach->InterpCoefs[attrib].dady[chan];
1730   const float a0 = mach->InterpCoefs[attrib].a0[chan] + dadx * x + dady * y;
1731   mach->Inputs[attrib].xyzw[chan].f[0] = a0;
1732   mach->Inputs[attrib].xyzw[chan].f[1] = a0 + dadx;
1733   mach->Inputs[attrib].xyzw[chan].f[2] = a0 + dady;
1734   mach->Inputs[attrib].xyzw[chan].f[3] = a0 + dadx + dady;
1735}
1736
1737/**
1738 * Evaluate a perspective-valued coefficient at the position of the
1739 * current quad.
1740 */
1741static void
1742eval_perspective_coef(
1743   struct tgsi_exec_machine *mach,
1744   unsigned attrib,
1745   unsigned chan )
1746{
1747   const float x = mach->QuadPos.xyzw[0].f[0];
1748   const float y = mach->QuadPos.xyzw[1].f[0];
1749   const float dadx = mach->InterpCoefs[attrib].dadx[chan];
1750   const float dady = mach->InterpCoefs[attrib].dady[chan];
1751   const float a0 = mach->InterpCoefs[attrib].a0[chan] + dadx * x + dady * y;
1752   const float *w = mach->QuadPos.xyzw[3].f;
1753   /* divide by W here */
1754   mach->Inputs[attrib].xyzw[chan].f[0] = a0 / w[0];
1755   mach->Inputs[attrib].xyzw[chan].f[1] = (a0 + dadx) / w[1];
1756   mach->Inputs[attrib].xyzw[chan].f[2] = (a0 + dady) / w[2];
1757   mach->Inputs[attrib].xyzw[chan].f[3] = (a0 + dadx + dady) / w[3];
1758}
1759
1760
/**
 * Signature shared by eval_constant_coef(), eval_linear_coef() and
 * eval_perspective_coef(): evaluate channel 'chan' of input attribute
 * 'attrib' for the current quad and write the result to mach->Inputs.
 */
typedef void (* eval_coef_func)(
   struct tgsi_exec_machine *mach,
   unsigned attrib,
   unsigned chan );
1765
1766static void
1767exec_declaration(
1768   struct tgsi_exec_machine *mach,
1769   const struct tgsi_full_declaration *decl )
1770{
1771   if( mach->Processor == TGSI_PROCESSOR_FRAGMENT ) {
1772      if( decl->Declaration.File == TGSI_FILE_INPUT ) {
1773         unsigned first, last, mask;
1774         eval_coef_func eval;
1775
1776         first = decl->DeclarationRange.First;
1777         last = decl->DeclarationRange.Last;
1778         mask = decl->Declaration.UsageMask;
1779
1780         switch( decl->Declaration.Interpolate ) {
1781         case TGSI_INTERPOLATE_CONSTANT:
1782            eval = eval_constant_coef;
1783            break;
1784
1785         case TGSI_INTERPOLATE_LINEAR:
1786            eval = eval_linear_coef;
1787            break;
1788
1789         case TGSI_INTERPOLATE_PERSPECTIVE:
1790            eval = eval_perspective_coef;
1791            break;
1792
1793         default:
1794            eval = NULL;
1795            assert( 0 );
1796         }
1797
1798         if( mask == TGSI_WRITEMASK_XYZW ) {
1799            unsigned i, j;
1800
1801            for( i = first; i <= last; i++ ) {
1802               for( j = 0; j < NUM_CHANNELS; j++ ) {
1803                  eval( mach, i, j );
1804               }
1805            }
1806         }
1807         else {
1808            unsigned i, j;
1809
1810            for( j = 0; j < NUM_CHANNELS; j++ ) {
1811               if( mask & (1 << j) ) {
1812                  for( i = first; i <= last; i++ ) {
1813                     eval( mach, i, j );
1814                  }
1815               }
1816            }
1817         }
1818      }
1819   }
1820}
1821
1822static void
1823exec_instruction(
1824   struct tgsi_exec_machine *mach,
1825   const struct tgsi_full_instruction *inst,
1826   int *pc )
1827{
1828   uint chan_index;
1829   union tgsi_exec_channel r[8];
1830
1831   (*pc)++;
1832
1833   switch (inst->Instruction.Opcode) {
1834   case TGSI_OPCODE_ARL:
1835      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
1836         FETCH( &r[0], 0, chan_index );
1837         micro_flr( &r[0], &r[0] );
1838         STORE( &r[0], 0, chan_index );
1839      }
1840      break;
1841
1842   case TGSI_OPCODE_MOV:
1843   case TGSI_OPCODE_SWZ:
1844      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
1845         FETCH( &r[0], 0, chan_index );
1846         STORE( &r[0], 0, chan_index );
1847      }
1848      break;
1849
1850   case TGSI_OPCODE_LIT:
1851      if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) {
1852	 STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_X );
1853      }
1854
1855      if (IS_CHANNEL_ENABLED( *inst, CHAN_Y ) || IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1856	 FETCH( &r[0], 0, CHAN_X );
1857	 if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1858	    micro_max( &r[0], &r[0], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] );
1859	    STORE( &r[0], 0, CHAN_Y );
1860	 }
1861
1862	 if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1863	    FETCH( &r[1], 0, CHAN_Y );
1864	    micro_max( &r[1], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] );
1865
1866	    FETCH( &r[2], 0, CHAN_W );
1867	    micro_min( &r[2], &r[2], &mach->Temps[TEMP_128_I].xyzw[TEMP_128_C] );
1868	    micro_max( &r[2], &r[2], &mach->Temps[TEMP_M128_I].xyzw[TEMP_M128_C] );
1869	    micro_pow( &r[1], &r[1], &r[2] );
1870	    micro_lt( &r[0], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &r[0], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] );
1871	    STORE( &r[0], 0, CHAN_Z );
1872	 }
1873      }
1874
1875      if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
1876	 STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
1877      }
1878      break;
1879
1880   case TGSI_OPCODE_RCP:
1881   /* TGSI_OPCODE_RECIP */
1882      FETCH( &r[0], 0, CHAN_X );
1883      micro_div( &r[0], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &r[0] );
1884      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
1885	 STORE( &r[0], 0, chan_index );
1886      }
1887      break;
1888
1889   case TGSI_OPCODE_RSQ:
1890   /* TGSI_OPCODE_RECIPSQRT */
1891      FETCH( &r[0], 0, CHAN_X );
1892      micro_sqrt( &r[0], &r[0] );
1893      micro_div( &r[0], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &r[0] );
1894      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
1895	 STORE( &r[0], 0, chan_index );
1896      }
1897      break;
1898
1899   case TGSI_OPCODE_EXP:
1900      FETCH( &r[0], 0, CHAN_X );
1901      micro_flr( &r[1], &r[0] );  /* r1 = floor(r0) */
1902      if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) {
1903         micro_exp2( &r[2], &r[1] );       /* r2 = 2 ^ r1 */
1904         STORE( &r[2], 0, CHAN_X );        /* store r2 */
1905      }
1906      if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1907         micro_sub( &r[2], &r[0], &r[1] ); /* r2 = r0 - r1 */
1908         STORE( &r[2], 0, CHAN_Y );        /* store r2 */
1909      }
1910      if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1911         micro_exp2( &r[2], &r[0] );       /* r2 = 2 ^ r0 */
1912         STORE( &r[2], 0, CHAN_Z );        /* store r2 */
1913      }
1914      if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
1915         STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
1916      }
1917      break;
1918
1919   case TGSI_OPCODE_LOG:
1920      FETCH( &r[0], 0, CHAN_X );
1921      micro_abs( &r[2], &r[0] );  /* r2 = abs(r0) */
1922      micro_lg2( &r[1], &r[2] );  /* r1 = lg2(r2) */
1923      micro_flr( &r[0], &r[1] );  /* r0 = floor(r1) */
1924      if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) {
1925         STORE( &r[0], 0, CHAN_X );
1926      }
1927      if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1928         micro_exp2( &r[0], &r[0] );       /* r0 = 2 ^ r0 */
1929         micro_div( &r[0], &r[2], &r[0] ); /* r0 = r2 / r0 */
1930         STORE( &r[0], 0, CHAN_Y );
1931      }
1932      if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1933         STORE( &r[1], 0, CHAN_Z );
1934      }
1935      if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
1936         STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
1937      }
1938      break;
1939
1940   case TGSI_OPCODE_MUL:
1941      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index )
1942      {
1943         FETCH(&r[0], 0, chan_index);
1944         FETCH(&r[1], 1, chan_index);
1945
1946         micro_mul( &r[0], &r[0], &r[1] );
1947
1948         STORE(&r[0], 0, chan_index);
1949      }
1950      break;
1951
1952   case TGSI_OPCODE_ADD:
1953      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
1954         FETCH( &r[0], 0, chan_index );
1955         FETCH( &r[1], 1, chan_index );
1956         micro_add( &r[0], &r[0], &r[1] );
1957         STORE( &r[0], 0, chan_index );
1958      }
1959      break;
1960
1961   case TGSI_OPCODE_DP3:
1962   /* TGSI_OPCODE_DOT3 */
1963      FETCH( &r[0], 0, CHAN_X );
1964      FETCH( &r[1], 1, CHAN_X );
1965      micro_mul( &r[0], &r[0], &r[1] );
1966
1967      FETCH( &r[1], 0, CHAN_Y );
1968      FETCH( &r[2], 1, CHAN_Y );
1969      micro_mul( &r[1], &r[1], &r[2] );
1970      micro_add( &r[0], &r[0], &r[1] );
1971
1972      FETCH( &r[1], 0, CHAN_Z );
1973      FETCH( &r[2], 1, CHAN_Z );
1974      micro_mul( &r[1], &r[1], &r[2] );
1975      micro_add( &r[0], &r[0], &r[1] );
1976
1977      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
1978         STORE( &r[0], 0, chan_index );
1979      }
1980      break;
1981
1982    case TGSI_OPCODE_DP4:
1983    /* TGSI_OPCODE_DOT4 */
1984       FETCH(&r[0], 0, CHAN_X);
1985       FETCH(&r[1], 1, CHAN_X);
1986
1987       micro_mul( &r[0], &r[0], &r[1] );
1988
1989       FETCH(&r[1], 0, CHAN_Y);
1990       FETCH(&r[2], 1, CHAN_Y);
1991
1992       micro_mul( &r[1], &r[1], &r[2] );
1993       micro_add( &r[0], &r[0], &r[1] );
1994
1995       FETCH(&r[1], 0, CHAN_Z);
1996       FETCH(&r[2], 1, CHAN_Z);
1997
1998       micro_mul( &r[1], &r[1], &r[2] );
1999       micro_add( &r[0], &r[0], &r[1] );
2000
2001       FETCH(&r[1], 0, CHAN_W);
2002       FETCH(&r[2], 1, CHAN_W);
2003
2004       micro_mul( &r[1], &r[1], &r[2] );
2005       micro_add( &r[0], &r[0], &r[1] );
2006
2007      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2008	 STORE( &r[0], 0, chan_index );
2009      }
2010      break;
2011
2012   case TGSI_OPCODE_DST:
2013      if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) {
2014	 STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_X );
2015      }
2016
2017      if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
2018	 FETCH( &r[0], 0, CHAN_Y );
2019	 FETCH( &r[1], 1, CHAN_Y);
2020	 micro_mul( &r[0], &r[0], &r[1] );
2021	 STORE( &r[0], 0, CHAN_Y );
2022      }
2023
2024      if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
2025	 FETCH( &r[0], 0, CHAN_Z );
2026	 STORE( &r[0], 0, CHAN_Z );
2027      }
2028
2029      if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
2030	 FETCH( &r[0], 1, CHAN_W );
2031	 STORE( &r[0], 0, CHAN_W );
2032      }
2033      break;
2034
2035   case TGSI_OPCODE_MIN:
2036      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2037         FETCH(&r[0], 0, chan_index);
2038         FETCH(&r[1], 1, chan_index);
2039
2040         /* XXX use micro_min()?? */
2041         micro_lt( &r[0], &r[0], &r[1], &r[0], &r[1] );
2042
2043         STORE(&r[0], 0, chan_index);
2044      }
2045      break;
2046
2047   case TGSI_OPCODE_MAX:
2048      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2049         FETCH(&r[0], 0, chan_index);
2050         FETCH(&r[1], 1, chan_index);
2051
2052         /* XXX use micro_max()?? */
2053         micro_lt( &r[0], &r[0], &r[1], &r[1], &r[0] );
2054
2055         STORE(&r[0], 0, chan_index );
2056      }
2057      break;
2058
2059   case TGSI_OPCODE_SLT:
2060   /* TGSI_OPCODE_SETLT */
2061      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2062         FETCH( &r[0], 0, chan_index );
2063         FETCH( &r[1], 1, chan_index );
2064         micro_lt( &r[0], &r[0], &r[1], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] );
2065         STORE( &r[0], 0, chan_index );
2066      }
2067      break;
2068
2069   case TGSI_OPCODE_SGE:
2070   /* TGSI_OPCODE_SETGE */
2071      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2072         FETCH( &r[0], 0, chan_index );
2073         FETCH( &r[1], 1, chan_index );
2074         micro_ge( &r[0], &r[0], &r[1], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] );
2075         STORE( &r[0], 0, chan_index );
2076      }
2077      break;
2078
2079   case TGSI_OPCODE_MAD:
2080   /* TGSI_OPCODE_MADD */
2081      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2082         FETCH( &r[0], 0, chan_index );
2083         FETCH( &r[1], 1, chan_index );
2084         micro_mul( &r[0], &r[0], &r[1] );
2085         FETCH( &r[1], 2, chan_index );
2086         micro_add( &r[0], &r[0], &r[1] );
2087         STORE( &r[0], 0, chan_index );
2088      }
2089      break;
2090
2091   case TGSI_OPCODE_SUB:
2092      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2093         FETCH(&r[0], 0, chan_index);
2094         FETCH(&r[1], 1, chan_index);
2095
2096         micro_sub( &r[0], &r[0], &r[1] );
2097
2098         STORE(&r[0], 0, chan_index);
2099      }
2100      break;
2101
2102   case TGSI_OPCODE_LERP:
2103   /* TGSI_OPCODE_LRP */
2104      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2105         FETCH(&r[0], 0, chan_index);
2106         FETCH(&r[1], 1, chan_index);
2107         FETCH(&r[2], 2, chan_index);
2108
2109         micro_sub( &r[1], &r[1], &r[2] );
2110         micro_mul( &r[0], &r[0], &r[1] );
2111         micro_add( &r[0], &r[0], &r[2] );
2112
2113         STORE(&r[0], 0, chan_index);
2114      }
2115      break;
2116
2117   case TGSI_OPCODE_CND:
2118      assert (0);
2119      break;
2120
2121   case TGSI_OPCODE_CND0:
2122      assert (0);
2123      break;
2124
2125   case TGSI_OPCODE_DOT2ADD:
2126      /* TGSI_OPCODE_DP2A */
2127      FETCH( &r[0], 0, CHAN_X );
2128      FETCH( &r[1], 1, CHAN_X );
2129      micro_mul( &r[0], &r[0], &r[1] );
2130
2131      FETCH( &r[1], 0, CHAN_Y );
2132      FETCH( &r[2], 1, CHAN_Y );
2133      micro_mul( &r[1], &r[1], &r[2] );
2134      micro_add( &r[0], &r[0], &r[1] );
2135
2136      FETCH( &r[2], 2, CHAN_X );
2137      micro_add( &r[0], &r[0], &r[2] );
2138
2139      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2140         STORE( &r[0], 0, chan_index );
2141      }
2142      break;
2143
2144   case TGSI_OPCODE_INDEX:
2145      assert (0);
2146      break;
2147
2148   case TGSI_OPCODE_NEGATE:
2149      assert (0);
2150      break;
2151
2152   case TGSI_OPCODE_FRAC:
2153   /* TGSI_OPCODE_FRC */
2154      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2155         FETCH( &r[0], 0, chan_index );
2156         micro_frc( &r[0], &r[0] );
2157         STORE( &r[0], 0, chan_index );
2158      }
2159      break;
2160
2161   case TGSI_OPCODE_CLAMP:
2162      assert (0);
2163      break;
2164
2165   case TGSI_OPCODE_FLOOR:
2166   /* TGSI_OPCODE_FLR */
2167      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2168         FETCH( &r[0], 0, chan_index );
2169         micro_flr( &r[0], &r[0] );
2170         STORE( &r[0], 0, chan_index );
2171      }
2172      break;
2173
2174   case TGSI_OPCODE_ROUND:
2175   case TGSI_OPCODE_ARR:
2176      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2177         FETCH( &r[0], 0, chan_index );
2178         micro_rnd( &r[0], &r[0] );
2179         STORE( &r[0], 0, chan_index );
2180      }
2181      break;
2182
2183   case TGSI_OPCODE_EXPBASE2:
2184    /* TGSI_OPCODE_EX2 */
2185      FETCH(&r[0], 0, CHAN_X);
2186
2187#if FAST_MATH
2188      micro_exp2( &r[0], &r[0] );
2189#else
2190      micro_pow( &r[0], &mach->Temps[TEMP_2_I].xyzw[TEMP_2_C], &r[0] );
2191#endif
2192
2193      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2194	 STORE( &r[0], 0, chan_index );
2195      }
2196      break;
2197
2198   case TGSI_OPCODE_LOGBASE2:
2199   /* TGSI_OPCODE_LG2 */
2200      FETCH( &r[0], 0, CHAN_X );
2201      micro_lg2( &r[0], &r[0] );
2202      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2203         STORE( &r[0], 0, chan_index );
2204      }
2205      break;
2206
2207   case TGSI_OPCODE_POWER:
2208      /* TGSI_OPCODE_POW */
2209      FETCH(&r[0], 0, CHAN_X);
2210      FETCH(&r[1], 1, CHAN_X);
2211
2212      micro_pow( &r[0], &r[0], &r[1] );
2213
2214      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2215	 STORE( &r[0], 0, chan_index );
2216      }
2217      break;
2218
2219   case TGSI_OPCODE_CROSSPRODUCT:
2220      /* TGSI_OPCODE_XPD */
2221      FETCH(&r[0], 0, CHAN_Y);
2222      FETCH(&r[1], 1, CHAN_Z);
2223
2224      micro_mul( &r[2], &r[0], &r[1] );
2225
2226      FETCH(&r[3], 0, CHAN_Z);
2227      FETCH(&r[4], 1, CHAN_Y);
2228
2229      micro_mul( &r[5], &r[3], &r[4] );
2230      micro_sub( &r[2], &r[2], &r[5] );
2231
2232      if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) {
2233         STORE( &r[2], 0, CHAN_X );
2234      }
2235
2236      FETCH(&r[2], 1, CHAN_X);
2237
2238      micro_mul( &r[3], &r[3], &r[2] );
2239
2240      FETCH(&r[5], 0, CHAN_X);
2241
2242      micro_mul( &r[1], &r[1], &r[5] );
2243      micro_sub( &r[3], &r[3], &r[1] );
2244
2245      if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
2246         STORE( &r[3], 0, CHAN_Y );
2247      }
2248
2249      micro_mul( &r[5], &r[5], &r[4] );
2250      micro_mul( &r[0], &r[0], &r[2] );
2251      micro_sub( &r[5], &r[5], &r[0] );
2252
2253      if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
2254         STORE( &r[5], 0, CHAN_Z );
2255      }
2256
2257      if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
2258         STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
2259      }
2260      break;
2261
2262    case TGSI_OPCODE_MULTIPLYMATRIX:
2263       assert (0);
2264       break;
2265
2266    case TGSI_OPCODE_ABS:
2267       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2268          FETCH(&r[0], 0, chan_index);
2269
2270          micro_abs( &r[0], &r[0] );
2271
2272          STORE(&r[0], 0, chan_index);
2273       }
2274       break;
2275
2276   case TGSI_OPCODE_RCC:
2277      assert (0);
2278      break;
2279
2280   case TGSI_OPCODE_DPH:
2281      FETCH(&r[0], 0, CHAN_X);
2282      FETCH(&r[1], 1, CHAN_X);
2283
2284      micro_mul( &r[0], &r[0], &r[1] );
2285
2286      FETCH(&r[1], 0, CHAN_Y);
2287      FETCH(&r[2], 1, CHAN_Y);
2288
2289      micro_mul( &r[1], &r[1], &r[2] );
2290      micro_add( &r[0], &r[0], &r[1] );
2291
2292      FETCH(&r[1], 0, CHAN_Z);
2293      FETCH(&r[2], 1, CHAN_Z);
2294
2295      micro_mul( &r[1], &r[1], &r[2] );
2296      micro_add( &r[0], &r[0], &r[1] );
2297
2298      FETCH(&r[1], 1, CHAN_W);
2299
2300      micro_add( &r[0], &r[0], &r[1] );
2301
2302      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2303	 STORE( &r[0], 0, chan_index );
2304      }
2305      break;
2306
2307   case TGSI_OPCODE_COS:
2308      FETCH(&r[0], 0, CHAN_X);
2309
2310      micro_cos( &r[0], &r[0] );
2311
2312      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2313	 STORE( &r[0], 0, chan_index );
2314      }
2315      break;
2316
2317   case TGSI_OPCODE_DDX:
2318      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2319         FETCH( &r[0], 0, chan_index );
2320         micro_ddx( &r[0], &r[0] );
2321         STORE( &r[0], 0, chan_index );
2322      }
2323      break;
2324
2325   case TGSI_OPCODE_DDY:
2326      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2327         FETCH( &r[0], 0, chan_index );
2328         micro_ddy( &r[0], &r[0] );
2329         STORE( &r[0], 0, chan_index );
2330      }
2331      break;
2332
2333   case TGSI_OPCODE_KILP:
2334      exec_kilp (mach, inst);
2335      break;
2336
2337   case TGSI_OPCODE_KIL:
2338      exec_kil (mach, inst);
2339      break;
2340
2341   case TGSI_OPCODE_PK2H:
2342      assert (0);
2343      break;
2344
2345   case TGSI_OPCODE_PK2US:
2346      assert (0);
2347      break;
2348
2349   case TGSI_OPCODE_PK4B:
2350      assert (0);
2351      break;
2352
2353   case TGSI_OPCODE_PK4UB:
2354      assert (0);
2355      break;
2356
2357   case TGSI_OPCODE_RFL:
2358      assert (0);
2359      break;
2360
2361   case TGSI_OPCODE_SEQ:
2362      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2363         FETCH( &r[0], 0, chan_index );
2364         FETCH( &r[1], 1, chan_index );
2365         micro_eq( &r[0], &r[0], &r[1],
2366                   &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C],
2367                   &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] );
2368         STORE( &r[0], 0, chan_index );
2369      }
2370      break;
2371
2372   case TGSI_OPCODE_SFL:
2373      assert (0);
2374      break;
2375
2376   case TGSI_OPCODE_SGT:
2377      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2378         FETCH( &r[0], 0, chan_index );
2379         FETCH( &r[1], 1, chan_index );
2380         micro_le( &r[0], &r[0], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C] );
2381         STORE( &r[0], 0, chan_index );
2382      }
2383      break;
2384
2385   case TGSI_OPCODE_SIN:
2386      FETCH( &r[0], 0, CHAN_X );
2387      micro_sin( &r[0], &r[0] );
2388      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2389         STORE( &r[0], 0, chan_index );
2390      }
2391      break;
2392
2393   case TGSI_OPCODE_SLE:
2394      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2395         FETCH( &r[0], 0, chan_index );
2396         FETCH( &r[1], 1, chan_index );
2397         micro_le( &r[0], &r[0], &r[1], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] );
2398         STORE( &r[0], 0, chan_index );
2399      }
2400      break;
2401
2402   case TGSI_OPCODE_SNE:
2403      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2404         FETCH( &r[0], 0, chan_index );
2405         FETCH( &r[1], 1, chan_index );
2406         micro_eq( &r[0], &r[0], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C] );
2407         STORE( &r[0], 0, chan_index );
2408      }
2409      break;
2410
2411   case TGSI_OPCODE_STR:
2412      assert (0);
2413      break;
2414
2415   case TGSI_OPCODE_TEX:
2416      /* simple texture lookup */
2417      /* src[0] = texcoord */
2418      /* src[1] = sampler unit */
2419      exec_tex(mach, inst, FALSE, FALSE);
2420      break;
2421
2422   case TGSI_OPCODE_TXB:
2423      /* Texture lookup with lod bias */
2424      /* src[0] = texcoord (src[0].w = LOD bias) */
2425      /* src[1] = sampler unit */
2426      exec_tex(mach, inst, TRUE, FALSE);
2427      break;
2428
2429   case TGSI_OPCODE_TXD:
      /* Texture lookup with explicit partial derivatives */
2431      /* src[0] = texcoord */
2432      /* src[1] = d[strq]/dx */
2433      /* src[2] = d[strq]/dy */
2434      /* src[3] = sampler unit */
2435      assert (0);
2436      break;
2437
2438   case TGSI_OPCODE_TXL:
      /* Texture lookup with explicit LOD */
2440      /* src[0] = texcoord (src[0].w = LOD) */
2441      /* src[1] = sampler unit */
2442      exec_tex(mach, inst, TRUE, FALSE);
2443      break;
2444
2445   case TGSI_OPCODE_TXP:
2446      /* Texture lookup with projection */
2447      /* src[0] = texcoord (src[0].w = projection) */
2448      /* src[1] = sampler unit */
2449      exec_tex(mach, inst, FALSE, TRUE);
2450      break;
2451
2452   case TGSI_OPCODE_UP2H:
2453      assert (0);
2454      break;
2455
2456   case TGSI_OPCODE_UP2US:
2457      assert (0);
2458      break;
2459
2460   case TGSI_OPCODE_UP4B:
2461      assert (0);
2462      break;
2463
2464   case TGSI_OPCODE_UP4UB:
2465      assert (0);
2466      break;
2467
2468   case TGSI_OPCODE_X2D:
2469      assert (0);
2470      break;
2471
2472   case TGSI_OPCODE_ARA:
2473      assert (0);
2474      break;
2475
2476   case TGSI_OPCODE_BRA:
2477      assert (0);
2478      break;
2479
2480   case TGSI_OPCODE_CAL:
2481      /* skip the call if no execution channels are enabled */
2482      if (mach->ExecMask) {
2483         /* do the call */
2484
2485         /* push the Cond, Loop, Cont stacks */
2486         assert(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING);
2487         mach->CondStack[mach->CondStackTop++] = mach->CondMask;
2488         assert(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
2489         mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask;
2490         assert(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
2491         mach->ContStack[mach->ContStackTop++] = mach->ContMask;
2492
2493         assert(mach->FuncStackTop < TGSI_EXEC_MAX_CALL_NESTING);
2494         mach->FuncStack[mach->FuncStackTop++] = mach->FuncMask;
2495
2496         /* note that PC was already incremented above */
2497         mach->CallStack[mach->CallStackTop++] = *pc;
2498         *pc = inst->InstructionExtLabel.Label;
2499      }
2500      break;
2501
2502   case TGSI_OPCODE_RET:
2503      mach->FuncMask &= ~mach->ExecMask;
2504      UPDATE_EXEC_MASK(mach);
2505
2506      if (mach->FuncMask == 0x0) {
         /* really return now (otherwise, keep executing) */
2508
2509         if (mach->CallStackTop == 0) {
2510            /* returning from main() */
2511            *pc = -1;
2512            return;
2513         }
2514         *pc = mach->CallStack[--mach->CallStackTop];
2515
2516         /* pop the Cond, Loop, Cont stacks */
2517         assert(mach->CondStackTop > 0);
2518         mach->CondMask = mach->CondStack[--mach->CondStackTop];
2519         assert(mach->LoopStackTop > 0);
2520         mach->LoopMask = mach->LoopStack[--mach->LoopStackTop];
2521         assert(mach->ContStackTop > 0);
2522         mach->ContMask = mach->ContStack[--mach->ContStackTop];
2523         assert(mach->FuncStackTop > 0);
2524         mach->FuncMask = mach->FuncStack[--mach->FuncStackTop];
2525
2526         UPDATE_EXEC_MASK(mach);
2527      }
2528      break;
2529
2530   case TGSI_OPCODE_SSG:
2531   /* TGSI_OPCODE_SGN */
2532      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2533         FETCH( &r[0], 0, chan_index );
2534         micro_sgn( &r[0], &r[0] );
2535         STORE( &r[0], 0, chan_index );
2536      }
2537      break;
2538
2539   case TGSI_OPCODE_CMP:
2540      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2541         FETCH(&r[0], 0, chan_index);
2542         FETCH(&r[1], 1, chan_index);
2543         FETCH(&r[2], 2, chan_index);
2544
2545         micro_lt( &r[0], &r[0], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &r[1], &r[2] );
2546
2547         STORE(&r[0], 0, chan_index);
2548      }
2549      break;
2550
2551   case TGSI_OPCODE_SCS:
2552      if( IS_CHANNEL_ENABLED( *inst, CHAN_X ) || IS_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
2553         FETCH( &r[0], 0, CHAN_X );
2554      }
2555      if( IS_CHANNEL_ENABLED( *inst, CHAN_X ) ) {
2556         micro_cos( &r[1], &r[0] );
2557         STORE( &r[1], 0, CHAN_X );
2558      }
2559      if( IS_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
2560         micro_sin( &r[1], &r[0] );
2561         STORE( &r[1], 0, CHAN_Y );
2562      }
2563      if( IS_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
2564         STORE( &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], 0, CHAN_Z );
2565      }
2566      if( IS_CHANNEL_ENABLED( *inst, CHAN_W ) ) {
2567         STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
2568      }
2569      break;
2570
2571   case TGSI_OPCODE_NRM:
2572      /* 3-component vector normalize */
2573      {
2574         union tgsi_exec_channel tmp, dot;
2575
2576         /* tmp = dp3(src0, src0): */
2577         FETCH( &r[0], 0, CHAN_X );
2578         micro_mul( &tmp, &r[0], &r[0] );
2579
2580         FETCH( &r[1], 0, CHAN_Y );
2581         micro_mul( &dot, &r[1], &r[1] );
2582         micro_add( &tmp, &tmp, &dot );
2583
2584         FETCH( &r[2], 0, CHAN_Z );
2585         micro_mul( &dot, &r[2], &r[2] );
2586         micro_add( &tmp, &tmp, &dot );
2587
2588         /* tmp = 1 / sqrt(tmp) */
2589         micro_sqrt( &tmp, &tmp );
2590         micro_div( &tmp, &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &tmp );
2591
2592         /* note: w channel is undefined */
2593         FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2594            /* chan = chan * tmp */
2595            micro_mul( &r[chan_index], &tmp, &r[chan_index] );
2596            STORE( &r[chan_index], 0, chan_index );
2597         }
2598      }
2599      break;
2600
2601   case TGSI_OPCODE_NRM4:
2602      /* 4-component vector normalize */
2603      {
2604         union tgsi_exec_channel tmp, dot;
2605
2606         /* tmp = dp4(src0, src0): */
2607         FETCH( &r[0], 0, CHAN_X );
2608         micro_mul( &tmp, &r[0], &r[0] );
2609
2610         FETCH( &r[1], 0, CHAN_Y );
2611         micro_mul( &dot, &r[1], &r[1] );
2612         micro_add( &tmp, &tmp, &dot );
2613
2614         FETCH( &r[2], 0, CHAN_Z );
2615         micro_mul( &dot, &r[2], &r[2] );
2616         micro_add( &tmp, &tmp, &dot );
2617
2618         FETCH( &r[3], 0, CHAN_W );
2619         micro_mul( &dot, &r[3], &r[3] );
2620         micro_add( &tmp, &tmp, &dot );
2621
2622         /* tmp = 1 / sqrt(tmp) */
2623         micro_sqrt( &tmp, &tmp );
2624         micro_div( &tmp, &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &tmp );
2625
2626         FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2627            /* chan = chan * tmp */
2628            micro_mul( &r[chan_index], &tmp, &r[chan_index] );
2629            STORE( &r[chan_index], 0, chan_index );
2630         }
2631      }
2632      break;
2633
2634   case TGSI_OPCODE_DIV:
2635      assert( 0 );
2636      break;
2637
2638   case TGSI_OPCODE_DP2:
2639      FETCH( &r[0], 0, CHAN_X );
2640      FETCH( &r[1], 1, CHAN_X );
2641      micro_mul( &r[0], &r[0], &r[1] );
2642
2643      FETCH( &r[1], 0, CHAN_Y );
2644      FETCH( &r[2], 1, CHAN_Y );
2645      micro_mul( &r[1], &r[1], &r[2] );
2646      micro_add( &r[0], &r[0], &r[1] );
2647
2648      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2649         STORE( &r[0], 0, chan_index );
2650      }
2651      break;
2652
2653   case TGSI_OPCODE_IF:
2654      /* push CondMask */
2655      assert(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING);
2656      mach->CondStack[mach->CondStackTop++] = mach->CondMask;
2657      FETCH( &r[0], 0, CHAN_X );
2658      /* update CondMask */
2659      if( ! r[0].u[0] ) {
2660         mach->CondMask &= ~0x1;
2661      }
2662      if( ! r[0].u[1] ) {
2663         mach->CondMask &= ~0x2;
2664      }
2665      if( ! r[0].u[2] ) {
2666         mach->CondMask &= ~0x4;
2667      }
2668      if( ! r[0].u[3] ) {
2669         mach->CondMask &= ~0x8;
2670      }
2671      UPDATE_EXEC_MASK(mach);
2672      /* Todo: If CondMask==0, jump to ELSE */
2673      break;
2674
2675   case TGSI_OPCODE_ELSE:
2676      /* invert CondMask wrt previous mask */
2677      {
2678         uint prevMask;
2679         assert(mach->CondStackTop > 0);
2680         prevMask = mach->CondStack[mach->CondStackTop - 1];
2681         mach->CondMask = ~mach->CondMask & prevMask;
2682         UPDATE_EXEC_MASK(mach);
2683         /* Todo: If CondMask==0, jump to ENDIF */
2684      }
2685      break;
2686
2687   case TGSI_OPCODE_ENDIF:
2688      /* pop CondMask */
2689      assert(mach->CondStackTop > 0);
2690      mach->CondMask = mach->CondStack[--mach->CondStackTop];
2691      UPDATE_EXEC_MASK(mach);
2692      break;
2693
2694   case TGSI_OPCODE_END:
2695      /* halt execution */
2696      *pc = -1;
2697      break;
2698
2699   case TGSI_OPCODE_REP:
2700      assert (0);
2701      break;
2702
2703   case TGSI_OPCODE_ENDREP:
2704       assert (0);
2705       break;
2706
2707   case TGSI_OPCODE_PUSHA:
2708      assert (0);
2709      break;
2710
2711   case TGSI_OPCODE_POPA:
2712      assert (0);
2713      break;
2714
2715   case TGSI_OPCODE_CEIL:
2716      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2717         FETCH( &r[0], 0, chan_index );
2718         micro_ceil( &r[0], &r[0] );
2719         STORE( &r[0], 0, chan_index );
2720      }
2721      break;
2722
2723   case TGSI_OPCODE_I2F:
2724      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2725         FETCH( &r[0], 0, chan_index );
2726         micro_i2f( &r[0], &r[0] );
2727         STORE( &r[0], 0, chan_index );
2728      }
2729      break;
2730
2731   case TGSI_OPCODE_NOT:
2732      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2733         FETCH( &r[0], 0, chan_index );
2734         micro_not( &r[0], &r[0] );
2735         STORE( &r[0], 0, chan_index );
2736      }
2737      break;
2738
2739   case TGSI_OPCODE_TRUNC:
2740      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2741         FETCH( &r[0], 0, chan_index );
2742         micro_trunc( &r[0], &r[0] );
2743         STORE( &r[0], 0, chan_index );
2744      }
2745      break;
2746
2747   case TGSI_OPCODE_SHL:
2748      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2749         FETCH( &r[0], 0, chan_index );
2750         FETCH( &r[1], 1, chan_index );
2751         micro_shl( &r[0], &r[0], &r[1] );
2752         STORE( &r[0], 0, chan_index );
2753      }
2754      break;
2755
2756   case TGSI_OPCODE_SHR:
2757      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2758         FETCH( &r[0], 0, chan_index );
2759         FETCH( &r[1], 1, chan_index );
2760         micro_ishr( &r[0], &r[0], &r[1] );
2761         STORE( &r[0], 0, chan_index );
2762      }
2763      break;
2764
2765   case TGSI_OPCODE_AND:
2766      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2767         FETCH( &r[0], 0, chan_index );
2768         FETCH( &r[1], 1, chan_index );
2769         micro_and( &r[0], &r[0], &r[1] );
2770         STORE( &r[0], 0, chan_index );
2771      }
2772      break;
2773
2774   case TGSI_OPCODE_OR:
2775      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2776         FETCH( &r[0], 0, chan_index );
2777         FETCH( &r[1], 1, chan_index );
2778         micro_or( &r[0], &r[0], &r[1] );
2779         STORE( &r[0], 0, chan_index );
2780      }
2781      break;
2782
2783   case TGSI_OPCODE_MOD:
2784      assert (0);
2785      break;
2786
2787   case TGSI_OPCODE_XOR:
2788      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2789         FETCH( &r[0], 0, chan_index );
2790         FETCH( &r[1], 1, chan_index );
2791         micro_xor( &r[0], &r[0], &r[1] );
2792         STORE( &r[0], 0, chan_index );
2793      }
2794      break;
2795
2796   case TGSI_OPCODE_SAD:
2797      assert (0);
2798      break;
2799
2800   case TGSI_OPCODE_TXF:
2801      assert (0);
2802      break;
2803
2804   case TGSI_OPCODE_TXQ:
2805      assert (0);
2806      break;
2807
2808   case TGSI_OPCODE_EMIT:
2809      mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] += 16;
2810      mach->Primitives[mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]]++;
2811      break;
2812
2813   case TGSI_OPCODE_ENDPRIM:
2814      mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]++;
2815      mach->Primitives[mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]] = 0;
2816      break;
2817
2818   case TGSI_OPCODE_LOOP:
2819      /* fall-through (for now) */
2820   case TGSI_OPCODE_BGNLOOP2:
2821      /* push LoopMask and ContMasks */
2822      assert(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
2823      mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask;
2824      assert(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
2825      mach->ContStack[mach->ContStackTop++] = mach->ContMask;
2826      break;
2827
2828   case TGSI_OPCODE_ENDLOOP:
2829      /* fall-through (for now at least) */
2830   case TGSI_OPCODE_ENDLOOP2:
2831      /* Restore ContMask, but don't pop */
2832      assert(mach->ContStackTop > 0);
2833      mach->ContMask = mach->ContStack[mach->ContStackTop - 1];
2834      UPDATE_EXEC_MASK(mach);
2835      if (mach->ExecMask) {
2836         /* repeat loop: jump to instruction just past BGNLOOP */
2837         *pc = inst->InstructionExtLabel.Label + 1;
2838      }
2839      else {
2840         /* exit loop: pop LoopMask */
2841         assert(mach->LoopStackTop > 0);
2842         mach->LoopMask = mach->LoopStack[--mach->LoopStackTop];
2843         /* pop ContMask */
2844         assert(mach->ContStackTop > 0);
2845         mach->ContMask = mach->ContStack[--mach->ContStackTop];
2846      }
2847      UPDATE_EXEC_MASK(mach);
2848      break;
2849
2850   case TGSI_OPCODE_BRK:
2851      /* turn off loop channels for each enabled exec channel */
2852      mach->LoopMask &= ~mach->ExecMask;
2853      /* Todo: if mach->LoopMask == 0, jump to end of loop */
2854      UPDATE_EXEC_MASK(mach);
2855      break;
2856
2857   case TGSI_OPCODE_CONT:
2858      /* turn off cont channels for each enabled exec channel */
2859      mach->ContMask &= ~mach->ExecMask;
2860      /* Todo: if mach->LoopMask == 0, jump to end of loop */
2861      UPDATE_EXEC_MASK(mach);
2862      break;
2863
2864   case TGSI_OPCODE_BGNSUB:
2865      /* no-op */
2866      break;
2867
2868   case TGSI_OPCODE_ENDSUB:
2869      /* no-op */
2870      break;
2871
2872   case TGSI_OPCODE_NOISE1:
2873      assert( 0 );
2874      break;
2875
2876   case TGSI_OPCODE_NOISE2:
2877      assert( 0 );
2878      break;
2879
2880   case TGSI_OPCODE_NOISE3:
2881      assert( 0 );
2882      break;
2883
2884   case TGSI_OPCODE_NOISE4:
2885      assert( 0 );
2886      break;
2887
2888   case TGSI_OPCODE_NOP:
2889      break;
2890
2891   default:
2892      assert( 0 );
2893   }
2894}
2895
2896
2897/**
2898 * Run TGSI interpreter.
2899 * \return bitmask of "alive" quad components
2900 */
2901uint
2902tgsi_exec_machine_run( struct tgsi_exec_machine *mach )
2903{
2904   uint i;
2905   int pc = 0;
2906
2907   mach->CondMask = 0xf;
2908   mach->LoopMask = 0xf;
2909   mach->ContMask = 0xf;
2910   mach->FuncMask = 0xf;
2911   mach->ExecMask = 0xf;
2912
2913   mach->CondStackTop = 0; /* temporarily subvert this assertion */
2914   assert(mach->CondStackTop == 0);
2915   assert(mach->LoopStackTop == 0);
2916   assert(mach->ContStackTop == 0);
2917   assert(mach->CallStackTop == 0);
2918
2919   mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] = 0;
2920   mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] = 0;
2921
2922   if( mach->Processor == TGSI_PROCESSOR_GEOMETRY ) {
2923      mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0] = 0;
2924      mach->Primitives[0] = 0;
2925   }
2926
2927   for (i = 0; i < QUAD_SIZE; i++) {
2928      mach->Temps[TEMP_CC_I].xyzw[TEMP_CC_C].u[i] =
2929         (TGSI_EXEC_CC_EQ << TGSI_EXEC_CC_X_SHIFT) |
2930         (TGSI_EXEC_CC_EQ << TGSI_EXEC_CC_Y_SHIFT) |
2931         (TGSI_EXEC_CC_EQ << TGSI_EXEC_CC_Z_SHIFT) |
2932         (TGSI_EXEC_CC_EQ << TGSI_EXEC_CC_W_SHIFT);
2933   }
2934
2935   /* execute declarations (interpolants) */
2936   for (i = 0; i < mach->NumDeclarations; i++) {
2937      exec_declaration( mach, mach->Declarations+i );
2938   }
2939
2940   /* execute instructions, until pc is set to -1 */
2941   while (pc != -1) {
2942      assert(pc < (int) mach->NumInstructions);
2943      exec_instruction( mach, mach->Instructions + pc, &pc );
2944   }
2945
2946#if 0
2947   /* we scale from floats in [0,1] to Zbuffer ints in sp_quad_depth_test.c */
2948   if (mach->Processor == TGSI_PROCESSOR_FRAGMENT) {
2949      /*
2950       * Scale back depth component.
2951       */
2952      for (i = 0; i < 4; i++)
2953         mach->Outputs[0].xyzw[2].f[i] *= ctx->DrawBuffer->_DepthMaxF;
2954   }
2955#endif
2956
2957   return ~mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
2958}
2959
2960
2961