tgsi_exec.c revision 38afa2934077ce1cf67d1c553f872d1e14fb0794
1/**************************************************************************
2 *
3 * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28/**
29 * TGSI interpreter/executor.
30 *
31 * Flow control information:
32 *
33 * Since we operate on 'quads' (4 pixels or 4 vertices in parallel)
34 * flow control statements (IF/ELSE/ENDIF, LOOP/ENDLOOP) require special
35 * care since a condition may be true for some quad components but false
36 * for other components.
37 *
38 * We basically execute all statements (even if they're in the part of
39 * an IF/ELSE clause that's "not taken") and use a special mask to
40 * control writing to destination registers.  This is the ExecMask.
41 * See store_dest().
42 *
 * The ExecMask is computed from four other masks (CondMask, LoopMask,
 * ContMask and FuncMask; see UPDATE_EXEC_MASK).  The first three are
 * controlled by the flow control instructions IF/ELSE/ENDIF, LOOP/ENDLOOP
 * and CONT.
46 *
47 *
48 * Authors:
49 *   Michal Krol
50 *   Brian Paul
51 */
52
53#include "pipe/p_compiler.h"
54#include "pipe/p_state.h"
55#include "pipe/p_shader_tokens.h"
56#include "tgsi/tgsi_parse.h"
57#include "tgsi/tgsi_util.h"
58#include "tgsi_exec.h"
59#include "util/u_memory.h"
60#include "util/u_math.h"
61
/*
 * When non-zero, micro_exp2(), micro_lg2() and micro_pow() use the
 * util_fast_exp2/util_fast_log2/util_fast_pow helpers instead of
 * powf()/logf().
 */
#define FAST_MATH 1
63
/*
 * Lane indices of the four elements of a quad, named by their position.
 * micro_ddx() and micro_ddy() use these to pick the lanes to difference.
 */
#define TILE_TOP_LEFT     0
#define TILE_TOP_RIGHT    1
#define TILE_BOTTOM_LEFT  2
#define TILE_BOTTOM_RIGHT 3
68
/* Numeric indices of the X, Y, Z and W channels. */
#define CHAN_X  0
#define CHAN_Y  1
#define CHAN_Z  2
#define CHAN_W  3
73
/*
 * Shorthand locations of various utility registers (_I = Index, _C = Channel).
 *
 * Each name is a shorter alias for the corresponding TGSI_EXEC_TEMP_* macro.
 * A utility value is addressed as mach->Temps[X_I].xyzw[X_C]; see
 * tgsi_exec_machine_init() for the constants that are preloaded.
 */
#define TEMP_0_I           TGSI_EXEC_TEMP_00000000_I
#define TEMP_0_C           TGSI_EXEC_TEMP_00000000_C
#define TEMP_7F_I          TGSI_EXEC_TEMP_7FFFFFFF_I
#define TEMP_7F_C          TGSI_EXEC_TEMP_7FFFFFFF_C
#define TEMP_80_I          TGSI_EXEC_TEMP_80000000_I
#define TEMP_80_C          TGSI_EXEC_TEMP_80000000_C
#define TEMP_FF_I          TGSI_EXEC_TEMP_FFFFFFFF_I
#define TEMP_FF_C          TGSI_EXEC_TEMP_FFFFFFFF_C
#define TEMP_1_I           TGSI_EXEC_TEMP_ONE_I
#define TEMP_1_C           TGSI_EXEC_TEMP_ONE_C
#define TEMP_2_I           TGSI_EXEC_TEMP_TWO_I
#define TEMP_2_C           TGSI_EXEC_TEMP_TWO_C
#define TEMP_128_I         TGSI_EXEC_TEMP_128_I
#define TEMP_128_C         TGSI_EXEC_TEMP_128_C
#define TEMP_M128_I        TGSI_EXEC_TEMP_MINUS_128_I
#define TEMP_M128_C        TGSI_EXEC_TEMP_MINUS_128_C
#define TEMP_KILMASK_I     TGSI_EXEC_TEMP_KILMASK_I
#define TEMP_KILMASK_C     TGSI_EXEC_TEMP_KILMASK_C
#define TEMP_OUTPUT_I      TGSI_EXEC_TEMP_OUTPUT_I
#define TEMP_OUTPUT_C      TGSI_EXEC_TEMP_OUTPUT_C
#define TEMP_PRIMITIVE_I   TGSI_EXEC_TEMP_PRIMITIVE_I
#define TEMP_PRIMITIVE_C   TGSI_EXEC_TEMP_PRIMITIVE_C
#define TEMP_CC_I          TGSI_EXEC_TEMP_CC_I
#define TEMP_CC_C          TGSI_EXEC_TEMP_CC_C
#define TEMP_3_I           TGSI_EXEC_TEMP_THREE_I
#define TEMP_3_C           TGSI_EXEC_TEMP_THREE_C
#define TEMP_HALF_I        TGSI_EXEC_TEMP_HALF_I
#define TEMP_HALF_C        TGSI_EXEC_TEMP_HALF_C
#define TEMP_R0            TGSI_EXEC_TEMP_R0
106
/*
 * Non-zero when bit CHAN is set in the write mask of INST's first
 * destination register (FullDstRegisters[0]).  INST is an instruction
 * object, not a pointer.
 */
#define IS_CHANNEL_ENABLED(INST, CHAN)\
   ((INST).FullDstRegisters[0].DstRegister.WriteMask & (1 << (CHAN)))
109
/*
 * Same test as IS_CHANNEL_ENABLED, but against the write mask of INST's
 * second destination register (FullDstRegisters[1]).
 */
#define IS_CHANNEL_ENABLED2(INST, CHAN)\
   ((INST).FullDstRegisters[1].DstRegister.WriteMask & (1 << (CHAN)))
112
/*
 * Loop header: runs the following statement once for every CHAN in
 * [0, NUM_CHANNELS) for which IS_CHANNEL_ENABLED(INST, CHAN) is non-zero.
 * CHAN must be a modifiable integer lvalue; it is parenthesized so that
 * expressions such as *ptr are incremented correctly.
 * The expansion ends in an unbraced 'if', so do not follow the loop body
 * with an 'else'.
 */
#define FOR_EACH_ENABLED_CHANNEL(INST, CHAN)\
   for ((CHAN) = 0; (CHAN) < NUM_CHANNELS; (CHAN)++)\
      if (IS_CHANNEL_ENABLED( INST, CHAN ))
116
/*
 * Loop header: runs the following statement once for every CHAN in
 * [0, NUM_CHANNELS) for which IS_CHANNEL_ENABLED2(INST, CHAN) is non-zero,
 * i.e. it iterates the write mask of the second destination register.
 * CHAN must be a modifiable integer lvalue; it is parenthesized so that
 * expressions such as *ptr are incremented correctly.
 * The expansion ends in an unbraced 'if', so do not follow the loop body
 * with an 'else'.
 */
#define FOR_EACH_ENABLED_CHANNEL2(INST, CHAN)\
   for ((CHAN) = 0; (CHAN) < NUM_CHANNELS; (CHAN)++)\
      if (IS_CHANNEL_ENABLED2( INST, CHAN ))
120
121
/**
 * Recompute MACH->ExecMask as the bitwise AND of the CondMask, LoopMask,
 * ContMask and FuncMask fields.
 * MACH is a pointer expression; it is evaluated several times, so it must
 * not have side effects.
 */
#define UPDATE_EXEC_MASK(MACH) \
      ((MACH)->ExecMask = (MACH)->CondMask & (MACH)->LoopMask & \
                          (MACH)->ContMask & (MACH)->FuncMask)
125
126/**
127 * Initialize machine state by expanding tokens to full instructions,
128 * allocating temporary storage, setting up constants, etc.
129 * After this, we can call tgsi_exec_machine_run() many times.
130 */
131void
132tgsi_exec_machine_bind_shader(
133   struct tgsi_exec_machine *mach,
134   const struct tgsi_token *tokens,
135   uint numSamplers,
136   struct tgsi_sampler **samplers)
137{
138   uint k;
139   struct tgsi_parse_context parse;
140   struct tgsi_exec_labels *labels = &mach->Labels;
141   struct tgsi_full_instruction *instructions;
142   struct tgsi_full_declaration *declarations;
143   uint maxInstructions = 10, numInstructions = 0;
144   uint maxDeclarations = 10, numDeclarations = 0;
145   uint instno = 0;
146
147#if 0
148   tgsi_dump(tokens, 0);
149#endif
150
151   util_init_math();
152
153   mach->Tokens = tokens;
154   mach->Samplers = samplers;
155
156   k = tgsi_parse_init (&parse, mach->Tokens);
157   if (k != TGSI_PARSE_OK) {
158      debug_printf( "Problem parsing!\n" );
159      return;
160   }
161
162   mach->Processor = parse.FullHeader.Processor.Processor;
163   mach->ImmLimit = 0;
164   labels->count = 0;
165
166   declarations = (struct tgsi_full_declaration *)
167      MALLOC( maxDeclarations * sizeof(struct tgsi_full_declaration) );
168
169   if (!declarations) {
170      return;
171   }
172
173   instructions = (struct tgsi_full_instruction *)
174      MALLOC( maxInstructions * sizeof(struct tgsi_full_instruction) );
175
176   if (!instructions) {
177      FREE( declarations );
178      return;
179   }
180
181   while( !tgsi_parse_end_of_tokens( &parse ) ) {
182      uint pointer = parse.Position;
183      uint i;
184
185      tgsi_parse_token( &parse );
186      switch( parse.FullToken.Token.Type ) {
187      case TGSI_TOKEN_TYPE_DECLARATION:
188         /* save expanded declaration */
189         if (numDeclarations == maxDeclarations) {
190            declarations = REALLOC(declarations,
191                                   maxDeclarations
192                                   * sizeof(struct tgsi_full_declaration),
193                                   (maxDeclarations + 10)
194                                   * sizeof(struct tgsi_full_declaration));
195            maxDeclarations += 10;
196         }
197         memcpy(declarations + numDeclarations,
198                &parse.FullToken.FullDeclaration,
199                sizeof(declarations[0]));
200         numDeclarations++;
201         break;
202
203      case TGSI_TOKEN_TYPE_IMMEDIATE:
204         {
205            uint size = parse.FullToken.FullImmediate.Immediate.NrTokens - 1;
206            assert( size % 4 == 0 );
207            assert( mach->ImmLimit + size / 4 <= TGSI_EXEC_NUM_IMMEDIATES );
208
209            for( i = 0; i < size; i++ ) {
210               mach->Imms[mach->ImmLimit + i / 4][i % 4] =
211		  parse.FullToken.FullImmediate.u.ImmediateFloat32[i].Float;
212            }
213            mach->ImmLimit += size / 4;
214         }
215         break;
216
217      case TGSI_TOKEN_TYPE_INSTRUCTION:
218         assert( labels->count < MAX_LABELS );
219
220         labels->labels[labels->count][0] = instno;
221         labels->labels[labels->count][1] = pointer;
222         labels->count++;
223
224         /* save expanded instruction */
225         if (numInstructions == maxInstructions) {
226            instructions = REALLOC(instructions,
227                                   maxInstructions
228                                   * sizeof(struct tgsi_full_instruction),
229                                   (maxInstructions + 10)
230                                   * sizeof(struct tgsi_full_instruction));
231            maxInstructions += 10;
232         }
233         memcpy(instructions + numInstructions,
234                &parse.FullToken.FullInstruction,
235                sizeof(instructions[0]));
236         numInstructions++;
237         break;
238
239      default:
240         assert( 0 );
241      }
242   }
243   tgsi_parse_free (&parse);
244
245   if (mach->Declarations) {
246      FREE( mach->Declarations );
247   }
248   mach->Declarations = declarations;
249   mach->NumDeclarations = numDeclarations;
250
251   if (mach->Instructions) {
252      FREE( mach->Instructions );
253   }
254   mach->Instructions = instructions;
255   mach->NumInstructions = numInstructions;
256}
257
258
259void
260tgsi_exec_machine_init(
261   struct tgsi_exec_machine *mach )
262{
263   uint i;
264
265   mach->Temps = (struct tgsi_exec_vector *) tgsi_align_128bit( mach->_Temps);
266   mach->Addrs = &mach->Temps[TGSI_EXEC_TEMP_ADDR];
267
268   /* Setup constants. */
269   for( i = 0; i < 4; i++ ) {
270      mach->Temps[TEMP_0_I].xyzw[TEMP_0_C].u[i] = 0x00000000;
271      mach->Temps[TEMP_7F_I].xyzw[TEMP_7F_C].u[i] = 0x7FFFFFFF;
272      mach->Temps[TEMP_80_I].xyzw[TEMP_80_C].u[i] = 0x80000000;
273      mach->Temps[TEMP_FF_I].xyzw[TEMP_FF_C].u[i] = 0xFFFFFFFF;
274      mach->Temps[TEMP_1_I].xyzw[TEMP_1_C].f[i] = 1.0f;
275      mach->Temps[TEMP_2_I].xyzw[TEMP_2_C].f[i] = 2.0f;
276      mach->Temps[TEMP_128_I].xyzw[TEMP_128_C].f[i] = 128.0f;
277      mach->Temps[TEMP_M128_I].xyzw[TEMP_M128_C].f[i] = -128.0f;
278      mach->Temps[TEMP_3_I].xyzw[TEMP_3_C].f[i] = 3.0f;
279      mach->Temps[TEMP_HALF_I].xyzw[TEMP_HALF_C].f[i] = 0.5f;
280   }
281}
282
283
284void
285tgsi_exec_machine_free_data(struct tgsi_exec_machine *mach)
286{
287   if (mach->Instructions) {
288      FREE(mach->Instructions);
289      mach->Instructions = NULL;
290      mach->NumInstructions = 0;
291   }
292   if (mach->Declarations) {
293      FREE(mach->Declarations);
294      mach->Declarations = NULL;
295      mach->NumDeclarations = 0;
296   }
297}
298
299
300static void
301micro_abs(
302   union tgsi_exec_channel *dst,
303   const union tgsi_exec_channel *src )
304{
305   dst->f[0] = fabsf( src->f[0] );
306   dst->f[1] = fabsf( src->f[1] );
307   dst->f[2] = fabsf( src->f[2] );
308   dst->f[3] = fabsf( src->f[3] );
309}
310
311static void
312micro_add(
313   union tgsi_exec_channel *dst,
314   const union tgsi_exec_channel *src0,
315   const union tgsi_exec_channel *src1 )
316{
317   dst->f[0] = src0->f[0] + src1->f[0];
318   dst->f[1] = src0->f[1] + src1->f[1];
319   dst->f[2] = src0->f[2] + src1->f[2];
320   dst->f[3] = src0->f[3] + src1->f[3];
321}
322
#if 0
/** Per-lane signed integer addition: dst = src0 + src1 (currently disabled). */
static void
micro_iadd(
   union tgsi_exec_channel *dst,
   const union tgsi_exec_channel *src0,
   const union tgsi_exec_channel *src1 )
{
   int lane;

   for (lane = 0; lane < 4; lane++) {
      dst->i[lane] = src0->i[lane] + src1->i[lane];
   }
}
#endif
336
337static void
338micro_and(
339   union tgsi_exec_channel *dst,
340   const union tgsi_exec_channel *src0,
341   const union tgsi_exec_channel *src1 )
342{
343   dst->u[0] = src0->u[0] & src1->u[0];
344   dst->u[1] = src0->u[1] & src1->u[1];
345   dst->u[2] = src0->u[2] & src1->u[2];
346   dst->u[3] = src0->u[3] & src1->u[3];
347}
348
349static void
350micro_ceil(
351   union tgsi_exec_channel *dst,
352   const union tgsi_exec_channel *src )
353{
354   dst->f[0] = ceilf( src->f[0] );
355   dst->f[1] = ceilf( src->f[1] );
356   dst->f[2] = ceilf( src->f[2] );
357   dst->f[3] = ceilf( src->f[3] );
358}
359
360static void
361micro_cos(
362   union tgsi_exec_channel *dst,
363   const union tgsi_exec_channel *src )
364{
365   dst->f[0] = cosf( src->f[0] );
366   dst->f[1] = cosf( src->f[1] );
367   dst->f[2] = cosf( src->f[2] );
368   dst->f[3] = cosf( src->f[3] );
369}
370
371static void
372micro_ddx(
373   union tgsi_exec_channel *dst,
374   const union tgsi_exec_channel *src )
375{
376   dst->f[0] =
377   dst->f[1] =
378   dst->f[2] =
379   dst->f[3] = src->f[TILE_BOTTOM_RIGHT] - src->f[TILE_BOTTOM_LEFT];
380}
381
382static void
383micro_ddy(
384   union tgsi_exec_channel *dst,
385   const union tgsi_exec_channel *src )
386{
387   dst->f[0] =
388   dst->f[1] =
389   dst->f[2] =
390   dst->f[3] = src->f[TILE_TOP_LEFT] - src->f[TILE_BOTTOM_LEFT];
391}
392
393static void
394micro_div(
395   union tgsi_exec_channel *dst,
396   const union tgsi_exec_channel *src0,
397   const union tgsi_exec_channel *src1 )
398{
399   if (src1->f[0] != 0) {
400      dst->f[0] = src0->f[0] / src1->f[0];
401   }
402   if (src1->f[1] != 0) {
403      dst->f[1] = src0->f[1] / src1->f[1];
404   }
405   if (src1->f[2] != 0) {
406      dst->f[2] = src0->f[2] / src1->f[2];
407   }
408   if (src1->f[3] != 0) {
409      dst->f[3] = src0->f[3] / src1->f[3];
410   }
411}
412
#if 0
/** Per-lane unsigned division: dst = src0 / src1 (currently disabled). */
static void
micro_udiv(
   union tgsi_exec_channel *dst,
   const union tgsi_exec_channel *src0,
   const union tgsi_exec_channel *src1 )
{
   int lane;

   for (lane = 0; lane < 4; lane++) {
      dst->u[lane] = src0->u[lane] / src1->u[lane];
   }
}
#endif
426
427static void
428micro_eq(
429   union tgsi_exec_channel *dst,
430   const union tgsi_exec_channel *src0,
431   const union tgsi_exec_channel *src1,
432   const union tgsi_exec_channel *src2,
433   const union tgsi_exec_channel *src3 )
434{
435   dst->f[0] = src0->f[0] == src1->f[0] ? src2->f[0] : src3->f[0];
436   dst->f[1] = src0->f[1] == src1->f[1] ? src2->f[1] : src3->f[1];
437   dst->f[2] = src0->f[2] == src1->f[2] ? src2->f[2] : src3->f[2];
438   dst->f[3] = src0->f[3] == src1->f[3] ? src2->f[3] : src3->f[3];
439}
440
#if 0
/**
 * Per-lane signed integer select on equality:
 * dst = (src0 == src1) ? src2 : src3 (currently disabled).
 */
static void
micro_ieq(
   union tgsi_exec_channel *dst,
   const union tgsi_exec_channel *src0,
   const union tgsi_exec_channel *src1,
   const union tgsi_exec_channel *src2,
   const union tgsi_exec_channel *src3 )
{
   int lane;

   for (lane = 0; lane < 4; lane++) {
      if (src0->i[lane] == src1->i[lane]) {
         dst->i[lane] = src2->i[lane];
      }
      else {
         dst->i[lane] = src3->i[lane];
      }
   }
}
#endif
456
457static void
458micro_exp2(
459   union tgsi_exec_channel *dst,
460   const union tgsi_exec_channel *src)
461{
462#if FAST_MATH
463   dst->f[0] = util_fast_exp2( src->f[0] );
464   dst->f[1] = util_fast_exp2( src->f[1] );
465   dst->f[2] = util_fast_exp2( src->f[2] );
466   dst->f[3] = util_fast_exp2( src->f[3] );
467#else
468   dst->f[0] = powf( 2.0f, src->f[0] );
469   dst->f[1] = powf( 2.0f, src->f[1] );
470   dst->f[2] = powf( 2.0f, src->f[2] );
471   dst->f[3] = powf( 2.0f, src->f[3] );
472#endif
473}
474
#if 0
/** Per-lane float to unsigned conversion via a C cast (currently disabled). */
static void
micro_f2ut(
   union tgsi_exec_channel *dst,
   const union tgsi_exec_channel *src )
{
   int lane;

   for (lane = 0; lane < 4; lane++) {
      dst->u[lane] = (uint) src->f[lane];
   }
}
#endif
487
488static void
489micro_flr(
490   union tgsi_exec_channel *dst,
491   const union tgsi_exec_channel *src )
492{
493   dst->f[0] = floorf( src->f[0] );
494   dst->f[1] = floorf( src->f[1] );
495   dst->f[2] = floorf( src->f[2] );
496   dst->f[3] = floorf( src->f[3] );
497}
498
499static void
500micro_frc(
501   union tgsi_exec_channel *dst,
502   const union tgsi_exec_channel *src )
503{
504   dst->f[0] = src->f[0] - floorf( src->f[0] );
505   dst->f[1] = src->f[1] - floorf( src->f[1] );
506   dst->f[2] = src->f[2] - floorf( src->f[2] );
507   dst->f[3] = src->f[3] - floorf( src->f[3] );
508}
509
510static void
511micro_i2f(
512   union tgsi_exec_channel *dst,
513   const union tgsi_exec_channel *src )
514{
515   dst->f[0] = (float) src->i[0];
516   dst->f[1] = (float) src->i[1];
517   dst->f[2] = (float) src->i[2];
518   dst->f[3] = (float) src->i[3];
519}
520
521static void
522micro_lg2(
523   union tgsi_exec_channel *dst,
524   const union tgsi_exec_channel *src )
525{
526#if FAST_MATH
527   dst->f[0] = util_fast_log2( src->f[0] );
528   dst->f[1] = util_fast_log2( src->f[1] );
529   dst->f[2] = util_fast_log2( src->f[2] );
530   dst->f[3] = util_fast_log2( src->f[3] );
531#else
532   dst->f[0] = logf( src->f[0] ) * 1.442695f;
533   dst->f[1] = logf( src->f[1] ) * 1.442695f;
534   dst->f[2] = logf( src->f[2] ) * 1.442695f;
535   dst->f[3] = logf( src->f[3] ) * 1.442695f;
536#endif
537}
538
539static void
540micro_le(
541   union tgsi_exec_channel *dst,
542   const union tgsi_exec_channel *src0,
543   const union tgsi_exec_channel *src1,
544   const union tgsi_exec_channel *src2,
545   const union tgsi_exec_channel *src3 )
546{
547   dst->f[0] = src0->f[0] <= src1->f[0] ? src2->f[0] : src3->f[0];
548   dst->f[1] = src0->f[1] <= src1->f[1] ? src2->f[1] : src3->f[1];
549   dst->f[2] = src0->f[2] <= src1->f[2] ? src2->f[2] : src3->f[2];
550   dst->f[3] = src0->f[3] <= src1->f[3] ? src2->f[3] : src3->f[3];
551}
552
553static void
554micro_lt(
555   union tgsi_exec_channel *dst,
556   const union tgsi_exec_channel *src0,
557   const union tgsi_exec_channel *src1,
558   const union tgsi_exec_channel *src2,
559   const union tgsi_exec_channel *src3 )
560{
561   dst->f[0] = src0->f[0] < src1->f[0] ? src2->f[0] : src3->f[0];
562   dst->f[1] = src0->f[1] < src1->f[1] ? src2->f[1] : src3->f[1];
563   dst->f[2] = src0->f[2] < src1->f[2] ? src2->f[2] : src3->f[2];
564   dst->f[3] = src0->f[3] < src1->f[3] ? src2->f[3] : src3->f[3];
565}
566
#if 0
/**
 * Per-lane signed integer select on less-than:
 * dst = (src0 < src1) ? src2 : src3 (currently disabled).
 */
static void
micro_ilt(
   union tgsi_exec_channel *dst,
   const union tgsi_exec_channel *src0,
   const union tgsi_exec_channel *src1,
   const union tgsi_exec_channel *src2,
   const union tgsi_exec_channel *src3 )
{
   int lane;

   for (lane = 0; lane < 4; lane++) {
      if (src0->i[lane] < src1->i[lane]) {
         dst->i[lane] = src2->i[lane];
      }
      else {
         dst->i[lane] = src3->i[lane];
      }
   }
}
#endif
582
#if 0
/**
 * Per-lane unsigned select on less-than:
 * dst = (src0 < src1) ? src2 : src3 (currently disabled).
 */
static void
micro_ult(
   union tgsi_exec_channel *dst,
   const union tgsi_exec_channel *src0,
   const union tgsi_exec_channel *src1,
   const union tgsi_exec_channel *src2,
   const union tgsi_exec_channel *src3 )
{
   int lane;

   for (lane = 0; lane < 4; lane++) {
      if (src0->u[lane] < src1->u[lane]) {
         dst->u[lane] = src2->u[lane];
      }
      else {
         dst->u[lane] = src3->u[lane];
      }
   }
}
#endif
598
599static void
600micro_max(
601   union tgsi_exec_channel *dst,
602   const union tgsi_exec_channel *src0,
603   const union tgsi_exec_channel *src1 )
604{
605   dst->f[0] = src0->f[0] > src1->f[0] ? src0->f[0] : src1->f[0];
606   dst->f[1] = src0->f[1] > src1->f[1] ? src0->f[1] : src1->f[1];
607   dst->f[2] = src0->f[2] > src1->f[2] ? src0->f[2] : src1->f[2];
608   dst->f[3] = src0->f[3] > src1->f[3] ? src0->f[3] : src1->f[3];
609}
610
#if 0
/**
 * Per-lane signed integer maximum: dst = src0 when src0 > src1,
 * otherwise src1 (currently disabled).
 */
static void
micro_imax(
   union tgsi_exec_channel *dst,
   const union tgsi_exec_channel *src0,
   const union tgsi_exec_channel *src1 )
{
   int lane;

   for (lane = 0; lane < 4; lane++) {
      if (src0->i[lane] > src1->i[lane]) {
         dst->i[lane] = src0->i[lane];
      }
      else {
         dst->i[lane] = src1->i[lane];
      }
   }
}
#endif
624
#if 0
/**
 * Per-lane unsigned maximum: dst = src0 when src0 > src1,
 * otherwise src1 (currently disabled).
 */
static void
micro_umax(
   union tgsi_exec_channel *dst,
   const union tgsi_exec_channel *src0,
   const union tgsi_exec_channel *src1 )
{
   int lane;

   for (lane = 0; lane < 4; lane++) {
      if (src0->u[lane] > src1->u[lane]) {
         dst->u[lane] = src0->u[lane];
      }
      else {
         dst->u[lane] = src1->u[lane];
      }
   }
}
#endif
638
639static void
640micro_min(
641   union tgsi_exec_channel *dst,
642   const union tgsi_exec_channel *src0,
643   const union tgsi_exec_channel *src1 )
644{
645   dst->f[0] = src0->f[0] < src1->f[0] ? src0->f[0] : src1->f[0];
646   dst->f[1] = src0->f[1] < src1->f[1] ? src0->f[1] : src1->f[1];
647   dst->f[2] = src0->f[2] < src1->f[2] ? src0->f[2] : src1->f[2];
648   dst->f[3] = src0->f[3] < src1->f[3] ? src0->f[3] : src1->f[3];
649}
650
#if 0
/**
 * Per-lane signed integer minimum: dst = src0 when src0 < src1,
 * otherwise src1 (currently disabled).
 */
static void
micro_imin(
   union tgsi_exec_channel *dst,
   const union tgsi_exec_channel *src0,
   const union tgsi_exec_channel *src1 )
{
   int lane;

   for (lane = 0; lane < 4; lane++) {
      if (src0->i[lane] < src1->i[lane]) {
         dst->i[lane] = src0->i[lane];
      }
      else {
         dst->i[lane] = src1->i[lane];
      }
   }
}
#endif
664
#if 0
/**
 * Per-lane unsigned minimum: dst = src0 when src0 < src1,
 * otherwise src1 (currently disabled).
 */
static void
micro_umin(
   union tgsi_exec_channel *dst,
   const union tgsi_exec_channel *src0,
   const union tgsi_exec_channel *src1 )
{
   int lane;

   for (lane = 0; lane < 4; lane++) {
      if (src0->u[lane] < src1->u[lane]) {
         dst->u[lane] = src0->u[lane];
      }
      else {
         dst->u[lane] = src1->u[lane];
      }
   }
}
#endif
678
#if 0
/** Per-lane unsigned remainder: dst = src0 % src1 (currently disabled). */
static void
micro_umod(
   union tgsi_exec_channel *dst,
   const union tgsi_exec_channel *src0,
   const union tgsi_exec_channel *src1 )
{
   int lane;

   for (lane = 0; lane < 4; lane++) {
      dst->u[lane] = src0->u[lane] % src1->u[lane];
   }
}
#endif
692
693static void
694micro_mul(
695   union tgsi_exec_channel *dst,
696   const union tgsi_exec_channel *src0,
697   const union tgsi_exec_channel *src1 )
698{
699   dst->f[0] = src0->f[0] * src1->f[0];
700   dst->f[1] = src0->f[1] * src1->f[1];
701   dst->f[2] = src0->f[2] * src1->f[2];
702   dst->f[3] = src0->f[3] * src1->f[3];
703}
704
#if 0
/** Per-lane signed integer multiplication: dst = src0 * src1 (currently disabled). */
static void
micro_imul(
   union tgsi_exec_channel *dst,
   const union tgsi_exec_channel *src0,
   const union tgsi_exec_channel *src1 )
{
   int lane;

   for (lane = 0; lane < 4; lane++) {
      dst->i[lane] = src0->i[lane] * src1->i[lane];
   }
}
#endif
718
#if 0
/**
 * Per-lane signed multiply with two destinations (currently disabled):
 * dst1 receives src0 * src1 computed at the lane's own width, and dst0 is
 * cleared to zero afterwards.
 */
static void
micro_imul64(
   union tgsi_exec_channel *dst0,
   union tgsi_exec_channel *dst1,
   const union tgsi_exec_channel *src0,
   const union tgsi_exec_channel *src1 )
{
   int lane;

   /* All products are stored before dst0 is touched. */
   for (lane = 0; lane < 4; lane++) {
      dst1->i[lane] = src0->i[lane] * src1->i[lane];
   }
   for (lane = 0; lane < 4; lane++) {
      dst0->i[lane] = 0;
   }
}
#endif
737
#if 0
/**
 * Per-lane unsigned multiply with two destinations (currently disabled):
 * dst1 receives src0 * src1 computed at the lane's own width, and dst0 is
 * cleared to zero afterwards.
 */
static void
micro_umul64(
   union tgsi_exec_channel *dst0,
   union tgsi_exec_channel *dst1,
   const union tgsi_exec_channel *src0,
   const union tgsi_exec_channel *src1 )
{
   int lane;

   /* All products are stored before dst0 is touched. */
   for (lane = 0; lane < 4; lane++) {
      dst1->u[lane] = src0->u[lane] * src1->u[lane];
   }
   for (lane = 0; lane < 4; lane++) {
      dst0->u[lane] = 0;
   }
}
#endif
756
757
#if 0
/**
 * Per-lane conditional move on the unsigned view (currently disabled):
 * dst = src0 ? src1 : src2.
 */
static void
micro_movc(
   union tgsi_exec_channel *dst,
   const union tgsi_exec_channel *src0,
   const union tgsi_exec_channel *src1,
   const union tgsi_exec_channel *src2 )
{
   int lane;

   for (lane = 0; lane < 4; lane++) {
      if (src0->u[lane]) {
         dst->u[lane] = src1->u[lane];
      }
      else {
         dst->u[lane] = src2->u[lane];
      }
   }
}
#endif
772
773static void
774micro_neg(
775   union tgsi_exec_channel *dst,
776   const union tgsi_exec_channel *src )
777{
778   dst->f[0] = -src->f[0];
779   dst->f[1] = -src->f[1];
780   dst->f[2] = -src->f[2];
781   dst->f[3] = -src->f[3];
782}
783
#if 0
/** Per-lane signed integer negation: dst = -src (currently disabled). */
static void
micro_ineg(
   union tgsi_exec_channel *dst,
   const union tgsi_exec_channel *src )
{
   int lane;

   for (lane = 0; lane < 4; lane++) {
      dst->i[lane] = -src->i[lane];
   }
}
#endif
796
797static void
798micro_not(
799   union tgsi_exec_channel *dst,
800   const union tgsi_exec_channel *src )
801{
802   dst->u[0] = ~src->u[0];
803   dst->u[1] = ~src->u[1];
804   dst->u[2] = ~src->u[2];
805   dst->u[3] = ~src->u[3];
806}
807
808static void
809micro_or(
810   union tgsi_exec_channel *dst,
811   const union tgsi_exec_channel *src0,
812   const union tgsi_exec_channel *src1 )
813{
814   dst->u[0] = src0->u[0] | src1->u[0];
815   dst->u[1] = src0->u[1] | src1->u[1];
816   dst->u[2] = src0->u[2] | src1->u[2];
817   dst->u[3] = src0->u[3] | src1->u[3];
818}
819
820static void
821micro_pow(
822   union tgsi_exec_channel *dst,
823   const union tgsi_exec_channel *src0,
824   const union tgsi_exec_channel *src1 )
825{
826#if FAST_MATH
827   dst->f[0] = util_fast_pow( src0->f[0], src1->f[0] );
828   dst->f[1] = util_fast_pow( src0->f[1], src1->f[1] );
829   dst->f[2] = util_fast_pow( src0->f[2], src1->f[2] );
830   dst->f[3] = util_fast_pow( src0->f[3], src1->f[3] );
831#else
832   dst->f[0] = powf( src0->f[0], src1->f[0] );
833   dst->f[1] = powf( src0->f[1], src1->f[1] );
834   dst->f[2] = powf( src0->f[2], src1->f[2] );
835   dst->f[3] = powf( src0->f[3], src1->f[3] );
836#endif
837}
838
839static void
840micro_rnd(
841   union tgsi_exec_channel *dst,
842   const union tgsi_exec_channel *src )
843{
844   dst->f[0] = floorf( src->f[0] + 0.5f );
845   dst->f[1] = floorf( src->f[1] + 0.5f );
846   dst->f[2] = floorf( src->f[2] + 0.5f );
847   dst->f[3] = floorf( src->f[3] + 0.5f );
848}
849
850static void
851micro_sgn(
852   union tgsi_exec_channel *dst,
853   const union tgsi_exec_channel *src )
854{
855   dst->f[0] = src->f[0] < 0.0f ? -1.0f : src->f[0] > 0.0f ? 1.0f : 0.0f;
856   dst->f[1] = src->f[1] < 0.0f ? -1.0f : src->f[1] > 0.0f ? 1.0f : 0.0f;
857   dst->f[2] = src->f[2] < 0.0f ? -1.0f : src->f[2] > 0.0f ? 1.0f : 0.0f;
858   dst->f[3] = src->f[3] < 0.0f ? -1.0f : src->f[3] > 0.0f ? 1.0f : 0.0f;
859}
860
861static void
862micro_shl(
863   union tgsi_exec_channel *dst,
864   const union tgsi_exec_channel *src0,
865   const union tgsi_exec_channel *src1 )
866{
867   dst->i[0] = src0->i[0] << src1->i[0];
868   dst->i[1] = src0->i[1] << src1->i[1];
869   dst->i[2] = src0->i[2] << src1->i[2];
870   dst->i[3] = src0->i[3] << src1->i[3];
871}
872
873static void
874micro_ishr(
875   union tgsi_exec_channel *dst,
876   const union tgsi_exec_channel *src0,
877   const union tgsi_exec_channel *src1 )
878{
879   dst->i[0] = src0->i[0] >> src1->i[0];
880   dst->i[1] = src0->i[1] >> src1->i[1];
881   dst->i[2] = src0->i[2] >> src1->i[2];
882   dst->i[3] = src0->i[3] >> src1->i[3];
883}
884
885static void
886micro_trunc(
887   union tgsi_exec_channel *dst,
888   const union tgsi_exec_channel *src0 )
889{
890   dst->f[0] = (float) (int) src0->f[0];
891   dst->f[1] = (float) (int) src0->f[1];
892   dst->f[2] = (float) (int) src0->f[2];
893   dst->f[3] = (float) (int) src0->f[3];
894}
895
#if 0
/** Per-lane right shift of the unsigned view: dst = src0 >> src1 (currently disabled). */
static void
micro_ushr(
   union tgsi_exec_channel *dst,
   const union tgsi_exec_channel *src0,
   const union tgsi_exec_channel *src1 )
{
   int lane;

   for (lane = 0; lane < 4; lane++) {
      dst->u[lane] = src0->u[lane] >> src1->u[lane];
   }
}
#endif
909
910static void
911micro_sin(
912   union tgsi_exec_channel *dst,
913   const union tgsi_exec_channel *src )
914{
915   dst->f[0] = sinf( src->f[0] );
916   dst->f[1] = sinf( src->f[1] );
917   dst->f[2] = sinf( src->f[2] );
918   dst->f[3] = sinf( src->f[3] );
919}
920
921static void
922micro_sqrt( union tgsi_exec_channel *dst,
923            const union tgsi_exec_channel *src )
924{
925   dst->f[0] = sqrtf( src->f[0] );
926   dst->f[1] = sqrtf( src->f[1] );
927   dst->f[2] = sqrtf( src->f[2] );
928   dst->f[3] = sqrtf( src->f[3] );
929}
930
931static void
932micro_sub(
933   union tgsi_exec_channel *dst,
934   const union tgsi_exec_channel *src0,
935   const union tgsi_exec_channel *src1 )
936{
937   dst->f[0] = src0->f[0] - src1->f[0];
938   dst->f[1] = src0->f[1] - src1->f[1];
939   dst->f[2] = src0->f[2] - src1->f[2];
940   dst->f[3] = src0->f[3] - src1->f[3];
941}
942
#if 0
/** Per-lane unsigned to float conversion: dst.f = (float) src.u (currently disabled). */
static void
micro_u2f(
   union tgsi_exec_channel *dst,
   const union tgsi_exec_channel *src )
{
   int lane;

   for (lane = 0; lane < 4; lane++) {
      dst->f[lane] = (float) src->u[lane];
   }
}
#endif
955
956static void
957micro_xor(
958   union tgsi_exec_channel *dst,
959   const union tgsi_exec_channel *src0,
960   const union tgsi_exec_channel *src1 )
961{
962   dst->u[0] = src0->u[0] ^ src1->u[0];
963   dst->u[1] = src0->u[1] ^ src1->u[1];
964   dst->u[2] = src0->u[2] ^ src1->u[2];
965   dst->u[3] = src0->u[3] ^ src1->u[3];
966}
967
968static void
969fetch_src_file_channel(
970   const struct tgsi_exec_machine *mach,
971   const uint file,
972   const uint swizzle,
973   const union tgsi_exec_channel *index,
974   union tgsi_exec_channel *chan )
975{
976   switch( swizzle ) {
977   case TGSI_EXTSWIZZLE_X:
978   case TGSI_EXTSWIZZLE_Y:
979   case TGSI_EXTSWIZZLE_Z:
980   case TGSI_EXTSWIZZLE_W:
981      switch( file ) {
982      case TGSI_FILE_CONSTANT:
983         assert(mach->Consts);
984         if (index->i[0] < 0)
985            chan->f[0] = 0.0f;
986         else
987            chan->f[0] = mach->Consts[index->i[0]][swizzle];
988         if (index->i[1] < 0)
989            chan->f[1] = 0.0f;
990         else
991            chan->f[1] = mach->Consts[index->i[1]][swizzle];
992         if (index->i[2] < 0)
993            chan->f[2] = 0.0f;
994         else
995            chan->f[2] = mach->Consts[index->i[2]][swizzle];
996         if (index->i[3] < 0)
997            chan->f[3] = 0.0f;
998         else
999            chan->f[3] = mach->Consts[index->i[3]][swizzle];
1000         break;
1001
1002      case TGSI_FILE_INPUT:
1003         chan->u[0] = mach->Inputs[index->i[0]].xyzw[swizzle].u[0];
1004         chan->u[1] = mach->Inputs[index->i[1]].xyzw[swizzle].u[1];
1005         chan->u[2] = mach->Inputs[index->i[2]].xyzw[swizzle].u[2];
1006         chan->u[3] = mach->Inputs[index->i[3]].xyzw[swizzle].u[3];
1007         break;
1008
1009      case TGSI_FILE_TEMPORARY:
1010         assert(index->i[0] < TGSI_EXEC_NUM_TEMPS);
1011         chan->u[0] = mach->Temps[index->i[0]].xyzw[swizzle].u[0];
1012         chan->u[1] = mach->Temps[index->i[1]].xyzw[swizzle].u[1];
1013         chan->u[2] = mach->Temps[index->i[2]].xyzw[swizzle].u[2];
1014         chan->u[3] = mach->Temps[index->i[3]].xyzw[swizzle].u[3];
1015         break;
1016
1017      case TGSI_FILE_IMMEDIATE:
1018         assert( index->i[0] < (int) mach->ImmLimit );
1019         chan->f[0] = mach->Imms[index->i[0]][swizzle];
1020         assert( index->i[1] < (int) mach->ImmLimit );
1021         chan->f[1] = mach->Imms[index->i[1]][swizzle];
1022         assert( index->i[2] < (int) mach->ImmLimit );
1023         chan->f[2] = mach->Imms[index->i[2]][swizzle];
1024         assert( index->i[3] < (int) mach->ImmLimit );
1025         chan->f[3] = mach->Imms[index->i[3]][swizzle];
1026         break;
1027
1028      case TGSI_FILE_ADDRESS:
1029         chan->u[0] = mach->Addrs[index->i[0]].xyzw[swizzle].u[0];
1030         chan->u[1] = mach->Addrs[index->i[1]].xyzw[swizzle].u[1];
1031         chan->u[2] = mach->Addrs[index->i[2]].xyzw[swizzle].u[2];
1032         chan->u[3] = mach->Addrs[index->i[3]].xyzw[swizzle].u[3];
1033         break;
1034
1035      case TGSI_FILE_OUTPUT:
1036         /* vertex/fragment output vars can be read too */
1037         chan->u[0] = mach->Outputs[index->i[0]].xyzw[swizzle].u[0];
1038         chan->u[1] = mach->Outputs[index->i[1]].xyzw[swizzle].u[1];
1039         chan->u[2] = mach->Outputs[index->i[2]].xyzw[swizzle].u[2];
1040         chan->u[3] = mach->Outputs[index->i[3]].xyzw[swizzle].u[3];
1041         break;
1042
1043      default:
1044         assert( 0 );
1045      }
1046      break;
1047
1048   case TGSI_EXTSWIZZLE_ZERO:
1049      *chan = mach->Temps[TEMP_0_I].xyzw[TEMP_0_C];
1050      break;
1051
1052   case TGSI_EXTSWIZZLE_ONE:
1053      *chan = mach->Temps[TEMP_1_I].xyzw[TEMP_1_C];
1054      break;
1055
1056   default:
1057      assert( 0 );
1058   }
1059}
1060
1061static void
1062fetch_source(
1063   const struct tgsi_exec_machine *mach,
1064   union tgsi_exec_channel *chan,
1065   const struct tgsi_full_src_register *reg,
1066   const uint chan_index )
1067{
1068   union tgsi_exec_channel index;
1069   uint swizzle;
1070
1071   /* We start with a direct index into a register file.
1072    *
1073    *    file[1],
1074    *    where:
1075    *       file = SrcRegister.File
1076    *       [1] = SrcRegister.Index
1077    */
1078   index.i[0] =
1079   index.i[1] =
1080   index.i[2] =
1081   index.i[3] = reg->SrcRegister.Index;
1082
1083   /* There is an extra source register that indirectly subscripts
1084    * a register file. The direct index now becomes an offset
1085    * that is being added to the indirect register.
1086    *
1087    *    file[ind[2].x+1],
1088    *    where:
1089    *       ind = SrcRegisterInd.File
1090    *       [2] = SrcRegisterInd.Index
1091    *       .x = SrcRegisterInd.SwizzleX
1092    */
1093   if (reg->SrcRegister.Indirect) {
1094      union tgsi_exec_channel index2;
1095      union tgsi_exec_channel indir_index;
1096      const uint execmask = mach->ExecMask;
1097      uint i;
1098
1099      /* which address register (always zero now) */
1100      index2.i[0] =
1101      index2.i[1] =
1102      index2.i[2] =
1103      index2.i[3] = reg->SrcRegisterInd.Index;
1104
1105      /* get current value of address register[swizzle] */
1106      swizzle = tgsi_util_get_src_register_swizzle( &reg->SrcRegisterInd, CHAN_X );
1107      fetch_src_file_channel(
1108         mach,
1109         reg->SrcRegisterInd.File,
1110         swizzle,
1111         &index2,
1112         &indir_index );
1113
1114      /* add value of address register to the offset */
1115      index.i[0] += (int) indir_index.f[0];
1116      index.i[1] += (int) indir_index.f[1];
1117      index.i[2] += (int) indir_index.f[2];
1118      index.i[3] += (int) indir_index.f[3];
1119
1120      /* for disabled execution channels, zero-out the index to
1121       * avoid using a potential garbage value.
1122       */
1123      for (i = 0; i < QUAD_SIZE; i++) {
1124         if ((execmask & (1 << i)) == 0)
1125            index.i[i] = 0;
1126      }
1127   }
1128
1129   /* There is an extra source register that is a second
1130    * subscript to a register file. Effectively it means that
1131    * the register file is actually a 2D array of registers.
1132    *
1133    *    file[1][3] == file[1*sizeof(file[1])+3],
1134    *    where:
1135    *       [3] = SrcRegisterDim.Index
1136    */
1137   if (reg->SrcRegister.Dimension) {
1138      /* The size of the first-order array depends on the register file type.
1139       * We need to multiply the index to the first array to get an effective,
1140       * "flat" index that points to the beginning of the second-order array.
1141       */
1142      switch (reg->SrcRegister.File) {
1143      case TGSI_FILE_INPUT:
1144         index.i[0] *= TGSI_EXEC_MAX_INPUT_ATTRIBS;
1145         index.i[1] *= TGSI_EXEC_MAX_INPUT_ATTRIBS;
1146         index.i[2] *= TGSI_EXEC_MAX_INPUT_ATTRIBS;
1147         index.i[3] *= TGSI_EXEC_MAX_INPUT_ATTRIBS;
1148         break;
1149      case TGSI_FILE_CONSTANT:
1150         index.i[0] *= TGSI_EXEC_MAX_CONST_BUFFER;
1151         index.i[1] *= TGSI_EXEC_MAX_CONST_BUFFER;
1152         index.i[2] *= TGSI_EXEC_MAX_CONST_BUFFER;
1153         index.i[3] *= TGSI_EXEC_MAX_CONST_BUFFER;
1154         break;
1155      default:
1156         assert( 0 );
1157      }
1158
1159      index.i[0] += reg->SrcRegisterDim.Index;
1160      index.i[1] += reg->SrcRegisterDim.Index;
1161      index.i[2] += reg->SrcRegisterDim.Index;
1162      index.i[3] += reg->SrcRegisterDim.Index;
1163
1164      /* Again, the second subscript index can be addressed indirectly
1165       * identically to the first one.
1166       * Nothing stops us from indirectly addressing the indirect register,
1167       * but there is no need for that, so we won't exercise it.
1168       *
1169       *    file[1][ind[4].y+3],
1170       *    where:
1171       *       ind = SrcRegisterDimInd.File
1172       *       [4] = SrcRegisterDimInd.Index
1173       *       .y = SrcRegisterDimInd.SwizzleX
1174       */
1175      if (reg->SrcRegisterDim.Indirect) {
1176         union tgsi_exec_channel index2;
1177         union tgsi_exec_channel indir_index;
1178         const uint execmask = mach->ExecMask;
1179         uint i;
1180
1181         index2.i[0] =
1182         index2.i[1] =
1183         index2.i[2] =
1184         index2.i[3] = reg->SrcRegisterDimInd.Index;
1185
1186         swizzle = tgsi_util_get_src_register_swizzle( &reg->SrcRegisterDimInd, CHAN_X );
1187         fetch_src_file_channel(
1188            mach,
1189            reg->SrcRegisterDimInd.File,
1190            swizzle,
1191            &index2,
1192            &indir_index );
1193
1194         index.i[0] += (int) indir_index.f[0];
1195         index.i[1] += (int) indir_index.f[1];
1196         index.i[2] += (int) indir_index.f[2];
1197         index.i[3] += (int) indir_index.f[3];
1198
1199         /* for disabled execution channels, zero-out the index to
1200          * avoid using a potential garbage value.
1201          */
1202         for (i = 0; i < QUAD_SIZE; i++) {
1203            if ((execmask & (1 << i)) == 0)
1204               index.i[i] = 0;
1205         }
1206      }
1207
1208      /* If by any chance there was a need for a 3D array of register
1209       * files, we would have to check whether SrcRegisterDim is followed
1210       * by a dimension register and continue the saga.
1211       */
1212   }
1213
1214   swizzle = tgsi_util_get_full_src_register_extswizzle( reg, chan_index );
1215   fetch_src_file_channel(
1216      mach,
1217      reg->SrcRegister.File,
1218      swizzle,
1219      &index,
1220      chan );
1221
1222   switch (tgsi_util_get_full_src_register_sign_mode( reg, chan_index )) {
1223   case TGSI_UTIL_SIGN_CLEAR:
1224      micro_abs( chan, chan );
1225      break;
1226
1227   case TGSI_UTIL_SIGN_SET:
1228      micro_abs( chan, chan );
1229      micro_neg( chan, chan );
1230      break;
1231
1232   case TGSI_UTIL_SIGN_TOGGLE:
1233      micro_neg( chan, chan );
1234      break;
1235
1236   case TGSI_UTIL_SIGN_KEEP:
1237      break;
1238   }
1239
1240   if (reg->SrcRegisterExtMod.Complement) {
1241      micro_sub( chan, &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], chan );
1242   }
1243}
1244
1245static void
1246store_dest(
1247   struct tgsi_exec_machine *mach,
1248   const union tgsi_exec_channel *chan,
1249   const struct tgsi_full_dst_register *reg,
1250   const struct tgsi_full_instruction *inst,
1251   uint chan_index )
1252{
1253   uint i;
1254   union tgsi_exec_channel null;
1255   union tgsi_exec_channel *dst;
1256   uint execmask = mach->ExecMask;
1257
1258   switch (reg->DstRegister.File) {
1259   case TGSI_FILE_NULL:
1260      dst = &null;
1261      break;
1262
1263   case TGSI_FILE_OUTPUT:
1264      dst = &mach->Outputs[mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0]
1265                           + reg->DstRegister.Index].xyzw[chan_index];
1266      break;
1267
1268   case TGSI_FILE_TEMPORARY:
1269      assert( reg->DstRegister.Index < TGSI_EXEC_NUM_TEMPS );
1270      dst = &mach->Temps[reg->DstRegister.Index].xyzw[chan_index];
1271      break;
1272
1273   case TGSI_FILE_ADDRESS:
1274      dst = &mach->Addrs[reg->DstRegister.Index].xyzw[chan_index];
1275      break;
1276
1277   default:
1278      assert( 0 );
1279      return;
1280   }
1281
1282   if (inst->InstructionExtNv.CondFlowEnable) {
1283      union tgsi_exec_channel *cc = &mach->Temps[TEMP_CC_I].xyzw[TEMP_CC_C];
1284      uint swizzle;
1285      uint shift;
1286      uint mask;
1287      uint test;
1288
1289      /* Only CC0 supported.
1290       */
1291      assert( inst->InstructionExtNv.CondFlowIndex < 1 );
1292
1293      switch (chan_index) {
1294      case CHAN_X:
1295         swizzle = inst->InstructionExtNv.CondSwizzleX;
1296         break;
1297      case CHAN_Y:
1298         swizzle = inst->InstructionExtNv.CondSwizzleY;
1299         break;
1300      case CHAN_Z:
1301         swizzle = inst->InstructionExtNv.CondSwizzleZ;
1302         break;
1303      case CHAN_W:
1304         swizzle = inst->InstructionExtNv.CondSwizzleW;
1305         break;
1306      default:
1307         assert( 0 );
1308         return;
1309      }
1310
1311      switch (swizzle) {
1312      case TGSI_SWIZZLE_X:
1313         shift = TGSI_EXEC_CC_X_SHIFT;
1314         mask = TGSI_EXEC_CC_X_MASK;
1315         break;
1316      case TGSI_SWIZZLE_Y:
1317         shift = TGSI_EXEC_CC_Y_SHIFT;
1318         mask = TGSI_EXEC_CC_Y_MASK;
1319         break;
1320      case TGSI_SWIZZLE_Z:
1321         shift = TGSI_EXEC_CC_Z_SHIFT;
1322         mask = TGSI_EXEC_CC_Z_MASK;
1323         break;
1324      case TGSI_SWIZZLE_W:
1325         shift = TGSI_EXEC_CC_W_SHIFT;
1326         mask = TGSI_EXEC_CC_W_MASK;
1327         break;
1328      default:
1329         assert( 0 );
1330         return;
1331      }
1332
1333      switch (inst->InstructionExtNv.CondMask) {
1334      case TGSI_CC_GT:
1335         test = ~(TGSI_EXEC_CC_GT << shift) & mask;
1336         for (i = 0; i < QUAD_SIZE; i++)
1337            if (cc->u[i] & test)
1338               execmask &= ~(1 << i);
1339         break;
1340
1341      case TGSI_CC_EQ:
1342         test = ~(TGSI_EXEC_CC_EQ << shift) & mask;
1343         for (i = 0; i < QUAD_SIZE; i++)
1344            if (cc->u[i] & test)
1345               execmask &= ~(1 << i);
1346         break;
1347
1348      case TGSI_CC_LT:
1349         test = ~(TGSI_EXEC_CC_LT << shift) & mask;
1350         for (i = 0; i < QUAD_SIZE; i++)
1351            if (cc->u[i] & test)
1352               execmask &= ~(1 << i);
1353         break;
1354
1355      case TGSI_CC_GE:
1356         test = ~((TGSI_EXEC_CC_GT | TGSI_EXEC_CC_EQ) << shift) & mask;
1357         for (i = 0; i < QUAD_SIZE; i++)
1358            if (cc->u[i] & test)
1359               execmask &= ~(1 << i);
1360         break;
1361
1362      case TGSI_CC_LE:
1363         test = ~((TGSI_EXEC_CC_LT | TGSI_EXEC_CC_EQ) << shift) & mask;
1364         for (i = 0; i < QUAD_SIZE; i++)
1365            if (cc->u[i] & test)
1366               execmask &= ~(1 << i);
1367         break;
1368
1369      case TGSI_CC_NE:
1370         test = ~((TGSI_EXEC_CC_GT | TGSI_EXEC_CC_LT | TGSI_EXEC_CC_UN) << shift) & mask;
1371         for (i = 0; i < QUAD_SIZE; i++)
1372            if (cc->u[i] & test)
1373               execmask &= ~(1 << i);
1374         break;
1375
1376      case TGSI_CC_TR:
1377         break;
1378
1379      case TGSI_CC_FL:
1380         for (i = 0; i < QUAD_SIZE; i++)
1381            execmask &= ~(1 << i);
1382         break;
1383
1384      default:
1385         assert( 0 );
1386         return;
1387      }
1388   }
1389
1390   switch (inst->Instruction.Saturate) {
1391   case TGSI_SAT_NONE:
1392      for (i = 0; i < QUAD_SIZE; i++)
1393         if (execmask & (1 << i))
1394            dst->i[i] = chan->i[i];
1395      break;
1396
1397   case TGSI_SAT_ZERO_ONE:
1398      for (i = 0; i < QUAD_SIZE; i++)
1399         if (execmask & (1 << i)) {
1400            if (chan->f[i] < 0.0f)
1401               dst->f[i] = 0.0f;
1402            else if (chan->f[i] > 1.0f)
1403               dst->f[i] = 1.0f;
1404            else
1405               dst->i[i] = chan->i[i];
1406         }
1407      break;
1408
1409   case TGSI_SAT_MINUS_PLUS_ONE:
1410      for (i = 0; i < QUAD_SIZE; i++)
1411         if (execmask & (1 << i)) {
1412            if (chan->f[i] < -1.0f)
1413               dst->f[i] = -1.0f;
1414            else if (chan->f[i] > 1.0f)
1415               dst->f[i] = 1.0f;
1416            else
1417               dst->i[i] = chan->i[i];
1418         }
1419      break;
1420
1421   default:
1422      assert( 0 );
1423   }
1424
1425   if (inst->InstructionExtNv.CondDstUpdate) {
1426      union tgsi_exec_channel *cc = &mach->Temps[TEMP_CC_I].xyzw[TEMP_CC_C];
1427      uint shift;
1428      uint mask;
1429
1430      /* Only CC0 supported.
1431       */
1432      assert( inst->InstructionExtNv.CondDstIndex < 1 );
1433
1434      switch (chan_index) {
1435      case CHAN_X:
1436         shift = TGSI_EXEC_CC_X_SHIFT;
1437         mask = ~TGSI_EXEC_CC_X_MASK;
1438         break;
1439      case CHAN_Y:
1440         shift = TGSI_EXEC_CC_Y_SHIFT;
1441         mask = ~TGSI_EXEC_CC_Y_MASK;
1442         break;
1443      case CHAN_Z:
1444         shift = TGSI_EXEC_CC_Z_SHIFT;
1445         mask = ~TGSI_EXEC_CC_Z_MASK;
1446         break;
1447      case CHAN_W:
1448         shift = TGSI_EXEC_CC_W_SHIFT;
1449         mask = ~TGSI_EXEC_CC_W_MASK;
1450         break;
1451      default:
1452         assert( 0 );
1453         return;
1454      }
1455
1456      for (i = 0; i < QUAD_SIZE; i++)
1457         if (execmask & (1 << i)) {
1458            cc->u[i] &= mask;
1459            if (dst->f[i] < 0.0f)
1460               cc->u[i] |= TGSI_EXEC_CC_LT << shift;
1461            else if (dst->f[i] > 0.0f)
1462               cc->u[i] |= TGSI_EXEC_CC_GT << shift;
1463            else if (dst->f[i] == 0.0f)
1464               cc->u[i] |= TGSI_EXEC_CC_EQ << shift;
1465            else
1466               cc->u[i] |= TGSI_EXEC_CC_UN << shift;
1467         }
1468   }
1469}
1470
/* Fetch channel CHAN of source operand INDEX of the current instruction
 * into VAL.  Relies on variables named 'mach' and 'inst' being in scope
 * at the point of use.
 */
#define FETCH(VAL,INDEX,CHAN)\
    fetch_source (mach, VAL, &inst->FullSrcRegisters[INDEX], CHAN)

/* Store VAL into channel CHAN of destination operand INDEX of the
 * current instruction.  Relies on 'mach' and 'inst' being in scope.
 */
#define STORE(VAL,INDEX,CHAN)\
    store_dest (mach, VAL, &inst->FullDstRegisters[INDEX], inst, CHAN )
1476
1477
1478/**
1479 * Execute ARB-style KIL which is predicated by a src register.
1480 * Kill fragment if any of the four values is less than zero.
1481 */
1482static void
1483exec_kil(struct tgsi_exec_machine *mach,
1484         const struct tgsi_full_instruction *inst)
1485{
1486   uint uniquemask;
1487   uint chan_index;
1488   uint kilmask = 0; /* bit 0 = pixel 0, bit 1 = pixel 1, etc */
1489   union tgsi_exec_channel r[1];
1490
1491   /* This mask stores component bits that were already tested. Note that
1492    * we test if the value is less than zero, so 1.0 and 0.0 need not to be
1493    * tested. */
1494   uniquemask = (1 << TGSI_EXTSWIZZLE_ZERO) | (1 << TGSI_EXTSWIZZLE_ONE);
1495
1496   for (chan_index = 0; chan_index < 4; chan_index++)
1497   {
1498      uint swizzle;
1499      uint i;
1500
1501      /* unswizzle channel */
1502      swizzle = tgsi_util_get_full_src_register_extswizzle (
1503                        &inst->FullSrcRegisters[0],
1504                        chan_index);
1505
1506      /* check if the component has not been already tested */
1507      if (uniquemask & (1 << swizzle))
1508         continue;
1509      uniquemask |= 1 << swizzle;
1510
1511      FETCH(&r[0], 0, chan_index);
1512      for (i = 0; i < 4; i++)
1513         if (r[0].f[i] < 0.0f)
1514            kilmask |= 1 << i;
1515   }
1516
1517   mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] |= kilmask;
1518}
1519
1520/**
1521 * Execute NVIDIA-style KIL which is predicated by a condition code.
1522 * Kill fragment if the condition code is TRUE.
1523 */
1524static void
1525exec_kilp(struct tgsi_exec_machine *mach,
1526          const struct tgsi_full_instruction *inst)
1527{
1528   uint kilmask; /* bit 0 = pixel 0, bit 1 = pixel 1, etc */
1529
1530   if (inst->InstructionExtNv.CondFlowEnable) {
1531      uint swizzle[4];
1532      uint chan_index;
1533
1534      kilmask = 0x0;
1535
1536      swizzle[0] = inst->InstructionExtNv.CondSwizzleX;
1537      swizzle[1] = inst->InstructionExtNv.CondSwizzleY;
1538      swizzle[2] = inst->InstructionExtNv.CondSwizzleZ;
1539      swizzle[3] = inst->InstructionExtNv.CondSwizzleW;
1540
1541      for (chan_index = 0; chan_index < 4; chan_index++)
1542      {
1543         uint i;
1544
1545         for (i = 0; i < 4; i++) {
1546            /* TODO: evaluate the condition code */
1547            if (0)
1548               kilmask |= 1 << i;
1549         }
1550      }
1551   }
1552   else {
1553      /* "unconditional" kil */
1554      kilmask = mach->ExecMask;
1555   }
1556   mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] |= kilmask;
1557}
1558
1559
1560/*
1561 * Fetch a four texture samples using STR texture coordinates.
1562 */
1563static void
1564fetch_texel( struct tgsi_sampler *sampler,
1565             const union tgsi_exec_channel *s,
1566             const union tgsi_exec_channel *t,
1567             const union tgsi_exec_channel *p,
1568             float lodbias,  /* XXX should be float[4] */
1569             union tgsi_exec_channel *r,
1570             union tgsi_exec_channel *g,
1571             union tgsi_exec_channel *b,
1572             union tgsi_exec_channel *a )
1573{
1574   uint j;
1575   float rgba[NUM_CHANNELS][QUAD_SIZE];
1576
1577   sampler->get_samples(sampler, s->f, t->f, p->f, lodbias, rgba);
1578
1579   for (j = 0; j < 4; j++) {
1580      r->f[j] = rgba[0][j];
1581      g->f[j] = rgba[1][j];
1582      b->f[j] = rgba[2][j];
1583      a->f[j] = rgba[3][j];
1584   }
1585}
1586
1587
1588static void
1589exec_tex(struct tgsi_exec_machine *mach,
1590         const struct tgsi_full_instruction *inst,
1591         boolean biasLod,
1592         boolean projected)
1593{
1594   const uint unit = inst->FullSrcRegisters[1].SrcRegister.Index;
1595   union tgsi_exec_channel r[4];
1596   uint chan_index;
1597   float lodBias;
1598
1599   /*   debug_printf("Sampler %u unit %u\n", sampler, unit); */
1600
1601   switch (inst->InstructionExtTexture.Texture) {
1602   case TGSI_TEXTURE_1D:
1603   case TGSI_TEXTURE_SHADOW1D:
1604
1605      FETCH(&r[0], 0, CHAN_X);
1606
1607      if (projected) {
1608         FETCH(&r[1], 0, CHAN_W);
1609         micro_div( &r[0], &r[0], &r[1] );
1610      }
1611
1612      if (biasLod) {
1613         FETCH(&r[1], 0, CHAN_W);
1614         lodBias = r[2].f[0];
1615      }
1616      else
1617         lodBias = 0.0;
1618
1619      fetch_texel(mach->Samplers[unit],
1620                  &r[0], NULL, NULL, lodBias,  /* S, T, P, BIAS */
1621                  &r[0], &r[1], &r[2], &r[3]); /* R, G, B, A */
1622      break;
1623
1624   case TGSI_TEXTURE_2D:
1625   case TGSI_TEXTURE_RECT:
1626   case TGSI_TEXTURE_SHADOW2D:
1627   case TGSI_TEXTURE_SHADOWRECT:
1628
1629      FETCH(&r[0], 0, CHAN_X);
1630      FETCH(&r[1], 0, CHAN_Y);
1631      FETCH(&r[2], 0, CHAN_Z);
1632
1633      if (projected) {
1634         FETCH(&r[3], 0, CHAN_W);
1635         micro_div( &r[0], &r[0], &r[3] );
1636         micro_div( &r[1], &r[1], &r[3] );
1637         micro_div( &r[2], &r[2], &r[3] );
1638      }
1639
1640      if (biasLod) {
1641         FETCH(&r[3], 0, CHAN_W);
1642         lodBias = r[3].f[0];
1643      }
1644      else
1645         lodBias = 0.0;
1646
1647      fetch_texel(mach->Samplers[unit],
1648                  &r[0], &r[1], &r[2], lodBias,  /* inputs */
1649                  &r[0], &r[1], &r[2], &r[3]);  /* outputs */
1650      break;
1651
1652   case TGSI_TEXTURE_3D:
1653   case TGSI_TEXTURE_CUBE:
1654
1655      FETCH(&r[0], 0, CHAN_X);
1656      FETCH(&r[1], 0, CHAN_Y);
1657      FETCH(&r[2], 0, CHAN_Z);
1658
1659      if (projected) {
1660         FETCH(&r[3], 0, CHAN_W);
1661         micro_div( &r[0], &r[0], &r[3] );
1662         micro_div( &r[1], &r[1], &r[3] );
1663         micro_div( &r[2], &r[2], &r[3] );
1664      }
1665
1666      if (biasLod) {
1667         FETCH(&r[3], 0, CHAN_W);
1668         lodBias = r[3].f[0];
1669      }
1670      else
1671         lodBias = 0.0;
1672
1673      fetch_texel(mach->Samplers[unit],
1674                  &r[0], &r[1], &r[2], lodBias,
1675                  &r[0], &r[1], &r[2], &r[3]);
1676      break;
1677
1678   default:
1679      assert (0);
1680   }
1681
1682   FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
1683      STORE( &r[chan_index], 0, chan_index );
1684   }
1685}
1686
1687
1688/**
1689 * Evaluate a constant-valued coefficient at the position of the
1690 * current quad.
1691 */
1692static void
1693eval_constant_coef(
1694   struct tgsi_exec_machine *mach,
1695   unsigned attrib,
1696   unsigned chan )
1697{
1698   unsigned i;
1699
1700   for( i = 0; i < QUAD_SIZE; i++ ) {
1701      mach->Inputs[attrib].xyzw[chan].f[i] = mach->InterpCoefs[attrib].a0[chan];
1702   }
1703}
1704
1705/**
1706 * Evaluate a linear-valued coefficient at the position of the
1707 * current quad.
1708 */
1709static void
1710eval_linear_coef(
1711   struct tgsi_exec_machine *mach,
1712   unsigned attrib,
1713   unsigned chan )
1714{
1715   const float x = mach->QuadPos.xyzw[0].f[0];
1716   const float y = mach->QuadPos.xyzw[1].f[0];
1717   const float dadx = mach->InterpCoefs[attrib].dadx[chan];
1718   const float dady = mach->InterpCoefs[attrib].dady[chan];
1719   const float a0 = mach->InterpCoefs[attrib].a0[chan] + dadx * x + dady * y;
1720   mach->Inputs[attrib].xyzw[chan].f[0] = a0;
1721   mach->Inputs[attrib].xyzw[chan].f[1] = a0 + dadx;
1722   mach->Inputs[attrib].xyzw[chan].f[2] = a0 + dady;
1723   mach->Inputs[attrib].xyzw[chan].f[3] = a0 + dadx + dady;
1724}
1725
1726/**
1727 * Evaluate a perspective-valued coefficient at the position of the
1728 * current quad.
1729 */
1730static void
1731eval_perspective_coef(
1732   struct tgsi_exec_machine *mach,
1733   unsigned attrib,
1734   unsigned chan )
1735{
1736   const float x = mach->QuadPos.xyzw[0].f[0];
1737   const float y = mach->QuadPos.xyzw[1].f[0];
1738   const float dadx = mach->InterpCoefs[attrib].dadx[chan];
1739   const float dady = mach->InterpCoefs[attrib].dady[chan];
1740   const float a0 = mach->InterpCoefs[attrib].a0[chan] + dadx * x + dady * y;
1741   const float *w = mach->QuadPos.xyzw[3].f;
1742   /* divide by W here */
1743   mach->Inputs[attrib].xyzw[chan].f[0] = a0 / w[0];
1744   mach->Inputs[attrib].xyzw[chan].f[1] = (a0 + dadx) / w[1];
1745   mach->Inputs[attrib].xyzw[chan].f[2] = (a0 + dady) / w[2];
1746   mach->Inputs[attrib].xyzw[chan].f[3] = (a0 + dadx + dady) / w[3];
1747}
1748
1749
/**
 * Signature shared by the eval_*_coef() functions: evaluate channel
 * 'chan' of input attribute 'attrib' for the current quad and write
 * the result into the machine's Inputs.
 */
typedef void (* eval_coef_func)(
   struct tgsi_exec_machine *mach,
   unsigned attrib,
   unsigned chan );
1754
1755static void
1756exec_declaration(
1757   struct tgsi_exec_machine *mach,
1758   const struct tgsi_full_declaration *decl )
1759{
1760   if( mach->Processor == TGSI_PROCESSOR_FRAGMENT ) {
1761      if( decl->Declaration.File == TGSI_FILE_INPUT ) {
1762         unsigned first, last, mask;
1763         eval_coef_func eval;
1764
1765         first = decl->DeclarationRange.First;
1766         last = decl->DeclarationRange.Last;
1767         mask = decl->Declaration.UsageMask;
1768
1769         switch( decl->Declaration.Interpolate ) {
1770         case TGSI_INTERPOLATE_CONSTANT:
1771            eval = eval_constant_coef;
1772            break;
1773
1774         case TGSI_INTERPOLATE_LINEAR:
1775            eval = eval_linear_coef;
1776            break;
1777
1778         case TGSI_INTERPOLATE_PERSPECTIVE:
1779            eval = eval_perspective_coef;
1780            break;
1781
1782         default:
1783            eval = NULL;
1784            assert( 0 );
1785         }
1786
1787         if( mask == TGSI_WRITEMASK_XYZW ) {
1788            unsigned i, j;
1789
1790            for( i = first; i <= last; i++ ) {
1791               for( j = 0; j < NUM_CHANNELS; j++ ) {
1792                  eval( mach, i, j );
1793               }
1794            }
1795         }
1796         else {
1797            unsigned i, j;
1798
1799            for( j = 0; j < NUM_CHANNELS; j++ ) {
1800               if( mask & (1 << j) ) {
1801                  for( i = first; i <= last; i++ ) {
1802                     eval( mach, i, j );
1803                  }
1804               }
1805            }
1806         }
1807      }
1808   }
1809}
1810
1811static void
1812exec_instruction(
1813   struct tgsi_exec_machine *mach,
1814   const struct tgsi_full_instruction *inst,
1815   int *pc )
1816{
1817   uint chan_index;
1818   union tgsi_exec_channel r[8];
1819
1820   (*pc)++;
1821
1822   switch (inst->Instruction.Opcode) {
1823   case TGSI_OPCODE_ARL:
1824   /* TGSI_OPCODE_FLOOR */
1825   /* TGSI_OPCODE_FLR */
1826      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
1827         FETCH( &r[0], 0, chan_index );
1828         micro_flr( &r[0], &r[0] );
1829         STORE( &r[0], 0, chan_index );
1830      }
1831      break;
1832
1833   case TGSI_OPCODE_MOV:
1834   case TGSI_OPCODE_SWZ:
1835      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
1836         FETCH( &r[0], 0, chan_index );
1837         STORE( &r[0], 0, chan_index );
1838      }
1839      break;
1840
1841   case TGSI_OPCODE_LIT:
1842      if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) {
1843         STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_X );
1844      }
1845
1846      if (IS_CHANNEL_ENABLED( *inst, CHAN_Y ) || IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1847         FETCH( &r[0], 0, CHAN_X );
1848         if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1849            micro_max( &r[0], &r[0], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] );
1850            STORE( &r[0], 0, CHAN_Y );
1851         }
1852
1853         if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1854            FETCH( &r[1], 0, CHAN_Y );
1855            micro_max( &r[1], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] );
1856
1857            FETCH( &r[2], 0, CHAN_W );
1858            micro_min( &r[2], &r[2], &mach->Temps[TEMP_128_I].xyzw[TEMP_128_C] );
1859            micro_max( &r[2], &r[2], &mach->Temps[TEMP_M128_I].xyzw[TEMP_M128_C] );
1860            micro_pow( &r[1], &r[1], &r[2] );
1861            micro_lt( &r[0], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &r[0], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] );
1862            STORE( &r[0], 0, CHAN_Z );
1863         }
1864      }
1865
1866      if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
1867         STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
1868      }
1869      break;
1870
1871   case TGSI_OPCODE_RCP:
1872   /* TGSI_OPCODE_RECIP */
1873      FETCH( &r[0], 0, CHAN_X );
1874      micro_div( &r[0], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &r[0] );
1875      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
1876         STORE( &r[0], 0, chan_index );
1877      }
1878      break;
1879
1880   case TGSI_OPCODE_RSQ:
1881   /* TGSI_OPCODE_RECIPSQRT */
1882      FETCH( &r[0], 0, CHAN_X );
1883      micro_abs( &r[0], &r[0] );
1884      micro_sqrt( &r[0], &r[0] );
1885      micro_div( &r[0], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &r[0] );
1886      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
1887         STORE( &r[0], 0, chan_index );
1888      }
1889      break;
1890
1891   case TGSI_OPCODE_EXP:
1892      FETCH( &r[0], 0, CHAN_X );
1893      micro_flr( &r[1], &r[0] );  /* r1 = floor(r0) */
1894      if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) {
1895         micro_exp2( &r[2], &r[1] );       /* r2 = 2 ^ r1 */
1896         STORE( &r[2], 0, CHAN_X );        /* store r2 */
1897      }
1898      if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1899         micro_sub( &r[2], &r[0], &r[1] ); /* r2 = r0 - r1 */
1900         STORE( &r[2], 0, CHAN_Y );        /* store r2 */
1901      }
1902      if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1903         micro_exp2( &r[2], &r[0] );       /* r2 = 2 ^ r0 */
1904         STORE( &r[2], 0, CHAN_Z );        /* store r2 */
1905      }
1906      if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
1907         STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
1908      }
1909      break;
1910
1911   case TGSI_OPCODE_LOG:
1912      FETCH( &r[0], 0, CHAN_X );
1913      micro_abs( &r[2], &r[0] );  /* r2 = abs(r0) */
1914      micro_lg2( &r[1], &r[2] );  /* r1 = lg2(r2) */
1915      micro_flr( &r[0], &r[1] );  /* r0 = floor(r1) */
1916      if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) {
1917         STORE( &r[0], 0, CHAN_X );
1918      }
1919      if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1920         micro_exp2( &r[0], &r[0] );       /* r0 = 2 ^ r0 */
1921         micro_div( &r[0], &r[2], &r[0] ); /* r0 = r2 / r0 */
1922         STORE( &r[0], 0, CHAN_Y );
1923      }
1924      if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1925         STORE( &r[1], 0, CHAN_Z );
1926      }
1927      if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
1928         STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
1929      }
1930      break;
1931
1932   case TGSI_OPCODE_MUL:
1933      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index )
1934      {
1935         FETCH(&r[0], 0, chan_index);
1936         FETCH(&r[1], 1, chan_index);
1937
1938         micro_mul( &r[0], &r[0], &r[1] );
1939
1940         STORE(&r[0], 0, chan_index);
1941      }
1942      break;
1943
1944   case TGSI_OPCODE_ADD:
1945      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
1946         FETCH( &r[0], 0, chan_index );
1947         FETCH( &r[1], 1, chan_index );
1948         micro_add( &r[0], &r[0], &r[1] );
1949         STORE( &r[0], 0, chan_index );
1950      }
1951      break;
1952
1953   case TGSI_OPCODE_DP3:
1954   /* TGSI_OPCODE_DOT3 */
1955      FETCH( &r[0], 0, CHAN_X );
1956      FETCH( &r[1], 1, CHAN_X );
1957      micro_mul( &r[0], &r[0], &r[1] );
1958
1959      FETCH( &r[1], 0, CHAN_Y );
1960      FETCH( &r[2], 1, CHAN_Y );
1961      micro_mul( &r[1], &r[1], &r[2] );
1962      micro_add( &r[0], &r[0], &r[1] );
1963
1964      FETCH( &r[1], 0, CHAN_Z );
1965      FETCH( &r[2], 1, CHAN_Z );
1966      micro_mul( &r[1], &r[1], &r[2] );
1967      micro_add( &r[0], &r[0], &r[1] );
1968
1969      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
1970         STORE( &r[0], 0, chan_index );
1971      }
1972      break;
1973
1974    case TGSI_OPCODE_DP4:
1975    /* TGSI_OPCODE_DOT4 */
1976       FETCH(&r[0], 0, CHAN_X);
1977       FETCH(&r[1], 1, CHAN_X);
1978
1979       micro_mul( &r[0], &r[0], &r[1] );
1980
1981       FETCH(&r[1], 0, CHAN_Y);
1982       FETCH(&r[2], 1, CHAN_Y);
1983
1984       micro_mul( &r[1], &r[1], &r[2] );
1985       micro_add( &r[0], &r[0], &r[1] );
1986
1987       FETCH(&r[1], 0, CHAN_Z);
1988       FETCH(&r[2], 1, CHAN_Z);
1989
1990       micro_mul( &r[1], &r[1], &r[2] );
1991       micro_add( &r[0], &r[0], &r[1] );
1992
1993       FETCH(&r[1], 0, CHAN_W);
1994       FETCH(&r[2], 1, CHAN_W);
1995
1996       micro_mul( &r[1], &r[1], &r[2] );
1997       micro_add( &r[0], &r[0], &r[1] );
1998
1999      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2000         STORE( &r[0], 0, chan_index );
2001      }
2002      break;
2003
2004   case TGSI_OPCODE_DST:
2005      if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) {
2006         STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_X );
2007      }
2008
2009      if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
2010         FETCH( &r[0], 0, CHAN_Y );
2011         FETCH( &r[1], 1, CHAN_Y);
2012         micro_mul( &r[0], &r[0], &r[1] );
2013         STORE( &r[0], 0, CHAN_Y );
2014      }
2015
2016      if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
2017         FETCH( &r[0], 0, CHAN_Z );
2018         STORE( &r[0], 0, CHAN_Z );
2019      }
2020
2021      if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
2022         FETCH( &r[0], 1, CHAN_W );
2023         STORE( &r[0], 0, CHAN_W );
2024      }
2025      break;
2026
2027   case TGSI_OPCODE_MIN:
2028      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2029         FETCH(&r[0], 0, chan_index);
2030         FETCH(&r[1], 1, chan_index);
2031
2032         /* XXX use micro_min()?? */
2033         micro_lt( &r[0], &r[0], &r[1], &r[0], &r[1] );
2034
2035         STORE(&r[0], 0, chan_index);
2036      }
2037      break;
2038
2039   case TGSI_OPCODE_MAX:
2040      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2041         FETCH(&r[0], 0, chan_index);
2042         FETCH(&r[1], 1, chan_index);
2043
2044         /* XXX use micro_max()?? */
2045         micro_lt( &r[0], &r[0], &r[1], &r[1], &r[0] );
2046
2047         STORE(&r[0], 0, chan_index );
2048      }
2049      break;
2050
2051   case TGSI_OPCODE_SLT:
2052   /* TGSI_OPCODE_SETLT */
2053      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2054         FETCH( &r[0], 0, chan_index );
2055         FETCH( &r[1], 1, chan_index );
2056         micro_lt( &r[0], &r[0], &r[1], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] );
2057         STORE( &r[0], 0, chan_index );
2058      }
2059      break;
2060
2061   case TGSI_OPCODE_SGE:
2062   /* TGSI_OPCODE_SETGE */
2063      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2064         FETCH( &r[0], 0, chan_index );
2065         FETCH( &r[1], 1, chan_index );
2066         micro_le( &r[0], &r[1], &r[0], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] );
2067         STORE( &r[0], 0, chan_index );
2068      }
2069      break;
2070
2071   case TGSI_OPCODE_MAD:
2072   /* TGSI_OPCODE_MADD */
2073      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2074         FETCH( &r[0], 0, chan_index );
2075         FETCH( &r[1], 1, chan_index );
2076         micro_mul( &r[0], &r[0], &r[1] );
2077         FETCH( &r[1], 2, chan_index );
2078         micro_add( &r[0], &r[0], &r[1] );
2079         STORE( &r[0], 0, chan_index );
2080      }
2081      break;
2082
2083   case TGSI_OPCODE_SUB:
2084      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2085         FETCH(&r[0], 0, chan_index);
2086         FETCH(&r[1], 1, chan_index);
2087
2088         micro_sub( &r[0], &r[0], &r[1] );
2089
2090         STORE(&r[0], 0, chan_index);
2091      }
2092      break;
2093
2094   case TGSI_OPCODE_LERP:
2095   /* TGSI_OPCODE_LRP */
2096      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2097         FETCH(&r[0], 0, chan_index);
2098         FETCH(&r[1], 1, chan_index);
2099         FETCH(&r[2], 2, chan_index);
2100
2101         micro_sub( &r[1], &r[1], &r[2] );
2102         micro_mul( &r[0], &r[0], &r[1] );
2103         micro_add( &r[0], &r[0], &r[2] );
2104
2105         STORE(&r[0], 0, chan_index);
2106      }
2107      break;
2108
2109   case TGSI_OPCODE_CND:
2110      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2111         FETCH(&r[0], 0, chan_index);
2112         FETCH(&r[1], 1, chan_index);
2113         FETCH(&r[2], 2, chan_index);
2114         micro_lt(&r[0], &mach->Temps[TEMP_HALF_I].xyzw[TEMP_HALF_C], &r[2], &r[0], &r[1]);
2115         STORE(&r[0], 0, chan_index);
2116      }
2117      break;
2118
2119   case TGSI_OPCODE_CND0:
2120      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2121         FETCH(&r[0], 0, chan_index);
2122         FETCH(&r[1], 1, chan_index);
2123         FETCH(&r[2], 2, chan_index);
2124         micro_le(&r[0], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &r[2], &r[0], &r[1]);
2125         STORE(&r[0], 0, chan_index);
2126      }
2127      break;
2128
2129   case TGSI_OPCODE_DOT2ADD:
2130   /* TGSI_OPCODE_DP2A */
2131      FETCH( &r[0], 0, CHAN_X );
2132      FETCH( &r[1], 1, CHAN_X );
2133      micro_mul( &r[0], &r[0], &r[1] );
2134
2135      FETCH( &r[1], 0, CHAN_Y );
2136      FETCH( &r[2], 1, CHAN_Y );
2137      micro_mul( &r[1], &r[1], &r[2] );
2138      micro_add( &r[0], &r[0], &r[1] );
2139
2140      FETCH( &r[2], 2, CHAN_X );
2141      micro_add( &r[0], &r[0], &r[2] );
2142
2143      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2144         STORE( &r[0], 0, chan_index );
2145      }
2146      break;
2147
2148   case TGSI_OPCODE_INDEX:
2149      /* XXX: considered for removal */
2150      assert (0);
2151      break;
2152
2153   case TGSI_OPCODE_NEGATE:
2154      /* XXX: considered for removal */
2155      assert (0);
2156      break;
2157
2158   case TGSI_OPCODE_FRAC:
2159   /* TGSI_OPCODE_FRC */
2160      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2161         FETCH( &r[0], 0, chan_index );
2162         micro_frc( &r[0], &r[0] );
2163         STORE( &r[0], 0, chan_index );
2164      }
2165      break;
2166
2167   case TGSI_OPCODE_CLAMP:
2168      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2169         FETCH(&r[0], 0, chan_index);
2170         FETCH(&r[1], 1, chan_index);
2171         micro_max(&r[0], &r[0], &r[1]);
2172         FETCH(&r[1], 2, chan_index);
2173         micro_min(&r[0], &r[0], &r[1]);
2174         STORE(&r[0], 0, chan_index);
2175      }
2176      break;
2177
2178   case TGSI_OPCODE_ROUND:
2179   case TGSI_OPCODE_ARR:
2180      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2181         FETCH( &r[0], 0, chan_index );
2182         micro_rnd( &r[0], &r[0] );
2183         STORE( &r[0], 0, chan_index );
2184      }
2185      break;
2186
2187   case TGSI_OPCODE_EXPBASE2:
2188   /* TGSI_OPCODE_EX2 */
2189      FETCH(&r[0], 0, CHAN_X);
2190
2191#if FAST_MATH
2192      micro_exp2( &r[0], &r[0] );
2193#else
2194      micro_pow( &r[0], &mach->Temps[TEMP_2_I].xyzw[TEMP_2_C], &r[0] );
2195#endif
2196
2197      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2198         STORE( &r[0], 0, chan_index );
2199      }
2200      break;
2201
2202   case TGSI_OPCODE_LOGBASE2:
2203   /* TGSI_OPCODE_LG2 */
2204      FETCH( &r[0], 0, CHAN_X );
2205      micro_lg2( &r[0], &r[0] );
2206      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2207         STORE( &r[0], 0, chan_index );
2208      }
2209      break;
2210
2211   case TGSI_OPCODE_POWER:
2212   /* TGSI_OPCODE_POW */
2213      FETCH(&r[0], 0, CHAN_X);
2214      FETCH(&r[1], 1, CHAN_X);
2215
2216      micro_pow( &r[0], &r[0], &r[1] );
2217
2218      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2219         STORE( &r[0], 0, chan_index );
2220      }
2221      break;
2222
2223   case TGSI_OPCODE_CROSSPRODUCT:
2224   /* TGSI_OPCODE_XPD */
2225      FETCH(&r[0], 0, CHAN_Y);
2226      FETCH(&r[1], 1, CHAN_Z);
2227
2228      micro_mul( &r[2], &r[0], &r[1] );
2229
2230      FETCH(&r[3], 0, CHAN_Z);
2231      FETCH(&r[4], 1, CHAN_Y);
2232
2233      micro_mul( &r[5], &r[3], &r[4] );
2234      micro_sub( &r[2], &r[2], &r[5] );
2235
2236      if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) {
2237         STORE( &r[2], 0, CHAN_X );
2238      }
2239
2240      FETCH(&r[2], 1, CHAN_X);
2241
2242      micro_mul( &r[3], &r[3], &r[2] );
2243
2244      FETCH(&r[5], 0, CHAN_X);
2245
2246      micro_mul( &r[1], &r[1], &r[5] );
2247      micro_sub( &r[3], &r[3], &r[1] );
2248
2249      if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
2250         STORE( &r[3], 0, CHAN_Y );
2251      }
2252
2253      micro_mul( &r[5], &r[5], &r[4] );
2254      micro_mul( &r[0], &r[0], &r[2] );
2255      micro_sub( &r[5], &r[5], &r[0] );
2256
2257      if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
2258         STORE( &r[5], 0, CHAN_Z );
2259      }
2260
2261      if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
2262         STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
2263      }
2264      break;
2265
2266    case TGSI_OPCODE_MULTIPLYMATRIX:
2267       /* XXX: considered for removal */
2268       assert (0);
2269       break;
2270
2271    case TGSI_OPCODE_ABS:
2272       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2273          FETCH(&r[0], 0, chan_index);
2274
2275          micro_abs( &r[0], &r[0] );
2276
2277          STORE(&r[0], 0, chan_index);
2278       }
2279       break;
2280
2281   case TGSI_OPCODE_RCC:
2282      assert (0);
2283      break;
2284
2285   case TGSI_OPCODE_DPH:
2286      FETCH(&r[0], 0, CHAN_X);
2287      FETCH(&r[1], 1, CHAN_X);
2288
2289      micro_mul( &r[0], &r[0], &r[1] );
2290
2291      FETCH(&r[1], 0, CHAN_Y);
2292      FETCH(&r[2], 1, CHAN_Y);
2293
2294      micro_mul( &r[1], &r[1], &r[2] );
2295      micro_add( &r[0], &r[0], &r[1] );
2296
2297      FETCH(&r[1], 0, CHAN_Z);
2298      FETCH(&r[2], 1, CHAN_Z);
2299
2300      micro_mul( &r[1], &r[1], &r[2] );
2301      micro_add( &r[0], &r[0], &r[1] );
2302
2303      FETCH(&r[1], 1, CHAN_W);
2304
2305      micro_add( &r[0], &r[0], &r[1] );
2306
2307      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2308         STORE( &r[0], 0, chan_index );
2309      }
2310      break;
2311
2312   case TGSI_OPCODE_COS:
2313      FETCH(&r[0], 0, CHAN_X);
2314
2315      micro_cos( &r[0], &r[0] );
2316
2317      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2318         STORE( &r[0], 0, chan_index );
2319      }
2320      break;
2321
2322   case TGSI_OPCODE_DDX:
2323      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2324         FETCH( &r[0], 0, chan_index );
2325         micro_ddx( &r[0], &r[0] );
2326         STORE( &r[0], 0, chan_index );
2327      }
2328      break;
2329
2330   case TGSI_OPCODE_DDY:
2331      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2332         FETCH( &r[0], 0, chan_index );
2333         micro_ddy( &r[0], &r[0] );
2334         STORE( &r[0], 0, chan_index );
2335      }
2336      break;
2337
2338   case TGSI_OPCODE_KILP:
2339      exec_kilp (mach, inst);
2340      break;
2341
2342   case TGSI_OPCODE_KIL:
2343      exec_kil (mach, inst);
2344      break;
2345
2346   case TGSI_OPCODE_PK2H:
2347      assert (0);
2348      break;
2349
2350   case TGSI_OPCODE_PK2US:
2351      assert (0);
2352      break;
2353
2354   case TGSI_OPCODE_PK4B:
2355      assert (0);
2356      break;
2357
2358   case TGSI_OPCODE_PK4UB:
2359      assert (0);
2360      break;
2361
2362   case TGSI_OPCODE_RFL:
2363      assert (0);
2364      break;
2365
2366   case TGSI_OPCODE_SEQ:
2367      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2368         FETCH( &r[0], 0, chan_index );
2369         FETCH( &r[1], 1, chan_index );
2370         micro_eq( &r[0], &r[0], &r[1],
2371                   &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C],
2372                   &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] );
2373         STORE( &r[0], 0, chan_index );
2374      }
2375      break;
2376
2377   case TGSI_OPCODE_SFL:
2378      assert (0);
2379      break;
2380
2381   case TGSI_OPCODE_SGT:
2382      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2383         FETCH( &r[0], 0, chan_index );
2384         FETCH( &r[1], 1, chan_index );
2385         micro_le( &r[0], &r[0], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C] );
2386         STORE( &r[0], 0, chan_index );
2387      }
2388      break;
2389
2390   case TGSI_OPCODE_SIN:
2391      FETCH( &r[0], 0, CHAN_X );
2392      micro_sin( &r[0], &r[0] );
2393      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2394         STORE( &r[0], 0, chan_index );
2395      }
2396      break;
2397
2398   case TGSI_OPCODE_SLE:
2399      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2400         FETCH( &r[0], 0, chan_index );
2401         FETCH( &r[1], 1, chan_index );
2402         micro_le( &r[0], &r[0], &r[1], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] );
2403         STORE( &r[0], 0, chan_index );
2404      }
2405      break;
2406
2407   case TGSI_OPCODE_SNE:
2408      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2409         FETCH( &r[0], 0, chan_index );
2410         FETCH( &r[1], 1, chan_index );
2411         micro_eq( &r[0], &r[0], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C] );
2412         STORE( &r[0], 0, chan_index );
2413      }
2414      break;
2415
2416   case TGSI_OPCODE_STR:
2417      assert (0);
2418      break;
2419
2420   case TGSI_OPCODE_TEX:
2421      /* simple texture lookup */
2422      /* src[0] = texcoord */
2423      /* src[1] = sampler unit */
2424      exec_tex(mach, inst, FALSE, FALSE);
2425      break;
2426
2427   case TGSI_OPCODE_TXB:
2428      /* Texture lookup with lod bias */
2429      /* src[0] = texcoord (src[0].w = LOD bias) */
2430      /* src[1] = sampler unit */
2431      exec_tex(mach, inst, TRUE, FALSE);
2432      break;
2433
2434   case TGSI_OPCODE_TXD:
2435      /* Texture lookup with explicit partial derivatives */
2436      /* src[0] = texcoord */
2437      /* src[1] = d[strq]/dx */
2438      /* src[2] = d[strq]/dy */
2439      /* src[3] = sampler unit */
2440      assert (0);
2441      break;
2442
2443   case TGSI_OPCODE_TXL:
2444      /* Texture lookup with explicit LOD */
2445      /* src[0] = texcoord (src[0].w = LOD) */
2446      /* src[1] = sampler unit */
2447      exec_tex(mach, inst, TRUE, FALSE);
2448      break;
2449
2450   case TGSI_OPCODE_TXP:
2451      /* Texture lookup with projection */
2452      /* src[0] = texcoord (src[0].w = projection) */
2453      /* src[1] = sampler unit */
2454      exec_tex(mach, inst, FALSE, TRUE);
2455      break;
2456
2457   case TGSI_OPCODE_UP2H:
2458      assert (0);
2459      break;
2460
2461   case TGSI_OPCODE_UP2US:
2462      assert (0);
2463      break;
2464
2465   case TGSI_OPCODE_UP4B:
2466      assert (0);
2467      break;
2468
2469   case TGSI_OPCODE_UP4UB:
2470      assert (0);
2471      break;
2472
2473   case TGSI_OPCODE_X2D:
2474      assert (0);
2475      break;
2476
2477   case TGSI_OPCODE_ARA:
2478      assert (0);
2479      break;
2480
2481   case TGSI_OPCODE_BRA:
2482      assert (0);
2483      break;
2484
2485   case TGSI_OPCODE_CAL:
2486      /* skip the call if no execution channels are enabled */
2487      if (mach->ExecMask) {
2488         /* do the call */
2489
2490         /* push the Cond, Loop, Cont stacks */
2491         assert(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING);
2492         mach->CondStack[mach->CondStackTop++] = mach->CondMask;
2493         assert(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
2494         mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask;
2495         assert(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
2496         mach->ContStack[mach->ContStackTop++] = mach->ContMask;
2497
2498         assert(mach->FuncStackTop < TGSI_EXEC_MAX_CALL_NESTING);
2499         mach->FuncStack[mach->FuncStackTop++] = mach->FuncMask;
2500
2501         /* note that PC was already incremented above */
2502         mach->CallStack[mach->CallStackTop++] = *pc;
2503         *pc = inst->InstructionExtLabel.Label;
2504      }
2505      break;
2506
2507   case TGSI_OPCODE_RET:
2508      mach->FuncMask &= ~mach->ExecMask;
2509      UPDATE_EXEC_MASK(mach);
2510
2511      if (mach->FuncMask == 0x0) {
2512         /* really return now (otherwise, keep executing) */
2513
2514         if (mach->CallStackTop == 0) {
2515            /* returning from main() */
2516            *pc = -1;
2517            return;
2518         }
2519         *pc = mach->CallStack[--mach->CallStackTop];
2520
2521         /* pop the Cond, Loop, Cont stacks */
2522         assert(mach->CondStackTop > 0);
2523         mach->CondMask = mach->CondStack[--mach->CondStackTop];
2524         assert(mach->LoopStackTop > 0);
2525         mach->LoopMask = mach->LoopStack[--mach->LoopStackTop];
2526         assert(mach->ContStackTop > 0);
2527         mach->ContMask = mach->ContStack[--mach->ContStackTop];
2528         assert(mach->FuncStackTop > 0);
2529         mach->FuncMask = mach->FuncStack[--mach->FuncStackTop];
2530
2531         UPDATE_EXEC_MASK(mach);
2532      }
2533      break;
2534
2535   case TGSI_OPCODE_SSG:
2536   /* TGSI_OPCODE_SGN */
2537      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2538         FETCH( &r[0], 0, chan_index );
2539         micro_sgn( &r[0], &r[0] );
2540         STORE( &r[0], 0, chan_index );
2541      }
2542      break;
2543
2544   case TGSI_OPCODE_CMP:
2545      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2546         FETCH(&r[0], 0, chan_index);
2547         FETCH(&r[1], 1, chan_index);
2548         FETCH(&r[2], 2, chan_index);
2549
2550         micro_lt( &r[0], &r[0], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &r[1], &r[2] );
2551
2552         STORE(&r[0], 0, chan_index);
2553      }
2554      break;
2555
2556   case TGSI_OPCODE_SCS:
2557      if( IS_CHANNEL_ENABLED( *inst, CHAN_X ) || IS_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
2558         FETCH( &r[0], 0, CHAN_X );
2559      }
2560      if( IS_CHANNEL_ENABLED( *inst, CHAN_X ) ) {
2561         micro_cos( &r[1], &r[0] );
2562         STORE( &r[1], 0, CHAN_X );
2563      }
2564      if( IS_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
2565         micro_sin( &r[1], &r[0] );
2566         STORE( &r[1], 0, CHAN_Y );
2567      }
2568      if( IS_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
2569         STORE( &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], 0, CHAN_Z );
2570      }
2571      if( IS_CHANNEL_ENABLED( *inst, CHAN_W ) ) {
2572         STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
2573      }
2574      break;
2575
2576   case TGSI_OPCODE_NRM:
2577      /* 3-component vector normalize */
2578      {
2579         union tgsi_exec_channel tmp, dot;
2580
2581         /* tmp = dp3(src0, src0): */
2582         FETCH( &r[0], 0, CHAN_X );
2583         micro_mul( &tmp, &r[0], &r[0] );
2584
2585         FETCH( &r[1], 0, CHAN_Y );
2586         micro_mul( &dot, &r[1], &r[1] );
2587         micro_add( &tmp, &tmp, &dot );
2588
2589         FETCH( &r[2], 0, CHAN_Z );
2590         micro_mul( &dot, &r[2], &r[2] );
2591         micro_add( &tmp, &tmp, &dot );
2592
2593         /* tmp = 1 / sqrt(tmp) */
2594         micro_sqrt( &tmp, &tmp );
2595         micro_div( &tmp, &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &tmp );
2596
2597         /* note: w channel is undefined */
2598         FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2599            /* chan = chan * tmp */
2600            micro_mul( &r[chan_index], &tmp, &r[chan_index] );
2601            STORE( &r[chan_index], 0, chan_index );
2602         }
2603      }
2604      break;
2605
2606   case TGSI_OPCODE_NRM4:
2607      /* 4-component vector normalize */
2608      {
2609         union tgsi_exec_channel tmp, dot;
2610
2611         /* tmp = dp4(src0, src0): */
2612         FETCH( &r[0], 0, CHAN_X );
2613         micro_mul( &tmp, &r[0], &r[0] );
2614
2615         FETCH( &r[1], 0, CHAN_Y );
2616         micro_mul( &dot, &r[1], &r[1] );
2617         micro_add( &tmp, &tmp, &dot );
2618
2619         FETCH( &r[2], 0, CHAN_Z );
2620         micro_mul( &dot, &r[2], &r[2] );
2621         micro_add( &tmp, &tmp, &dot );
2622
2623         FETCH( &r[3], 0, CHAN_W );
2624         micro_mul( &dot, &r[3], &r[3] );
2625         micro_add( &tmp, &tmp, &dot );
2626
2627         /* tmp = 1 / sqrt(tmp) */
2628         micro_sqrt( &tmp, &tmp );
2629         micro_div( &tmp, &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &tmp );
2630
2631         FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2632            /* chan = chan * tmp */
2633            micro_mul( &r[chan_index], &tmp, &r[chan_index] );
2634            STORE( &r[chan_index], 0, chan_index );
2635         }
2636      }
2637      break;
2638
2639   case TGSI_OPCODE_DIV:
2640      assert( 0 );
2641      break;
2642
2643   case TGSI_OPCODE_DP2:
2644      FETCH( &r[0], 0, CHAN_X );
2645      FETCH( &r[1], 1, CHAN_X );
2646      micro_mul( &r[0], &r[0], &r[1] );
2647
2648      FETCH( &r[1], 0, CHAN_Y );
2649      FETCH( &r[2], 1, CHAN_Y );
2650      micro_mul( &r[1], &r[1], &r[2] );
2651      micro_add( &r[0], &r[0], &r[1] );
2652
2653      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2654         STORE( &r[0], 0, chan_index );
2655      }
2656      break;
2657
2658   case TGSI_OPCODE_IF:
2659      /* push CondMask */
2660      assert(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING);
2661      mach->CondStack[mach->CondStackTop++] = mach->CondMask;
2662      FETCH( &r[0], 0, CHAN_X );
2663      /* update CondMask */
2664      if( ! r[0].u[0] ) {
2665         mach->CondMask &= ~0x1;
2666      }
2667      if( ! r[0].u[1] ) {
2668         mach->CondMask &= ~0x2;
2669      }
2670      if( ! r[0].u[2] ) {
2671         mach->CondMask &= ~0x4;
2672      }
2673      if( ! r[0].u[3] ) {
2674         mach->CondMask &= ~0x8;
2675      }
2676      UPDATE_EXEC_MASK(mach);
2677      /* Todo: If CondMask==0, jump to ELSE */
2678      break;
2679
2680   case TGSI_OPCODE_ELSE:
2681      /* invert CondMask wrt previous mask */
2682      {
2683         uint prevMask;
2684         assert(mach->CondStackTop > 0);
2685         prevMask = mach->CondStack[mach->CondStackTop - 1];
2686         mach->CondMask = ~mach->CondMask & prevMask;
2687         UPDATE_EXEC_MASK(mach);
2688         /* Todo: If CondMask==0, jump to ENDIF */
2689      }
2690      break;
2691
2692   case TGSI_OPCODE_ENDIF:
2693      /* pop CondMask */
2694      assert(mach->CondStackTop > 0);
2695      mach->CondMask = mach->CondStack[--mach->CondStackTop];
2696      UPDATE_EXEC_MASK(mach);
2697      break;
2698
2699   case TGSI_OPCODE_END:
2700      /* halt execution */
2701      *pc = -1;
2702      break;
2703
2704   case TGSI_OPCODE_REP:
2705      assert (0);
2706      break;
2707
2708   case TGSI_OPCODE_ENDREP:
2709       assert (0);
2710       break;
2711
2712   case TGSI_OPCODE_PUSHA:
2713      assert (0);
2714      break;
2715
2716   case TGSI_OPCODE_POPA:
2717      assert (0);
2718      break;
2719
2720   case TGSI_OPCODE_CEIL:
2721      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2722         FETCH( &r[0], 0, chan_index );
2723         micro_ceil( &r[0], &r[0] );
2724         STORE( &r[0], 0, chan_index );
2725      }
2726      break;
2727
2728   case TGSI_OPCODE_I2F:
2729      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2730         FETCH( &r[0], 0, chan_index );
2731         micro_i2f( &r[0], &r[0] );
2732         STORE( &r[0], 0, chan_index );
2733      }
2734      break;
2735
2736   case TGSI_OPCODE_NOT:
2737      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2738         FETCH( &r[0], 0, chan_index );
2739         micro_not( &r[0], &r[0] );
2740         STORE( &r[0], 0, chan_index );
2741      }
2742      break;
2743
2744   case TGSI_OPCODE_TRUNC:
2745      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2746         FETCH( &r[0], 0, chan_index );
2747         micro_trunc( &r[0], &r[0] );
2748         STORE( &r[0], 0, chan_index );
2749      }
2750      break;
2751
2752   case TGSI_OPCODE_SHL:
2753      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2754         FETCH( &r[0], 0, chan_index );
2755         FETCH( &r[1], 1, chan_index );
2756         micro_shl( &r[0], &r[0], &r[1] );
2757         STORE( &r[0], 0, chan_index );
2758      }
2759      break;
2760
2761   case TGSI_OPCODE_SHR:
2762      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2763         FETCH( &r[0], 0, chan_index );
2764         FETCH( &r[1], 1, chan_index );
2765         micro_ishr( &r[0], &r[0], &r[1] );
2766         STORE( &r[0], 0, chan_index );
2767      }
2768      break;
2769
2770   case TGSI_OPCODE_AND:
2771      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2772         FETCH( &r[0], 0, chan_index );
2773         FETCH( &r[1], 1, chan_index );
2774         micro_and( &r[0], &r[0], &r[1] );
2775         STORE( &r[0], 0, chan_index );
2776      }
2777      break;
2778
2779   case TGSI_OPCODE_OR:
2780      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2781         FETCH( &r[0], 0, chan_index );
2782         FETCH( &r[1], 1, chan_index );
2783         micro_or( &r[0], &r[0], &r[1] );
2784         STORE( &r[0], 0, chan_index );
2785      }
2786      break;
2787
2788   case TGSI_OPCODE_MOD:
2789      assert (0);
2790      break;
2791
2792   case TGSI_OPCODE_XOR:
2793      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2794         FETCH( &r[0], 0, chan_index );
2795         FETCH( &r[1], 1, chan_index );
2796         micro_xor( &r[0], &r[0], &r[1] );
2797         STORE( &r[0], 0, chan_index );
2798      }
2799      break;
2800
2801   case TGSI_OPCODE_SAD:
2802      assert (0);
2803      break;
2804
2805   case TGSI_OPCODE_TXF:
2806      assert (0);
2807      break;
2808
2809   case TGSI_OPCODE_TXQ:
2810      assert (0);
2811      break;
2812
2813   case TGSI_OPCODE_EMIT:
2814      mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] += 16;
2815      mach->Primitives[mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]]++;
2816      break;
2817
2818   case TGSI_OPCODE_ENDPRIM:
2819      mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]++;
2820      mach->Primitives[mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]] = 0;
2821      break;
2822
2823   case TGSI_OPCODE_LOOP:
2824      /* fall-through (for now) */
2825   case TGSI_OPCODE_BGNLOOP2:
2826      /* push LoopMask and ContMasks */
2827      assert(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
2828      mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask;
2829      assert(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
2830      mach->ContStack[mach->ContStackTop++] = mach->ContMask;
2831      break;
2832
2833   case TGSI_OPCODE_ENDLOOP:
2834      /* fall-through (for now at least) */
2835   case TGSI_OPCODE_ENDLOOP2:
2836      /* Restore ContMask, but don't pop */
2837      assert(mach->ContStackTop > 0);
2838      mach->ContMask = mach->ContStack[mach->ContStackTop - 1];
2839      UPDATE_EXEC_MASK(mach);
2840      if (mach->ExecMask) {
2841         /* repeat loop: jump to instruction just past BGNLOOP */
2842         *pc = inst->InstructionExtLabel.Label + 1;
2843      }
2844      else {
2845         /* exit loop: pop LoopMask */
2846         assert(mach->LoopStackTop > 0);
2847         mach->LoopMask = mach->LoopStack[--mach->LoopStackTop];
2848         /* pop ContMask */
2849         assert(mach->ContStackTop > 0);
2850         mach->ContMask = mach->ContStack[--mach->ContStackTop];
2851      }
2852      UPDATE_EXEC_MASK(mach);
2853      break;
2854
2855   case TGSI_OPCODE_BRK:
2856      /* turn off loop channels for each enabled exec channel */
2857      mach->LoopMask &= ~mach->ExecMask;
2858      /* Todo: if mach->LoopMask == 0, jump to end of loop */
2859      UPDATE_EXEC_MASK(mach);
2860      break;
2861
2862   case TGSI_OPCODE_CONT:
2863      /* turn off cont channels for each enabled exec channel */
2864      mach->ContMask &= ~mach->ExecMask;
2865      /* Todo: if mach->LoopMask == 0, jump to end of loop */
2866      UPDATE_EXEC_MASK(mach);
2867      break;
2868
2869   case TGSI_OPCODE_BGNSUB:
2870      /* no-op */
2871      break;
2872
2873   case TGSI_OPCODE_ENDSUB:
2874      /* no-op */
2875      break;
2876
2877   case TGSI_OPCODE_NOISE1:
2878      assert( 0 );
2879      break;
2880
2881   case TGSI_OPCODE_NOISE2:
2882      assert( 0 );
2883      break;
2884
2885   case TGSI_OPCODE_NOISE3:
2886      assert( 0 );
2887      break;
2888
2889   case TGSI_OPCODE_NOISE4:
2890      assert( 0 );
2891      break;
2892
2893   case TGSI_OPCODE_NOP:
2894      break;
2895
2896   default:
2897      assert( 0 );
2898   }
2899}
2900
2901
2902/**
2903 * Run TGSI interpreter.
2904 * \return bitmask of "alive" quad components
2905 */
2906uint
2907tgsi_exec_machine_run( struct tgsi_exec_machine *mach )
2908{
2909   uint i;
2910   int pc = 0;
2911
2912   mach->CondMask = 0xf;
2913   mach->LoopMask = 0xf;
2914   mach->ContMask = 0xf;
2915   mach->FuncMask = 0xf;
2916   mach->ExecMask = 0xf;
2917
2918   mach->CondStackTop = 0; /* temporarily subvert this assertion */
2919   assert(mach->CondStackTop == 0);
2920   assert(mach->LoopStackTop == 0);
2921   assert(mach->ContStackTop == 0);
2922   assert(mach->CallStackTop == 0);
2923
2924   mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] = 0;
2925   mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] = 0;
2926
2927   if( mach->Processor == TGSI_PROCESSOR_GEOMETRY ) {
2928      mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0] = 0;
2929      mach->Primitives[0] = 0;
2930   }
2931
2932   for (i = 0; i < QUAD_SIZE; i++) {
2933      mach->Temps[TEMP_CC_I].xyzw[TEMP_CC_C].u[i] =
2934         (TGSI_EXEC_CC_EQ << TGSI_EXEC_CC_X_SHIFT) |
2935         (TGSI_EXEC_CC_EQ << TGSI_EXEC_CC_Y_SHIFT) |
2936         (TGSI_EXEC_CC_EQ << TGSI_EXEC_CC_Z_SHIFT) |
2937         (TGSI_EXEC_CC_EQ << TGSI_EXEC_CC_W_SHIFT);
2938   }
2939
2940   /* execute declarations (interpolants) */
2941   for (i = 0; i < mach->NumDeclarations; i++) {
2942      exec_declaration( mach, mach->Declarations+i );
2943   }
2944
2945   /* execute instructions, until pc is set to -1 */
2946   while (pc != -1) {
2947      assert(pc < (int) mach->NumInstructions);
2948      exec_instruction( mach, mach->Instructions + pc, &pc );
2949   }
2950
2951#if 0
2952   /* we scale from floats in [0,1] to Zbuffer ints in sp_quad_depth_test.c */
2953   if (mach->Processor == TGSI_PROCESSOR_FRAGMENT) {
2954      /*
2955       * Scale back depth component.
2956       */
2957      for (i = 0; i < 4; i++)
2958         mach->Outputs[0].xyzw[2].f[i] *= ctx->DrawBuffer->_DepthMaxF;
2959   }
2960#endif
2961
2962   return ~mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
2963}
2964