tgsi_exec.c revision 101f792a2af9c9a19a050afba8b60caa689466a5
1/**************************************************************************
2 *
3 * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas.
4 * All Rights Reserved.
5 * Copyright 2009-2010 VMware, Inc.  All rights Reserved.
6 *
7 * Permission is hereby granted, free of charge, to any person obtaining a
8 * copy of this software and associated documentation files (the
9 * "Software"), to deal in the Software without restriction, including
10 * without limitation the rights to use, copy, modify, merge, publish,
11 * distribute, sub license, and/or sell copies of the Software, and to
12 * permit persons to whom the Software is furnished to do so, subject to
13 * the following conditions:
14 *
15 * The above copyright notice and this permission notice (including the
16 * next paragraph) shall be included in all copies or substantial portions
17 * of the Software.
18 *
19 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
20 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
21 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
22 * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
23 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
24 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
25 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 *
27 **************************************************************************/
28
29/**
30 * TGSI interpreter/executor.
31 *
32 * Flow control information:
33 *
34 * Since we operate on 'quads' (4 pixels or 4 vertices in parallel)
35 * flow control statements (IF/ELSE/ENDIF, LOOP/ENDLOOP) require special
36 * care since a condition may be true for some quad components but false
37 * for other components.
38 *
39 * We basically execute all statements (even if they're in the part of
40 * an IF/ELSE clause that's "not taken") and use a special mask to
41 * control writing to destination registers.  This is the ExecMask.
42 * See store_dest().
43 *
 * The ExecMask is the bitwise AND of several other masks (CondMask, LoopMask,
 * ContMask, Switch.mask and FuncMask; see UPDATE_EXEC_MASK) which are
 * controlled by the flow control instructions (e.g. IF/ELSE/ENDIF,
 * LOOP/ENDLOOP and CONT).
47 *
48 *
49 * Authors:
50 *   Michal Krol
51 *   Brian Paul
52 */
53
54#include "pipe/p_compiler.h"
55#include "pipe/p_state.h"
56#include "pipe/p_shader_tokens.h"
57#include "tgsi/tgsi_dump.h"
58#include "tgsi/tgsi_parse.h"
59#include "tgsi/tgsi_util.h"
60#include "tgsi_exec.h"
61#include "util/u_memory.h"
62#include "util/u_math.h"
63
64
/*
 * When non-zero, micro_exp2(), micro_lg2() and micro_pow() call the
 * util_fast_*() helpers instead of the libm powf()/logf() routines.
 */
#define FAST_MATH 1

/*
 * Lane indices inside a tgsi_exec_channel, named after the position of the
 * corresponding element in the quad.  micro_ddx() and micro_ddy() use them
 * to select the lanes they subtract.
 */
#define TILE_TOP_LEFT     0
#define TILE_TOP_RIGHT    1
#define TILE_BOTTOM_LEFT  2
#define TILE_BOTTOM_RIGHT 3
71
72static void
73micro_abs(union tgsi_exec_channel *dst,
74          const union tgsi_exec_channel *src)
75{
76   dst->f[0] = fabsf(src->f[0]);
77   dst->f[1] = fabsf(src->f[1]);
78   dst->f[2] = fabsf(src->f[2]);
79   dst->f[3] = fabsf(src->f[3]);
80}
81
82static void
83micro_arl(union tgsi_exec_channel *dst,
84          const union tgsi_exec_channel *src)
85{
86   dst->i[0] = (int)floorf(src->f[0]);
87   dst->i[1] = (int)floorf(src->f[1]);
88   dst->i[2] = (int)floorf(src->f[2]);
89   dst->i[3] = (int)floorf(src->f[3]);
90}
91
92static void
93micro_arr(union tgsi_exec_channel *dst,
94          const union tgsi_exec_channel *src)
95{
96   dst->i[0] = (int)floorf(src->f[0] + 0.5f);
97   dst->i[1] = (int)floorf(src->f[1] + 0.5f);
98   dst->i[2] = (int)floorf(src->f[2] + 0.5f);
99   dst->i[3] = (int)floorf(src->f[3] + 0.5f);
100}
101
102static void
103micro_ceil(union tgsi_exec_channel *dst,
104           const union tgsi_exec_channel *src)
105{
106   dst->f[0] = ceilf(src->f[0]);
107   dst->f[1] = ceilf(src->f[1]);
108   dst->f[2] = ceilf(src->f[2]);
109   dst->f[3] = ceilf(src->f[3]);
110}
111
112static void
113micro_clamp(union tgsi_exec_channel *dst,
114            const union tgsi_exec_channel *src0,
115            const union tgsi_exec_channel *src1,
116            const union tgsi_exec_channel *src2)
117{
118   dst->f[0] = src0->f[0] < src1->f[0] ? src1->f[0] : src0->f[0] > src2->f[0] ? src2->f[0] : src0->f[0];
119   dst->f[1] = src0->f[1] < src1->f[1] ? src1->f[1] : src0->f[1] > src2->f[1] ? src2->f[1] : src0->f[1];
120   dst->f[2] = src0->f[2] < src1->f[2] ? src1->f[2] : src0->f[2] > src2->f[2] ? src2->f[2] : src0->f[2];
121   dst->f[3] = src0->f[3] < src1->f[3] ? src1->f[3] : src0->f[3] > src2->f[3] ? src2->f[3] : src0->f[3];
122}
123
124static void
125micro_cmp(union tgsi_exec_channel *dst,
126          const union tgsi_exec_channel *src0,
127          const union tgsi_exec_channel *src1,
128          const union tgsi_exec_channel *src2)
129{
130   dst->f[0] = src0->f[0] < 0.0f ? src1->f[0] : src2->f[0];
131   dst->f[1] = src0->f[1] < 0.0f ? src1->f[1] : src2->f[1];
132   dst->f[2] = src0->f[2] < 0.0f ? src1->f[2] : src2->f[2];
133   dst->f[3] = src0->f[3] < 0.0f ? src1->f[3] : src2->f[3];
134}
135
136static void
137micro_cnd(union tgsi_exec_channel *dst,
138          const union tgsi_exec_channel *src0,
139          const union tgsi_exec_channel *src1,
140          const union tgsi_exec_channel *src2)
141{
142   dst->f[0] = src2->f[0] > 0.5f ? src0->f[0] : src1->f[0];
143   dst->f[1] = src2->f[1] > 0.5f ? src0->f[1] : src1->f[1];
144   dst->f[2] = src2->f[2] > 0.5f ? src0->f[2] : src1->f[2];
145   dst->f[3] = src2->f[3] > 0.5f ? src0->f[3] : src1->f[3];
146}
147
148static void
149micro_cos(union tgsi_exec_channel *dst,
150          const union tgsi_exec_channel *src)
151{
152   dst->f[0] = cosf(src->f[0]);
153   dst->f[1] = cosf(src->f[1]);
154   dst->f[2] = cosf(src->f[2]);
155   dst->f[3] = cosf(src->f[3]);
156}
157
158static void
159micro_ddx(union tgsi_exec_channel *dst,
160          const union tgsi_exec_channel *src)
161{
162   dst->f[0] =
163   dst->f[1] =
164   dst->f[2] =
165   dst->f[3] = src->f[TILE_BOTTOM_RIGHT] - src->f[TILE_BOTTOM_LEFT];
166}
167
168static void
169micro_ddy(union tgsi_exec_channel *dst,
170          const union tgsi_exec_channel *src)
171{
172   dst->f[0] =
173   dst->f[1] =
174   dst->f[2] =
175   dst->f[3] = src->f[TILE_BOTTOM_LEFT] - src->f[TILE_TOP_LEFT];
176}
177
178static void
179micro_exp2(union tgsi_exec_channel *dst,
180           const union tgsi_exec_channel *src)
181{
182#if FAST_MATH
183   dst->f[0] = util_fast_exp2(src->f[0]);
184   dst->f[1] = util_fast_exp2(src->f[1]);
185   dst->f[2] = util_fast_exp2(src->f[2]);
186   dst->f[3] = util_fast_exp2(src->f[3]);
187#else
188#if DEBUG
189   /* Inf is okay for this instruction, so clamp it to silence assertions. */
190   uint i;
191   union tgsi_exec_channel clamped;
192
193   for (i = 0; i < 4; i++) {
194      if (src->f[i] > 127.99999f) {
195         clamped.f[i] = 127.99999f;
196      } else if (src->f[i] < -126.99999f) {
197         clamped.f[i] = -126.99999f;
198      } else {
199         clamped.f[i] = src->f[i];
200      }
201   }
202   src = &clamped;
203#endif /* DEBUG */
204
205   dst->f[0] = powf(2.0f, src->f[0]);
206   dst->f[1] = powf(2.0f, src->f[1]);
207   dst->f[2] = powf(2.0f, src->f[2]);
208   dst->f[3] = powf(2.0f, src->f[3]);
209#endif /* FAST_MATH */
210}
211
212static void
213micro_flr(union tgsi_exec_channel *dst,
214          const union tgsi_exec_channel *src)
215{
216   dst->f[0] = floorf(src->f[0]);
217   dst->f[1] = floorf(src->f[1]);
218   dst->f[2] = floorf(src->f[2]);
219   dst->f[3] = floorf(src->f[3]);
220}
221
222static void
223micro_frc(union tgsi_exec_channel *dst,
224          const union tgsi_exec_channel *src)
225{
226   dst->f[0] = src->f[0] - floorf(src->f[0]);
227   dst->f[1] = src->f[1] - floorf(src->f[1]);
228   dst->f[2] = src->f[2] - floorf(src->f[2]);
229   dst->f[3] = src->f[3] - floorf(src->f[3]);
230}
231
232static void
233micro_iabs(union tgsi_exec_channel *dst,
234           const union tgsi_exec_channel *src)
235{
236   dst->i[0] = src->i[0] >= 0 ? src->i[0] : -src->i[0];
237   dst->i[1] = src->i[1] >= 0 ? src->i[1] : -src->i[1];
238   dst->i[2] = src->i[2] >= 0 ? src->i[2] : -src->i[2];
239   dst->i[3] = src->i[3] >= 0 ? src->i[3] : -src->i[3];
240}
241
242static void
243micro_ineg(union tgsi_exec_channel *dst,
244           const union tgsi_exec_channel *src)
245{
246   dst->i[0] = -src->i[0];
247   dst->i[1] = -src->i[1];
248   dst->i[2] = -src->i[2];
249   dst->i[3] = -src->i[3];
250}
251
252static void
253micro_lg2(union tgsi_exec_channel *dst,
254          const union tgsi_exec_channel *src)
255{
256#if FAST_MATH
257   dst->f[0] = util_fast_log2(src->f[0]);
258   dst->f[1] = util_fast_log2(src->f[1]);
259   dst->f[2] = util_fast_log2(src->f[2]);
260   dst->f[3] = util_fast_log2(src->f[3]);
261#else
262   dst->f[0] = logf(src->f[0]) * 1.442695f;
263   dst->f[1] = logf(src->f[1]) * 1.442695f;
264   dst->f[2] = logf(src->f[2]) * 1.442695f;
265   dst->f[3] = logf(src->f[3]) * 1.442695f;
266#endif
267}
268
269static void
270micro_lrp(union tgsi_exec_channel *dst,
271          const union tgsi_exec_channel *src0,
272          const union tgsi_exec_channel *src1,
273          const union tgsi_exec_channel *src2)
274{
275   dst->f[0] = src0->f[0] * (src1->f[0] - src2->f[0]) + src2->f[0];
276   dst->f[1] = src0->f[1] * (src1->f[1] - src2->f[1]) + src2->f[1];
277   dst->f[2] = src0->f[2] * (src1->f[2] - src2->f[2]) + src2->f[2];
278   dst->f[3] = src0->f[3] * (src1->f[3] - src2->f[3]) + src2->f[3];
279}
280
281static void
282micro_mad(union tgsi_exec_channel *dst,
283          const union tgsi_exec_channel *src0,
284          const union tgsi_exec_channel *src1,
285          const union tgsi_exec_channel *src2)
286{
287   dst->f[0] = src0->f[0] * src1->f[0] + src2->f[0];
288   dst->f[1] = src0->f[1] * src1->f[1] + src2->f[1];
289   dst->f[2] = src0->f[2] * src1->f[2] + src2->f[2];
290   dst->f[3] = src0->f[3] * src1->f[3] + src2->f[3];
291}
292
293static void
294micro_mov(union tgsi_exec_channel *dst,
295          const union tgsi_exec_channel *src)
296{
297   dst->u[0] = src->u[0];
298   dst->u[1] = src->u[1];
299   dst->u[2] = src->u[2];
300   dst->u[3] = src->u[3];
301}
302
303static void
304micro_rcp(union tgsi_exec_channel *dst,
305          const union tgsi_exec_channel *src)
306{
307#if 0 /* for debugging */
308   assert(src->f[0] != 0.0f);
309   assert(src->f[1] != 0.0f);
310   assert(src->f[2] != 0.0f);
311   assert(src->f[3] != 0.0f);
312#endif
313   dst->f[0] = 1.0f / src->f[0];
314   dst->f[1] = 1.0f / src->f[1];
315   dst->f[2] = 1.0f / src->f[2];
316   dst->f[3] = 1.0f / src->f[3];
317}
318
319static void
320micro_rnd(union tgsi_exec_channel *dst,
321          const union tgsi_exec_channel *src)
322{
323   dst->f[0] = floorf(src->f[0] + 0.5f);
324   dst->f[1] = floorf(src->f[1] + 0.5f);
325   dst->f[2] = floorf(src->f[2] + 0.5f);
326   dst->f[3] = floorf(src->f[3] + 0.5f);
327}
328
329static void
330micro_rsq(union tgsi_exec_channel *dst,
331          const union tgsi_exec_channel *src)
332{
333#if 0 /* for debugging */
334   assert(src->f[0] != 0.0f);
335   assert(src->f[1] != 0.0f);
336   assert(src->f[2] != 0.0f);
337   assert(src->f[3] != 0.0f);
338#endif
339   dst->f[0] = 1.0f / sqrtf(fabsf(src->f[0]));
340   dst->f[1] = 1.0f / sqrtf(fabsf(src->f[1]));
341   dst->f[2] = 1.0f / sqrtf(fabsf(src->f[2]));
342   dst->f[3] = 1.0f / sqrtf(fabsf(src->f[3]));
343}
344
345static void
346micro_seq(union tgsi_exec_channel *dst,
347          const union tgsi_exec_channel *src0,
348          const union tgsi_exec_channel *src1)
349{
350   dst->f[0] = src0->f[0] == src1->f[0] ? 1.0f : 0.0f;
351   dst->f[1] = src0->f[1] == src1->f[1] ? 1.0f : 0.0f;
352   dst->f[2] = src0->f[2] == src1->f[2] ? 1.0f : 0.0f;
353   dst->f[3] = src0->f[3] == src1->f[3] ? 1.0f : 0.0f;
354}
355
356static void
357micro_sge(union tgsi_exec_channel *dst,
358          const union tgsi_exec_channel *src0,
359          const union tgsi_exec_channel *src1)
360{
361   dst->f[0] = src0->f[0] >= src1->f[0] ? 1.0f : 0.0f;
362   dst->f[1] = src0->f[1] >= src1->f[1] ? 1.0f : 0.0f;
363   dst->f[2] = src0->f[2] >= src1->f[2] ? 1.0f : 0.0f;
364   dst->f[3] = src0->f[3] >= src1->f[3] ? 1.0f : 0.0f;
365}
366
367static void
368micro_sgn(union tgsi_exec_channel *dst,
369          const union tgsi_exec_channel *src)
370{
371   dst->f[0] = src->f[0] < 0.0f ? -1.0f : src->f[0] > 0.0f ? 1.0f : 0.0f;
372   dst->f[1] = src->f[1] < 0.0f ? -1.0f : src->f[1] > 0.0f ? 1.0f : 0.0f;
373   dst->f[2] = src->f[2] < 0.0f ? -1.0f : src->f[2] > 0.0f ? 1.0f : 0.0f;
374   dst->f[3] = src->f[3] < 0.0f ? -1.0f : src->f[3] > 0.0f ? 1.0f : 0.0f;
375}
376
377static void
378micro_sgt(union tgsi_exec_channel *dst,
379          const union tgsi_exec_channel *src0,
380          const union tgsi_exec_channel *src1)
381{
382   dst->f[0] = src0->f[0] > src1->f[0] ? 1.0f : 0.0f;
383   dst->f[1] = src0->f[1] > src1->f[1] ? 1.0f : 0.0f;
384   dst->f[2] = src0->f[2] > src1->f[2] ? 1.0f : 0.0f;
385   dst->f[3] = src0->f[3] > src1->f[3] ? 1.0f : 0.0f;
386}
387
388static void
389micro_sin(union tgsi_exec_channel *dst,
390          const union tgsi_exec_channel *src)
391{
392   dst->f[0] = sinf(src->f[0]);
393   dst->f[1] = sinf(src->f[1]);
394   dst->f[2] = sinf(src->f[2]);
395   dst->f[3] = sinf(src->f[3]);
396}
397
398static void
399micro_sle(union tgsi_exec_channel *dst,
400          const union tgsi_exec_channel *src0,
401          const union tgsi_exec_channel *src1)
402{
403   dst->f[0] = src0->f[0] <= src1->f[0] ? 1.0f : 0.0f;
404   dst->f[1] = src0->f[1] <= src1->f[1] ? 1.0f : 0.0f;
405   dst->f[2] = src0->f[2] <= src1->f[2] ? 1.0f : 0.0f;
406   dst->f[3] = src0->f[3] <= src1->f[3] ? 1.0f : 0.0f;
407}
408
409static void
410micro_slt(union tgsi_exec_channel *dst,
411          const union tgsi_exec_channel *src0,
412          const union tgsi_exec_channel *src1)
413{
414   dst->f[0] = src0->f[0] < src1->f[0] ? 1.0f : 0.0f;
415   dst->f[1] = src0->f[1] < src1->f[1] ? 1.0f : 0.0f;
416   dst->f[2] = src0->f[2] < src1->f[2] ? 1.0f : 0.0f;
417   dst->f[3] = src0->f[3] < src1->f[3] ? 1.0f : 0.0f;
418}
419
420static void
421micro_sne(union tgsi_exec_channel *dst,
422          const union tgsi_exec_channel *src0,
423          const union tgsi_exec_channel *src1)
424{
425   dst->f[0] = src0->f[0] != src1->f[0] ? 1.0f : 0.0f;
426   dst->f[1] = src0->f[1] != src1->f[1] ? 1.0f : 0.0f;
427   dst->f[2] = src0->f[2] != src1->f[2] ? 1.0f : 0.0f;
428   dst->f[3] = src0->f[3] != src1->f[3] ? 1.0f : 0.0f;
429}
430
431static void
432micro_trunc(union tgsi_exec_channel *dst,
433            const union tgsi_exec_channel *src)
434{
435   dst->f[0] = (float)(int)src->f[0];
436   dst->f[1] = (float)(int)src->f[1];
437   dst->f[2] = (float)(int)src->f[2];
438   dst->f[3] = (float)(int)src->f[3];
439}
440
441
/* Numeric indices of the X, Y, Z and W components, in that order. */
#define CHAN_X  0
#define CHAN_Y  1
#define CHAN_Z  2
#define CHAN_W  3
446
/** Data type tags: float, signed integer and unsigned integer. */
enum tgsi_exec_datatype {
   TGSI_EXEC_DATA_FLOAT = 0,
   TGSI_EXEC_DATA_INT   = 1,
   TGSI_EXEC_DATA_UINT  = 2
};
452
/*
 * Shorthand locations of various utility registers (_I = Index, _C = Channel)
 *
 * Each pair aliases the corresponding TGSI_EXEC_TEMP_* constants and is
 * used as mach->Temps[TEMP_x_I].xyzw[TEMP_x_C]; see
 * tgsi_exec_machine_create(), which fills in the constant-valued ones.
 * TEMP_R0 and TEMP_P0 are single aliases without an _I/_C split.
 */
#define TEMP_0_I           TGSI_EXEC_TEMP_00000000_I
#define TEMP_0_C           TGSI_EXEC_TEMP_00000000_C
#define TEMP_7F_I          TGSI_EXEC_TEMP_7FFFFFFF_I
#define TEMP_7F_C          TGSI_EXEC_TEMP_7FFFFFFF_C
#define TEMP_80_I          TGSI_EXEC_TEMP_80000000_I
#define TEMP_80_C          TGSI_EXEC_TEMP_80000000_C
#define TEMP_FF_I          TGSI_EXEC_TEMP_FFFFFFFF_I
#define TEMP_FF_C          TGSI_EXEC_TEMP_FFFFFFFF_C
#define TEMP_1_I           TGSI_EXEC_TEMP_ONE_I
#define TEMP_1_C           TGSI_EXEC_TEMP_ONE_C
#define TEMP_2_I           TGSI_EXEC_TEMP_TWO_I
#define TEMP_2_C           TGSI_EXEC_TEMP_TWO_C
#define TEMP_128_I         TGSI_EXEC_TEMP_128_I
#define TEMP_128_C         TGSI_EXEC_TEMP_128_C
#define TEMP_M128_I        TGSI_EXEC_TEMP_MINUS_128_I
#define TEMP_M128_C        TGSI_EXEC_TEMP_MINUS_128_C
#define TEMP_KILMASK_I     TGSI_EXEC_TEMP_KILMASK_I
#define TEMP_KILMASK_C     TGSI_EXEC_TEMP_KILMASK_C
#define TEMP_OUTPUT_I      TGSI_EXEC_TEMP_OUTPUT_I
#define TEMP_OUTPUT_C      TGSI_EXEC_TEMP_OUTPUT_C
#define TEMP_PRIMITIVE_I   TGSI_EXEC_TEMP_PRIMITIVE_I
#define TEMP_PRIMITIVE_C   TGSI_EXEC_TEMP_PRIMITIVE_C
#define TEMP_CC_I          TGSI_EXEC_TEMP_CC_I
#define TEMP_CC_C          TGSI_EXEC_TEMP_CC_C
#define TEMP_3_I           TGSI_EXEC_TEMP_THREE_I
#define TEMP_3_C           TGSI_EXEC_TEMP_THREE_C
#define TEMP_HALF_I        TGSI_EXEC_TEMP_HALF_I
#define TEMP_HALF_C        TGSI_EXEC_TEMP_HALF_C
#define TEMP_R0            TGSI_EXEC_TEMP_R0
#define TEMP_P0            TGSI_EXEC_TEMP_P0
486
/** Non-zero if channel CHAN is set in the write mask of INST's first destination. */
#define IS_CHANNEL_ENABLED(INST, CHAN)\
   ((INST).Dst[0].Register.WriteMask & (1 << (CHAN)))

/** Non-zero if channel CHAN is set in the write mask of INST's second destination. */
#define IS_CHANNEL_ENABLED2(INST, CHAN)\
   ((INST).Dst[1].Register.WriteMask & (1 << (CHAN)))

/**
 * Run the following statement once for every channel enabled in the first
 * destination's write mask, with CHAN (an integer lvalue) set to the
 * channel number.
 *
 * The filter is written as "if (!enabled) { } else" so that the macro
 * cannot capture an 'else' that the caller meant for an enclosing 'if'.
 */
#define FOR_EACH_ENABLED_CHANNEL(INST, CHAN)\
   for ((CHAN) = 0; (CHAN) < NUM_CHANNELS; (CHAN)++)\
      if (!IS_CHANNEL_ENABLED( INST, CHAN )) { } else

/** Same as FOR_EACH_ENABLED_CHANNEL, for the second destination register. */
#define FOR_EACH_ENABLED_CHANNEL2(INST, CHAN)\
   for ((CHAN) = 0; (CHAN) < NUM_CHANNELS; (CHAN)++)\
      if (!IS_CHANNEL_ENABLED2( INST, CHAN )) { } else
500
501
/**
 * Recompute MACH->ExecMask as the bitwise AND of the condition, loop,
 * continue, switch and function masks.  MACH is a pointer expression; it is
 * evaluated several times, so it must not have side effects.
 */
#define UPDATE_EXEC_MASK(MACH) \
      ((MACH)->ExecMask = (MACH)->CondMask & (MACH)->LoopMask & \
                          (MACH)->ContMask & (MACH)->Switch.mask & \
                          (MACH)->FuncMask)
505
506
507static const union tgsi_exec_channel ZeroVec =
508   { { 0.0, 0.0, 0.0, 0.0 } };
509
510static const union tgsi_exec_channel OneVec = {
511   {1.0f, 1.0f, 1.0f, 1.0f}
512};
513
514
/**
 * Assert that none of the float values in 'chan' are infinite or NaN.
 * NaN and Inf may occur normally during program execution and should
 * not lead to crashes, etc.  But when debugging, it's helpful to catch
 * them.
 *
 * \param chan  channel whose four float lanes are checked
 *
 * Each lane has its own assert so that a failure names the offending lane.
 */
static INLINE void
check_inf_or_nan(const union tgsi_exec_channel *chan)
{
   assert(!util_is_inf_or_nan((chan)->f[0]));
   assert(!util_is_inf_or_nan((chan)->f[1]));
   assert(!util_is_inf_or_nan((chan)->f[2]));
   assert(!util_is_inf_or_nan((chan)->f[3]));
}
529
530
#ifdef DEBUG
/**
 * Debug helper: print the four float lanes of 'chan', prefixed by 'msg'.
 */
static void
print_chan(const char *msg, const union tgsi_exec_channel *chan)
{
   const float *lanes = chan->f;

   debug_printf("%s = {%f, %f, %f, %f}\n",
                msg, lanes[0], lanes[1], lanes[2], lanes[3]);
}
#endif
539
540
#ifdef DEBUG
/**
 * Debug helper: print temporary register 'index' of 'mach', one line per
 * X/Y/Z/W component with that component's four float lanes.
 */
static void
print_temp(const struct tgsi_exec_machine *mach, uint index)
{
   const struct tgsi_exec_vector *reg = &mach->Temps[index];
   int comp;

   debug_printf("Temp[%u] =\n", index);
   for (comp = 0; comp < 4; comp++) {
      const float *lanes = reg->xyzw[comp].f;

      debug_printf("  %c: { %f, %f, %f, %f }\n",
                   "XYZW"[comp],
                   lanes[0], lanes[1], lanes[2], lanes[3]);
   }
}
#endif
558
559
560/**
561 * Check if there's a potential src/dst register data dependency when
562 * using SOA execution.
563 * Example:
564 *   MOV T, T.yxwz;
565 * This would expand into:
566 *   MOV t0, t1;
567 *   MOV t1, t0;
568 *   MOV t2, t3;
569 *   MOV t3, t2;
570 * The second instruction will have the wrong value for t0 if executed as-is.
571 */
572boolean
573tgsi_check_soa_dependencies(const struct tgsi_full_instruction *inst)
574{
575   uint i, chan;
576
577   uint writemask = inst->Dst[0].Register.WriteMask;
578   if (writemask == TGSI_WRITEMASK_X ||
579       writemask == TGSI_WRITEMASK_Y ||
580       writemask == TGSI_WRITEMASK_Z ||
581       writemask == TGSI_WRITEMASK_W ||
582       writemask == TGSI_WRITEMASK_NONE) {
583      /* no chance of data dependency */
584      return FALSE;
585   }
586
587   /* loop over src regs */
588   for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
589      if ((inst->Src[i].Register.File ==
590           inst->Dst[0].Register.File) &&
591          (inst->Src[i].Register.Index ==
592           inst->Dst[0].Register.Index)) {
593         /* loop over dest channels */
594         uint channelsWritten = 0x0;
595         FOR_EACH_ENABLED_CHANNEL(*inst, chan) {
596            /* check if we're reading a channel that's been written */
597            uint swizzle = tgsi_util_get_full_src_register_swizzle(&inst->Src[i], chan);
598            if (channelsWritten & (1 << swizzle)) {
599               return TRUE;
600            }
601
602            channelsWritten |= (1 << chan);
603         }
604      }
605   }
606   return FALSE;
607}
608
609
610/**
611 * Initialize machine state by expanding tokens to full instructions,
612 * allocating temporary storage, setting up constants, etc.
613 * After this, we can call tgsi_exec_machine_run() many times.
614 */
615void
616tgsi_exec_machine_bind_shader(
617   struct tgsi_exec_machine *mach,
618   const struct tgsi_token *tokens,
619   uint numSamplers,
620   struct tgsi_sampler **samplers)
621{
622   uint k;
623   struct tgsi_parse_context parse;
624   struct tgsi_full_instruction *instructions;
625   struct tgsi_full_declaration *declarations;
626   uint maxInstructions = 10, numInstructions = 0;
627   uint maxDeclarations = 10, numDeclarations = 0;
628
629#if 0
630   tgsi_dump(tokens, 0);
631#endif
632
633   util_init_math();
634
635   mach->Tokens = tokens;
636   mach->Samplers = samplers;
637
638   if (!tokens) {
639      /* unbind and free all */
640      if (mach->Declarations) {
641         FREE( mach->Declarations );
642      }
643      mach->Declarations = NULL;
644      mach->NumDeclarations = 0;
645
646      if (mach->Instructions) {
647         FREE( mach->Instructions );
648      }
649      mach->Instructions = NULL;
650      mach->NumInstructions = 0;
651
652      return;
653   }
654
655   k = tgsi_parse_init (&parse, mach->Tokens);
656   if (k != TGSI_PARSE_OK) {
657      debug_printf( "Problem parsing!\n" );
658      return;
659   }
660
661   mach->Processor = parse.FullHeader.Processor.Processor;
662   mach->ImmLimit = 0;
663
664   declarations = (struct tgsi_full_declaration *)
665      MALLOC( maxDeclarations * sizeof(struct tgsi_full_declaration) );
666
667   if (!declarations) {
668      return;
669   }
670
671   instructions = (struct tgsi_full_instruction *)
672      MALLOC( maxInstructions * sizeof(struct tgsi_full_instruction) );
673
674   if (!instructions) {
675      FREE( declarations );
676      return;
677   }
678
679   while( !tgsi_parse_end_of_tokens( &parse ) ) {
680      uint i;
681
682      tgsi_parse_token( &parse );
683      switch( parse.FullToken.Token.Type ) {
684      case TGSI_TOKEN_TYPE_DECLARATION:
685         /* save expanded declaration */
686         if (numDeclarations == maxDeclarations) {
687            declarations = REALLOC(declarations,
688                                   maxDeclarations
689                                   * sizeof(struct tgsi_full_declaration),
690                                   (maxDeclarations + 10)
691                                   * sizeof(struct tgsi_full_declaration));
692            maxDeclarations += 10;
693         }
694         if (parse.FullToken.FullDeclaration.Declaration.File == TGSI_FILE_OUTPUT) {
695            unsigned reg;
696            for (reg = parse.FullToken.FullDeclaration.Range.First;
697                 reg <= parse.FullToken.FullDeclaration.Range.Last;
698                 ++reg) {
699               ++mach->NumOutputs;
700            }
701         }
702         if (parse.FullToken.FullDeclaration.Declaration.File ==
703             TGSI_FILE_IMMEDIATE_ARRAY) {
704            unsigned reg;
705            struct tgsi_full_declaration *decl =
706               &parse.FullToken.FullDeclaration;
707            debug_assert(decl->Range.Last < TGSI_EXEC_NUM_IMMEDIATES);
708            for (reg = decl->Range.First; reg <= decl->Range.Last; ++reg) {
709               for( i = 0; i < 4; i++ ) {
710                  int idx = reg * 4 + i;
711                  mach->ImmArray[reg][i] = decl->ImmediateData.u[idx].Float;
712               }
713            }
714         }
715         memcpy(declarations + numDeclarations,
716                &parse.FullToken.FullDeclaration,
717                sizeof(declarations[0]));
718         numDeclarations++;
719         break;
720
721      case TGSI_TOKEN_TYPE_IMMEDIATE:
722         {
723            uint size = parse.FullToken.FullImmediate.Immediate.NrTokens - 1;
724            assert( size <= 4 );
725            assert( mach->ImmLimit + 1 <= TGSI_EXEC_NUM_IMMEDIATES );
726
727            for( i = 0; i < size; i++ ) {
728               mach->Imms[mach->ImmLimit][i] =
729		  parse.FullToken.FullImmediate.u[i].Float;
730            }
731            mach->ImmLimit += 1;
732         }
733         break;
734
735      case TGSI_TOKEN_TYPE_INSTRUCTION:
736
737         /* save expanded instruction */
738         if (numInstructions == maxInstructions) {
739            instructions = REALLOC(instructions,
740                                   maxInstructions
741                                   * sizeof(struct tgsi_full_instruction),
742                                   (maxInstructions + 10)
743                                   * sizeof(struct tgsi_full_instruction));
744            maxInstructions += 10;
745         }
746
747         memcpy(instructions + numInstructions,
748                &parse.FullToken.FullInstruction,
749                sizeof(instructions[0]));
750
751         numInstructions++;
752         break;
753
754      case TGSI_TOKEN_TYPE_PROPERTY:
755         break;
756
757      default:
758         assert( 0 );
759      }
760   }
761   tgsi_parse_free (&parse);
762
763   if (mach->Declarations) {
764      FREE( mach->Declarations );
765   }
766   mach->Declarations = declarations;
767   mach->NumDeclarations = numDeclarations;
768
769   if (mach->Instructions) {
770      FREE( mach->Instructions );
771   }
772   mach->Instructions = instructions;
773   mach->NumInstructions = numInstructions;
774}
775
776
777struct tgsi_exec_machine *
778tgsi_exec_machine_create( void )
779{
780   struct tgsi_exec_machine *mach;
781   uint i;
782
783   mach = align_malloc( sizeof *mach, 16 );
784   if (!mach)
785      goto fail;
786
787   memset(mach, 0, sizeof(*mach));
788
789   mach->Addrs = &mach->Temps[TGSI_EXEC_TEMP_ADDR];
790   mach->MaxGeometryShaderOutputs = TGSI_MAX_TOTAL_VERTICES;
791   mach->Predicates = &mach->Temps[TGSI_EXEC_TEMP_P0];
792
793   /* Setup constants. */
794   for( i = 0; i < 4; i++ ) {
795      mach->Temps[TEMP_0_I].xyzw[TEMP_0_C].u[i] = 0x00000000;
796      mach->Temps[TEMP_7F_I].xyzw[TEMP_7F_C].u[i] = 0x7FFFFFFF;
797      mach->Temps[TEMP_80_I].xyzw[TEMP_80_C].u[i] = 0x80000000;
798      mach->Temps[TEMP_FF_I].xyzw[TEMP_FF_C].u[i] = 0xFFFFFFFF;
799      mach->Temps[TEMP_1_I].xyzw[TEMP_1_C].f[i] = 1.0f;
800      mach->Temps[TEMP_2_I].xyzw[TEMP_2_C].f[i] = 2.0f;
801      mach->Temps[TEMP_128_I].xyzw[TEMP_128_C].f[i] = 128.0f;
802      mach->Temps[TEMP_M128_I].xyzw[TEMP_M128_C].f[i] = -128.0f;
803      mach->Temps[TEMP_3_I].xyzw[TEMP_3_C].f[i] = 3.0f;
804      mach->Temps[TEMP_HALF_I].xyzw[TEMP_HALF_C].f[i] = 0.5f;
805   }
806
807#ifdef DEBUG
808   /* silence warnings */
809   (void) print_chan;
810   (void) print_temp;
811#endif
812
813   return mach;
814
815fail:
816   align_free(mach);
817   return NULL;
818}
819
820
821void
822tgsi_exec_machine_destroy(struct tgsi_exec_machine *mach)
823{
824   if (mach) {
825      if (mach->Instructions)
826         FREE(mach->Instructions);
827      if (mach->Declarations)
828      FREE(mach->Declarations);
829   }
830
831   align_free(mach);
832}
833
834static void
835micro_add(union tgsi_exec_channel *dst,
836          const union tgsi_exec_channel *src0,
837          const union tgsi_exec_channel *src1)
838{
839   dst->f[0] = src0->f[0] + src1->f[0];
840   dst->f[1] = src0->f[1] + src1->f[1];
841   dst->f[2] = src0->f[2] + src1->f[2];
842   dst->f[3] = src0->f[3] + src1->f[3];
843}
844
845static void
846micro_div(
847   union tgsi_exec_channel *dst,
848   const union tgsi_exec_channel *src0,
849   const union tgsi_exec_channel *src1 )
850{
851   if (src1->f[0] != 0) {
852      dst->f[0] = src0->f[0] / src1->f[0];
853   }
854   if (src1->f[1] != 0) {
855      dst->f[1] = src0->f[1] / src1->f[1];
856   }
857   if (src1->f[2] != 0) {
858      dst->f[2] = src0->f[2] / src1->f[2];
859   }
860   if (src1->f[3] != 0) {
861      dst->f[3] = src0->f[3] / src1->f[3];
862   }
863}
864
865static void
866micro_float_clamp(union tgsi_exec_channel *dst,
867                  const union tgsi_exec_channel *src)
868{
869   uint i;
870
871   for (i = 0; i < 4; i++) {
872      if (src->f[i] > 0.0f) {
873         if (src->f[i] > 1.884467e+019f)
874            dst->f[i] = 1.884467e+019f;
875         else if (src->f[i] < 5.42101e-020f)
876            dst->f[i] = 5.42101e-020f;
877         else
878            dst->f[i] = src->f[i];
879      }
880      else {
881         if (src->f[i] < -1.884467e+019f)
882            dst->f[i] = -1.884467e+019f;
883         else if (src->f[i] > -5.42101e-020f)
884            dst->f[i] = -5.42101e-020f;
885         else
886            dst->f[i] = src->f[i];
887      }
888   }
889}
890
891static void
892micro_lt(
893   union tgsi_exec_channel *dst,
894   const union tgsi_exec_channel *src0,
895   const union tgsi_exec_channel *src1,
896   const union tgsi_exec_channel *src2,
897   const union tgsi_exec_channel *src3 )
898{
899   dst->f[0] = src0->f[0] < src1->f[0] ? src2->f[0] : src3->f[0];
900   dst->f[1] = src0->f[1] < src1->f[1] ? src2->f[1] : src3->f[1];
901   dst->f[2] = src0->f[2] < src1->f[2] ? src2->f[2] : src3->f[2];
902   dst->f[3] = src0->f[3] < src1->f[3] ? src2->f[3] : src3->f[3];
903}
904
905static void
906micro_max(union tgsi_exec_channel *dst,
907          const union tgsi_exec_channel *src0,
908          const union tgsi_exec_channel *src1)
909{
910   dst->f[0] = src0->f[0] > src1->f[0] ? src0->f[0] : src1->f[0];
911   dst->f[1] = src0->f[1] > src1->f[1] ? src0->f[1] : src1->f[1];
912   dst->f[2] = src0->f[2] > src1->f[2] ? src0->f[2] : src1->f[2];
913   dst->f[3] = src0->f[3] > src1->f[3] ? src0->f[3] : src1->f[3];
914}
915
916static void
917micro_min(union tgsi_exec_channel *dst,
918          const union tgsi_exec_channel *src0,
919          const union tgsi_exec_channel *src1)
920{
921   dst->f[0] = src0->f[0] < src1->f[0] ? src0->f[0] : src1->f[0];
922   dst->f[1] = src0->f[1] < src1->f[1] ? src0->f[1] : src1->f[1];
923   dst->f[2] = src0->f[2] < src1->f[2] ? src0->f[2] : src1->f[2];
924   dst->f[3] = src0->f[3] < src1->f[3] ? src0->f[3] : src1->f[3];
925}
926
927static void
928micro_mul(union tgsi_exec_channel *dst,
929          const union tgsi_exec_channel *src0,
930          const union tgsi_exec_channel *src1)
931{
932   dst->f[0] = src0->f[0] * src1->f[0];
933   dst->f[1] = src0->f[1] * src1->f[1];
934   dst->f[2] = src0->f[2] * src1->f[2];
935   dst->f[3] = src0->f[3] * src1->f[3];
936}
937
#if 0
/**
 * Disabled.  Signed multiply with a two-channel result: 'dst1' receives the
 * int product of each lane and 'dst0' is set to zero.
 */
static void
micro_imul64(
   union tgsi_exec_channel *dst0,
   union tgsi_exec_channel *dst1,
   const union tgsi_exec_channel *src0,
   const union tgsi_exec_channel *src1 )
{
   int lane;

   for (lane = 0; lane < 4; lane++) {
      dst1->i[lane] = src0->i[lane] * src1->i[lane];
   }
   for (lane = 0; lane < 4; lane++) {
      dst0->i[lane] = 0;
   }
}
#endif
956
#if 0
/**
 * Disabled.  Unsigned multiply with a two-channel result: 'dst1' receives
 * the unsigned product of each lane and 'dst0' is set to zero.
 */
static void
micro_umul64(
   union tgsi_exec_channel *dst0,
   union tgsi_exec_channel *dst1,
   const union tgsi_exec_channel *src0,
   const union tgsi_exec_channel *src1 )
{
   int lane;

   for (lane = 0; lane < 4; lane++) {
      dst1->u[lane] = src0->u[lane] * src1->u[lane];
   }
   for (lane = 0; lane < 4; lane++) {
      dst0->u[lane] = 0;
   }
}
#endif
975
976
#if 0
/**
 * Disabled.  Per-lane conditional move on the unsigned view: 'src1' where
 * 'src0' is non-zero, otherwise 'src2'.
 */
static void
micro_movc(
   union tgsi_exec_channel *dst,
   const union tgsi_exec_channel *src0,
   const union tgsi_exec_channel *src1,
   const union tgsi_exec_channel *src2 )
{
   int lane;

   for (lane = 0; lane < 4; lane++) {
      if (src0->u[lane]) {
         dst->u[lane] = src1->u[lane];
      }
      else {
         dst->u[lane] = src2->u[lane];
      }
   }
}
#endif
991
992static void
993micro_neg(
994   union tgsi_exec_channel *dst,
995   const union tgsi_exec_channel *src )
996{
997   dst->f[0] = -src->f[0];
998   dst->f[1] = -src->f[1];
999   dst->f[2] = -src->f[2];
1000   dst->f[3] = -src->f[3];
1001}
1002
1003static void
1004micro_pow(
1005   union tgsi_exec_channel *dst,
1006   const union tgsi_exec_channel *src0,
1007   const union tgsi_exec_channel *src1 )
1008{
1009#if FAST_MATH
1010   dst->f[0] = util_fast_pow( src0->f[0], src1->f[0] );
1011   dst->f[1] = util_fast_pow( src0->f[1], src1->f[1] );
1012   dst->f[2] = util_fast_pow( src0->f[2], src1->f[2] );
1013   dst->f[3] = util_fast_pow( src0->f[3], src1->f[3] );
1014#else
1015   dst->f[0] = powf( src0->f[0], src1->f[0] );
1016   dst->f[1] = powf( src0->f[1], src1->f[1] );
1017   dst->f[2] = powf( src0->f[2], src1->f[2] );
1018   dst->f[3] = powf( src0->f[3], src1->f[3] );
1019#endif
1020}
1021
1022static void
1023micro_sub(union tgsi_exec_channel *dst,
1024          const union tgsi_exec_channel *src0,
1025          const union tgsi_exec_channel *src1)
1026{
1027   dst->f[0] = src0->f[0] - src1->f[0];
1028   dst->f[1] = src0->f[1] - src1->f[1];
1029   dst->f[2] = src0->f[2] - src1->f[2];
1030   dst->f[3] = src0->f[3] - src1->f[3];
1031}
1032
1033static void
1034fetch_src_file_channel(const struct tgsi_exec_machine *mach,
1035                       const uint file,
1036                       const uint swizzle,
1037                       const union tgsi_exec_channel *index,
1038                       const union tgsi_exec_channel *index2D,
1039                       union tgsi_exec_channel *chan)
1040{
1041   uint i;
1042
1043   switch (file) {
1044   case TGSI_FILE_CONSTANT:
1045      for (i = 0; i < QUAD_SIZE; i++) {
1046         assert(index2D->i[i] >= 0 && index2D->i[i] < PIPE_MAX_CONSTANT_BUFFERS);
1047         assert(mach->Consts[index2D->i[i]]);
1048
1049         if (index->i[i] < 0) {
1050            chan->u[i] = 0;
1051         } else {
1052            const uint *p = (const uint *)mach->Consts[index2D->i[i]];
1053
1054            chan->u[i] = p[index->i[i] * 4 + swizzle];
1055         }
1056      }
1057      break;
1058
1059   case TGSI_FILE_INPUT:
1060   case TGSI_FILE_SYSTEM_VALUE:
1061      for (i = 0; i < QUAD_SIZE; i++) {
1062         /*
1063         if (TGSI_PROCESSOR_GEOMETRY == mach->Processor) {
1064            debug_printf("Fetching Input[%d] (2d=%d, 1d=%d)\n",
1065                         index2D->i[i] * TGSI_EXEC_MAX_INPUT_ATTRIBS + index->i[i],
1066                         index2D->i[i], index->i[i]);
1067                         }*/
1068         chan->u[i] = mach->Inputs[index2D->i[i] *
1069                                   TGSI_EXEC_MAX_INPUT_ATTRIBS +
1070                                   index->i[i]].xyzw[swizzle].u[i];
1071      }
1072      break;
1073
1074   case TGSI_FILE_TEMPORARY:
1075      for (i = 0; i < QUAD_SIZE; i++) {
1076         assert(index->i[i] < TGSI_EXEC_NUM_TEMPS);
1077         assert(index2D->i[i] == 0);
1078
1079         chan->u[i] = mach->Temps[index->i[i]].xyzw[swizzle].u[i];
1080      }
1081      break;
1082
1083   case TGSI_FILE_TEMPORARY_ARRAY:
1084      for (i = 0; i < QUAD_SIZE; i++) {
1085         assert(index->i[i] < TGSI_EXEC_NUM_TEMPS);
1086         assert(index2D->i[i] < TGSI_EXEC_NUM_TEMP_ARRAYS);
1087
1088         chan->u[i] =
1089            mach->TempArray[index2D->i[i]][index->i[i]].xyzw[swizzle].u[i];
1090      }
1091      break;
1092
1093   case TGSI_FILE_IMMEDIATE:
1094      for (i = 0; i < QUAD_SIZE; i++) {
1095         assert(index->i[i] >= 0 && index->i[i] < (int)mach->ImmLimit);
1096         assert(index2D->i[i] == 0);
1097
1098         chan->f[i] = mach->Imms[index->i[i]][swizzle];
1099      }
1100      break;
1101
1102   case TGSI_FILE_IMMEDIATE_ARRAY:
1103      for (i = 0; i < QUAD_SIZE; i++) {
1104         assert(index2D->i[i] == 0);
1105
1106         chan->f[i] = mach->ImmArray[index->i[i]][swizzle];
1107      }
1108      break;
1109
1110   case TGSI_FILE_ADDRESS:
1111      for (i = 0; i < QUAD_SIZE; i++) {
1112         assert(index->i[i] >= 0);
1113         assert(index2D->i[i] == 0);
1114
1115         chan->u[i] = mach->Addrs[index->i[i]].xyzw[swizzle].u[i];
1116      }
1117      break;
1118
1119   case TGSI_FILE_PREDICATE:
1120      for (i = 0; i < QUAD_SIZE; i++) {
1121         assert(index->i[i] >= 0 && index->i[i] < TGSI_EXEC_NUM_PREDS);
1122         assert(index2D->i[i] == 0);
1123
1124         chan->u[i] = mach->Predicates[0].xyzw[swizzle].u[i];
1125      }
1126      break;
1127
1128   case TGSI_FILE_OUTPUT:
1129      /* vertex/fragment output vars can be read too */
1130      for (i = 0; i < QUAD_SIZE; i++) {
1131         assert(index->i[i] >= 0);
1132         assert(index2D->i[i] == 0);
1133
1134         chan->u[i] = mach->Outputs[index->i[i]].xyzw[swizzle].u[i];
1135      }
1136      break;
1137
1138   default:
1139      assert(0);
1140      for (i = 0; i < QUAD_SIZE; i++) {
1141         chan->u[i] = 0;
1142      }
1143   }
1144}
1145
1146static void
1147fetch_source(const struct tgsi_exec_machine *mach,
1148             union tgsi_exec_channel *chan,
1149             const struct tgsi_full_src_register *reg,
1150             const uint chan_index,
1151             enum tgsi_exec_datatype src_datatype)
1152{
1153   union tgsi_exec_channel index;
1154   union tgsi_exec_channel index2D;
1155   uint swizzle;
1156
1157   /* We start with a direct index into a register file.
1158    *
1159    *    file[1],
1160    *    where:
1161    *       file = Register.File
1162    *       [1] = Register.Index
1163    */
1164   index.i[0] =
1165   index.i[1] =
1166   index.i[2] =
1167   index.i[3] = reg->Register.Index;
1168
1169   /* There is an extra source register that indirectly subscripts
1170    * a register file. The direct index now becomes an offset
1171    * that is being added to the indirect register.
1172    *
1173    *    file[ind[2].x+1],
1174    *    where:
1175    *       ind = Indirect.File
1176    *       [2] = Indirect.Index
1177    *       .x = Indirect.SwizzleX
1178    */
1179   if (reg->Register.Indirect) {
1180      union tgsi_exec_channel index2;
1181      union tgsi_exec_channel indir_index;
1182      const uint execmask = mach->ExecMask;
1183      uint i;
1184
1185      /* which address register (always zero now) */
1186      index2.i[0] =
1187      index2.i[1] =
1188      index2.i[2] =
1189      index2.i[3] = reg->Indirect.Index;
1190
1191      /* get current value of address register[swizzle] */
1192      swizzle = tgsi_util_get_src_register_swizzle( &reg->Indirect, CHAN_X );
1193      fetch_src_file_channel(mach,
1194                             reg->Indirect.File,
1195                             swizzle,
1196                             &index2,
1197                             &ZeroVec,
1198                             &indir_index);
1199
1200      /* add value of address register to the offset */
1201      index.i[0] += indir_index.i[0];
1202      index.i[1] += indir_index.i[1];
1203      index.i[2] += indir_index.i[2];
1204      index.i[3] += indir_index.i[3];
1205
1206      /* for disabled execution channels, zero-out the index to
1207       * avoid using a potential garbage value.
1208       */
1209      for (i = 0; i < QUAD_SIZE; i++) {
1210         if ((execmask & (1 << i)) == 0)
1211            index.i[i] = 0;
1212      }
1213   }
1214
1215   /* There is an extra source register that is a second
1216    * subscript to a register file. Effectively it means that
1217    * the register file is actually a 2D array of registers.
1218    *
1219    *    file[3][1],
1220    *    where:
1221    *       [3] = Dimension.Index
1222    */
1223   if (reg->Register.Dimension) {
1224      index2D.i[0] =
1225      index2D.i[1] =
1226      index2D.i[2] =
1227      index2D.i[3] = reg->Dimension.Index;
1228
1229      /* Again, the second subscript index can be addressed indirectly
1230       * identically to the first one.
1231       * Nothing stops us from indirectly addressing the indirect register,
1232       * but there is no need for that, so we won't exercise it.
1233       *
1234       *    file[ind[4].y+3][1],
1235       *    where:
1236       *       ind = DimIndirect.File
1237       *       [4] = DimIndirect.Index
1238       *       .y = DimIndirect.SwizzleX
1239       */
1240      if (reg->Dimension.Indirect) {
1241         union tgsi_exec_channel index2;
1242         union tgsi_exec_channel indir_index;
1243         const uint execmask = mach->ExecMask;
1244         uint i;
1245
1246         index2.i[0] =
1247         index2.i[1] =
1248         index2.i[2] =
1249         index2.i[3] = reg->DimIndirect.Index;
1250
1251         swizzle = tgsi_util_get_src_register_swizzle( &reg->DimIndirect, CHAN_X );
1252         fetch_src_file_channel(mach,
1253                                reg->DimIndirect.File,
1254                                swizzle,
1255                                &index2,
1256                                &ZeroVec,
1257                                &indir_index);
1258
1259         index2D.i[0] += indir_index.i[0];
1260         index2D.i[1] += indir_index.i[1];
1261         index2D.i[2] += indir_index.i[2];
1262         index2D.i[3] += indir_index.i[3];
1263
1264         /* for disabled execution channels, zero-out the index to
1265          * avoid using a potential garbage value.
1266          */
1267         for (i = 0; i < QUAD_SIZE; i++) {
1268            if ((execmask & (1 << i)) == 0) {
1269               index2D.i[i] = 0;
1270            }
1271         }
1272      }
1273
1274      /* If by any chance there was a need for a 3D array of register
1275       * files, we would have to check whether Dimension is followed
1276       * by a dimension register and continue the saga.
1277       */
1278   } else {
1279      index2D.i[0] =
1280      index2D.i[1] =
1281      index2D.i[2] =
1282      index2D.i[3] = 0;
1283   }
1284
1285   swizzle = tgsi_util_get_full_src_register_swizzle( reg, chan_index );
1286   fetch_src_file_channel(mach,
1287                          reg->Register.File,
1288                          swizzle,
1289                          &index,
1290                          &index2D,
1291                          chan);
1292
1293   if (reg->Register.Absolute) {
1294      if (src_datatype == TGSI_EXEC_DATA_FLOAT) {
1295         micro_abs(chan, chan);
1296      } else {
1297         micro_iabs(chan, chan);
1298      }
1299   }
1300
1301   if (reg->Register.Negate) {
1302      if (src_datatype == TGSI_EXEC_DATA_FLOAT) {
1303         micro_neg(chan, chan);
1304      } else {
1305         micro_ineg(chan, chan);
1306      }
1307   }
1308}
1309
1310static void
1311store_dest(struct tgsi_exec_machine *mach,
1312           const union tgsi_exec_channel *chan,
1313           const struct tgsi_full_dst_register *reg,
1314           const struct tgsi_full_instruction *inst,
1315           uint chan_index,
1316           enum tgsi_exec_datatype dst_datatype)
1317{
1318   uint i;
1319   union tgsi_exec_channel null;
1320   union tgsi_exec_channel *dst;
1321   union tgsi_exec_channel index2D;
1322   uint execmask = mach->ExecMask;
1323   int offset = 0;  /* indirection offset */
1324   int index;
1325
1326   /* for debugging */
1327   if (0 && dst_datatype == TGSI_EXEC_DATA_FLOAT) {
1328      check_inf_or_nan(chan);
1329   }
1330
1331   /* There is an extra source register that indirectly subscripts
1332    * a register file. The direct index now becomes an offset
1333    * that is being added to the indirect register.
1334    *
1335    *    file[ind[2].x+1],
1336    *    where:
1337    *       ind = Indirect.File
1338    *       [2] = Indirect.Index
1339    *       .x = Indirect.SwizzleX
1340    */
1341   if (reg->Register.Indirect) {
1342      union tgsi_exec_channel index;
1343      union tgsi_exec_channel indir_index;
1344      uint swizzle;
1345
1346      /* which address register (always zero for now) */
1347      index.i[0] =
1348      index.i[1] =
1349      index.i[2] =
1350      index.i[3] = reg->Indirect.Index;
1351
1352      /* get current value of address register[swizzle] */
1353      swizzle = tgsi_util_get_src_register_swizzle( &reg->Indirect, CHAN_X );
1354
1355      /* fetch values from the address/indirection register */
1356      fetch_src_file_channel(mach,
1357                             reg->Indirect.File,
1358                             swizzle,
1359                             &index,
1360                             &ZeroVec,
1361                             &indir_index);
1362
1363      /* save indirection offset */
1364      offset = indir_index.i[0];
1365   }
1366
1367   /* There is an extra source register that is a second
1368    * subscript to a register file. Effectively it means that
1369    * the register file is actually a 2D array of registers.
1370    *
1371    *    file[3][1],
1372    *    where:
1373    *       [3] = Dimension.Index
1374    */
1375   if (reg->Register.Dimension) {
1376      index2D.i[0] =
1377      index2D.i[1] =
1378      index2D.i[2] =
1379      index2D.i[3] = reg->Dimension.Index;
1380
1381      /* Again, the second subscript index can be addressed indirectly
1382       * identically to the first one.
1383       * Nothing stops us from indirectly addressing the indirect register,
1384       * but there is no need for that, so we won't exercise it.
1385       *
1386       *    file[ind[4].y+3][1],
1387       *    where:
1388       *       ind = DimIndirect.File
1389       *       [4] = DimIndirect.Index
1390       *       .y = DimIndirect.SwizzleX
1391       */
1392      if (reg->Dimension.Indirect) {
1393         union tgsi_exec_channel index2;
1394         union tgsi_exec_channel indir_index;
1395         const uint execmask = mach->ExecMask;
1396         unsigned swizzle;
1397         uint i;
1398
1399         index2.i[0] =
1400         index2.i[1] =
1401         index2.i[2] =
1402         index2.i[3] = reg->DimIndirect.Index;
1403
1404         swizzle = tgsi_util_get_src_register_swizzle( &reg->DimIndirect, CHAN_X );
1405         fetch_src_file_channel(mach,
1406                                reg->DimIndirect.File,
1407                                swizzle,
1408                                &index2,
1409                                &ZeroVec,
1410                                &indir_index);
1411
1412         index2D.i[0] += indir_index.i[0];
1413         index2D.i[1] += indir_index.i[1];
1414         index2D.i[2] += indir_index.i[2];
1415         index2D.i[3] += indir_index.i[3];
1416
1417         /* for disabled execution channels, zero-out the index to
1418          * avoid using a potential garbage value.
1419          */
1420         for (i = 0; i < QUAD_SIZE; i++) {
1421            if ((execmask & (1 << i)) == 0) {
1422               index2D.i[i] = 0;
1423            }
1424         }
1425      }
1426
1427      /* If by any chance there was a need for a 3D array of register
1428       * files, we would have to check whether Dimension is followed
1429       * by a dimension register and continue the saga.
1430       */
1431   } else {
1432      index2D.i[0] =
1433      index2D.i[1] =
1434      index2D.i[2] =
1435      index2D.i[3] = 0;
1436   }
1437
1438   switch (reg->Register.File) {
1439   case TGSI_FILE_NULL:
1440      dst = &null;
1441      break;
1442
1443   case TGSI_FILE_OUTPUT:
1444      index = mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0]
1445         + reg->Register.Index;
1446      dst = &mach->Outputs[offset + index].xyzw[chan_index];
1447#if 0
1448      if (TGSI_PROCESSOR_GEOMETRY == mach->Processor) {
1449         fprintf(stderr, "STORING OUT[%d] mask(%d), = (", offset + index, execmask);
1450         for (i = 0; i < QUAD_SIZE; i++)
1451            if (execmask & (1 << i))
1452               fprintf(stderr, "%f, ", chan->f[i]);
1453         fprintf(stderr, ")\n");
1454      }
1455#endif
1456      break;
1457
1458   case TGSI_FILE_TEMPORARY:
1459      index = reg->Register.Index;
1460      assert( index < TGSI_EXEC_NUM_TEMPS );
1461      dst = &mach->Temps[offset + index].xyzw[chan_index];
1462      break;
1463
1464   case TGSI_FILE_TEMPORARY_ARRAY:
1465      index = reg->Register.Index;
1466      assert( index < TGSI_EXEC_NUM_TEMPS );
1467      assert( index2D.i[0] < TGSI_EXEC_NUM_TEMP_ARRAYS );
1468      /* XXX we use index2D.i[0] here but somehow we might
1469       * end up with someone trying to store indirectly in
1470       * different buffers */
1471      dst = &mach->TempArray[index2D.i[0]][offset + index].xyzw[chan_index];
1472      break;
1473
1474   case TGSI_FILE_ADDRESS:
1475      index = reg->Register.Index;
1476      dst = &mach->Addrs[index].xyzw[chan_index];
1477      break;
1478
1479   case TGSI_FILE_PREDICATE:
1480      index = reg->Register.Index;
1481      assert(index < TGSI_EXEC_NUM_PREDS);
1482      dst = &mach->Predicates[index].xyzw[chan_index];
1483      break;
1484
1485   default:
1486      assert( 0 );
1487      return;
1488   }
1489
1490   if (inst->Instruction.Predicate) {
1491      uint swizzle;
1492      union tgsi_exec_channel *pred;
1493
1494      switch (chan_index) {
1495      case CHAN_X:
1496         swizzle = inst->Predicate.SwizzleX;
1497         break;
1498      case CHAN_Y:
1499         swizzle = inst->Predicate.SwizzleY;
1500         break;
1501      case CHAN_Z:
1502         swizzle = inst->Predicate.SwizzleZ;
1503         break;
1504      case CHAN_W:
1505         swizzle = inst->Predicate.SwizzleW;
1506         break;
1507      default:
1508         assert(0);
1509         return;
1510      }
1511
1512      assert(inst->Predicate.Index == 0);
1513
1514      pred = &mach->Predicates[inst->Predicate.Index].xyzw[swizzle];
1515
1516      if (inst->Predicate.Negate) {
1517         for (i = 0; i < QUAD_SIZE; i++) {
1518            if (pred->u[i]) {
1519               execmask &= ~(1 << i);
1520            }
1521         }
1522      } else {
1523         for (i = 0; i < QUAD_SIZE; i++) {
1524            if (!pred->u[i]) {
1525               execmask &= ~(1 << i);
1526            }
1527         }
1528      }
1529   }
1530
1531   switch (inst->Instruction.Saturate) {
1532   case TGSI_SAT_NONE:
1533      for (i = 0; i < QUAD_SIZE; i++)
1534         if (execmask & (1 << i))
1535            dst->i[i] = chan->i[i];
1536      break;
1537
1538   case TGSI_SAT_ZERO_ONE:
1539      for (i = 0; i < QUAD_SIZE; i++)
1540         if (execmask & (1 << i)) {
1541            if (chan->f[i] < 0.0f)
1542               dst->f[i] = 0.0f;
1543            else if (chan->f[i] > 1.0f)
1544               dst->f[i] = 1.0f;
1545            else
1546               dst->i[i] = chan->i[i];
1547         }
1548      break;
1549
1550   case TGSI_SAT_MINUS_PLUS_ONE:
1551      for (i = 0; i < QUAD_SIZE; i++)
1552         if (execmask & (1 << i)) {
1553            if (chan->f[i] < -1.0f)
1554               dst->f[i] = -1.0f;
1555            else if (chan->f[i] > 1.0f)
1556               dst->f[i] = 1.0f;
1557            else
1558               dst->i[i] = chan->i[i];
1559         }
1560      break;
1561
1562   default:
1563      assert( 0 );
1564   }
1565}
1566
/**
 * Convenience wrappers for float-typed operand access.  Both expect
 * variables named 'mach' and 'inst' to be in scope at the point of use.
 *
 * FETCH reads channel CHAN of source operand INDEX into VAL;
 * STORE writes VAL to channel CHAN of destination operand INDEX.
 */
#define FETCH(VAL, INDEX, CHAN) \
   fetch_source(mach, (VAL), &inst->Src[(INDEX)], (CHAN), TGSI_EXEC_DATA_FLOAT)

#define STORE(VAL, INDEX, CHAN) \
   store_dest(mach, (VAL), &inst->Dst[(INDEX)], inst, (CHAN), TGSI_EXEC_DATA_FLOAT)
1572
1573
1574/**
1575 * Execute ARB-style KIL which is predicated by a src register.
1576 * Kill fragment if any of the four values is less than zero.
1577 */
1578static void
1579exec_kil(struct tgsi_exec_machine *mach,
1580         const struct tgsi_full_instruction *inst)
1581{
1582   uint uniquemask;
1583   uint chan_index;
1584   uint kilmask = 0; /* bit 0 = pixel 0, bit 1 = pixel 1, etc */
1585   union tgsi_exec_channel r[1];
1586
1587   /* This mask stores component bits that were already tested. */
1588   uniquemask = 0;
1589
1590   for (chan_index = 0; chan_index < 4; chan_index++)
1591   {
1592      uint swizzle;
1593      uint i;
1594
1595      /* unswizzle channel */
1596      swizzle = tgsi_util_get_full_src_register_swizzle (
1597                        &inst->Src[0],
1598                        chan_index);
1599
1600      /* check if the component has not been already tested */
1601      if (uniquemask & (1 << swizzle))
1602         continue;
1603      uniquemask |= 1 << swizzle;
1604
1605      FETCH(&r[0], 0, chan_index);
1606      for (i = 0; i < 4; i++)
1607         if (r[0].f[i] < 0.0f)
1608            kilmask |= 1 << i;
1609   }
1610
1611   mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] |= kilmask;
1612}
1613
1614/**
1615 * Execute NVIDIA-style KIL which is predicated by a condition code.
1616 * Kill fragment if the condition code is TRUE.
1617 */
1618static void
1619exec_kilp(struct tgsi_exec_machine *mach,
1620          const struct tgsi_full_instruction *inst)
1621{
1622   uint kilmask; /* bit 0 = pixel 0, bit 1 = pixel 1, etc */
1623
1624   /* "unconditional" kil */
1625   kilmask = mach->ExecMask;
1626   mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] |= kilmask;
1627}
1628
1629static void
1630emit_vertex(struct tgsi_exec_machine *mach)
1631{
1632   /* FIXME: check for exec mask correctly
1633   unsigned i;
1634   for (i = 0; i < QUAD_SIZE; ++i) {
1635         if ((mach->ExecMask & (1 << i)))
1636   */
1637   if (mach->ExecMask) {
1638      mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] += mach->NumOutputs;
1639      mach->Primitives[mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]]++;
1640   }
1641}
1642
1643static void
1644emit_primitive(struct tgsi_exec_machine *mach)
1645{
1646   unsigned *prim_count = &mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0];
1647   /* FIXME: check for exec mask correctly
1648   unsigned i;
1649   for (i = 0; i < QUAD_SIZE; ++i) {
1650         if ((mach->ExecMask & (1 << i)))
1651   */
1652   if (mach->ExecMask) {
1653      ++(*prim_count);
1654      debug_assert((*prim_count * mach->NumOutputs) < mach->MaxGeometryShaderOutputs);
1655      mach->Primitives[*prim_count] = 0;
1656   }
1657}
1658
1659static void
1660conditional_emit_primitive(struct tgsi_exec_machine *mach)
1661{
1662   if (TGSI_PROCESSOR_GEOMETRY == mach->Processor) {
1663      int emitted_verts =
1664         mach->Primitives[mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]];
1665      if (emitted_verts) {
1666         emit_primitive(mach);
1667      }
1668   }
1669}
1670
1671
1672/*
1673 * Fetch four texture samples using STR texture coordinates.
1674 */
1675static void
1676fetch_texel( struct tgsi_sampler *sampler,
1677             const union tgsi_exec_channel *s,
1678             const union tgsi_exec_channel *t,
1679             const union tgsi_exec_channel *p,
1680             const union tgsi_exec_channel *c0,
1681             enum tgsi_sampler_control control,
1682             union tgsi_exec_channel *r,
1683             union tgsi_exec_channel *g,
1684             union tgsi_exec_channel *b,
1685             union tgsi_exec_channel *a )
1686{
1687   uint j;
1688   float rgba[NUM_CHANNELS][QUAD_SIZE];
1689
1690   sampler->get_samples(sampler, s->f, t->f, p->f, c0->f, control, rgba);
1691
1692   for (j = 0; j < 4; j++) {
1693      r->f[j] = rgba[0][j];
1694      g->f[j] = rgba[1][j];
1695      b->f[j] = rgba[2][j];
1696      a->f[j] = rgba[3][j];
1697   }
1698}
1699
1700
/* Values for the 'modifier' argument of exec_tex(). */
#define TEX_MODIFIER_NONE         0 /* source W is not used */
#define TEX_MODIFIER_PROJECTED    1 /* coordinates are divided by W */
#define TEX_MODIFIER_LOD_BIAS     2 /* W is passed to the sampler as LOD bias */
#define TEX_MODIFIER_EXPLICIT_LOD 3 /* W is passed to the sampler as explicit LOD */
1705
1706
1707static void
1708exec_tex(struct tgsi_exec_machine *mach,
1709         const struct tgsi_full_instruction *inst,
1710         uint modifier)
1711{
1712   const uint unit = inst->Src[1].Register.Index;
1713   union tgsi_exec_channel r[4];
1714   const union tgsi_exec_channel *lod = &ZeroVec;
1715   enum tgsi_sampler_control control;
1716   uint chan_index;
1717
1718   if (modifier != TEX_MODIFIER_NONE) {
1719      FETCH(&r[3], 0, CHAN_W);
1720      if (modifier != TEX_MODIFIER_PROJECTED) {
1721         lod = &r[3];
1722      }
1723   }
1724
1725   if (modifier == TEX_MODIFIER_EXPLICIT_LOD) {
1726      control = tgsi_sampler_lod_explicit;
1727   } else {
1728      control = tgsi_sampler_lod_bias;
1729   }
1730
1731   switch (inst->Texture.Texture) {
1732   case TGSI_TEXTURE_1D:
1733   case TGSI_TEXTURE_SHADOW1D:
1734      FETCH(&r[0], 0, CHAN_X);
1735
1736      if (modifier == TEX_MODIFIER_PROJECTED) {
1737         micro_div(&r[0], &r[0], &r[3]);
1738      }
1739
1740      fetch_texel(mach->Samplers[unit],
1741                  &r[0], &ZeroVec, &ZeroVec, lod,  /* S, T, P, LOD */
1742                  control,
1743                  &r[0], &r[1], &r[2], &r[3]);     /* R, G, B, A */
1744      break;
1745
1746   case TGSI_TEXTURE_2D:
1747   case TGSI_TEXTURE_RECT:
1748   case TGSI_TEXTURE_SHADOW2D:
1749   case TGSI_TEXTURE_SHADOWRECT:
1750      FETCH(&r[0], 0, CHAN_X);
1751      FETCH(&r[1], 0, CHAN_Y);
1752      FETCH(&r[2], 0, CHAN_Z);
1753
1754      if (modifier == TEX_MODIFIER_PROJECTED) {
1755         micro_div(&r[0], &r[0], &r[3]);
1756         micro_div(&r[1], &r[1], &r[3]);
1757         micro_div(&r[2], &r[2], &r[3]);
1758      }
1759
1760      fetch_texel(mach->Samplers[unit],
1761                  &r[0], &r[1], &r[2], lod,     /* S, T, P, LOD */
1762                  control,
1763                  &r[0], &r[1], &r[2], &r[3]);  /* outputs */
1764      break;
1765
1766   case TGSI_TEXTURE_3D:
1767   case TGSI_TEXTURE_CUBE:
1768      FETCH(&r[0], 0, CHAN_X);
1769      FETCH(&r[1], 0, CHAN_Y);
1770      FETCH(&r[2], 0, CHAN_Z);
1771
1772      if (modifier == TEX_MODIFIER_PROJECTED) {
1773         micro_div(&r[0], &r[0], &r[3]);
1774         micro_div(&r[1], &r[1], &r[3]);
1775         micro_div(&r[2], &r[2], &r[3]);
1776      }
1777
1778      fetch_texel(mach->Samplers[unit],
1779                  &r[0], &r[1], &r[2], lod,
1780                  control,
1781                  &r[0], &r[1], &r[2], &r[3]);
1782      break;
1783
1784   default:
1785      assert(0);
1786   }
1787
1788   FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
1789      STORE(&r[chan_index], 0, chan_index);
1790   }
1791}
1792
1793static void
1794exec_txd(struct tgsi_exec_machine *mach,
1795         const struct tgsi_full_instruction *inst)
1796{
1797   const uint unit = inst->Src[3].Register.Index;
1798   union tgsi_exec_channel r[4];
1799   uint chan_index;
1800
1801   /*
1802    * XXX: This is fake TXD -- the derivatives are not taken into account, yet.
1803    */
1804
1805   switch (inst->Texture.Texture) {
1806   case TGSI_TEXTURE_1D:
1807   case TGSI_TEXTURE_SHADOW1D:
1808
1809      FETCH(&r[0], 0, CHAN_X);
1810
1811      fetch_texel(mach->Samplers[unit],
1812                  &r[0], &ZeroVec, &ZeroVec, &ZeroVec,   /* S, T, P, BIAS */
1813                  tgsi_sampler_lod_bias,
1814                  &r[0], &r[1], &r[2], &r[3]);           /* R, G, B, A */
1815      break;
1816
1817   case TGSI_TEXTURE_2D:
1818   case TGSI_TEXTURE_RECT:
1819   case TGSI_TEXTURE_SHADOW2D:
1820   case TGSI_TEXTURE_SHADOWRECT:
1821
1822      FETCH(&r[0], 0, CHAN_X);
1823      FETCH(&r[1], 0, CHAN_Y);
1824      FETCH(&r[2], 0, CHAN_Z);
1825
1826      fetch_texel(mach->Samplers[unit],
1827                  &r[0], &r[1], &r[2], &ZeroVec,   /* inputs */
1828                  tgsi_sampler_lod_bias,
1829                  &r[0], &r[1], &r[2], &r[3]);     /* outputs */
1830      break;
1831
1832   case TGSI_TEXTURE_3D:
1833   case TGSI_TEXTURE_CUBE:
1834
1835      FETCH(&r[0], 0, CHAN_X);
1836      FETCH(&r[1], 0, CHAN_Y);
1837      FETCH(&r[2], 0, CHAN_Z);
1838
1839      fetch_texel(mach->Samplers[unit],
1840                  &r[0], &r[1], &r[2], &ZeroVec,
1841                  tgsi_sampler_lod_bias,
1842                  &r[0], &r[1], &r[2], &r[3]);
1843      break;
1844
1845   default:
1846      assert(0);
1847   }
1848
1849   FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
1850      STORE(&r[chan_index], 0, chan_index);
1851   }
1852}
1853
1854
1855/**
1856 * Evaluate a constant-valued coefficient at the position of the
1857 * current quad.
1858 */
1859static void
1860eval_constant_coef(
1861   struct tgsi_exec_machine *mach,
1862   unsigned attrib,
1863   unsigned chan )
1864{
1865   unsigned i;
1866
1867   for( i = 0; i < QUAD_SIZE; i++ ) {
1868      mach->Inputs[attrib].xyzw[chan].f[i] = mach->InterpCoefs[attrib].a0[chan];
1869   }
1870}
1871
1872/**
1873 * Evaluate a linear-valued coefficient at the position of the
1874 * current quad.
1875 */
1876static void
1877eval_linear_coef(
1878   struct tgsi_exec_machine *mach,
1879   unsigned attrib,
1880   unsigned chan )
1881{
1882   const float x = mach->QuadPos.xyzw[0].f[0];
1883   const float y = mach->QuadPos.xyzw[1].f[0];
1884   const float dadx = mach->InterpCoefs[attrib].dadx[chan];
1885   const float dady = mach->InterpCoefs[attrib].dady[chan];
1886   const float a0 = mach->InterpCoefs[attrib].a0[chan] + dadx * x + dady * y;
1887   mach->Inputs[attrib].xyzw[chan].f[0] = a0;
1888   mach->Inputs[attrib].xyzw[chan].f[1] = a0 + dadx;
1889   mach->Inputs[attrib].xyzw[chan].f[2] = a0 + dady;
1890   mach->Inputs[attrib].xyzw[chan].f[3] = a0 + dadx + dady;
1891}
1892
1893/**
1894 * Evaluate a perspective-valued coefficient at the position of the
1895 * current quad.
1896 */
1897static void
1898eval_perspective_coef(
1899   struct tgsi_exec_machine *mach,
1900   unsigned attrib,
1901   unsigned chan )
1902{
1903   const float x = mach->QuadPos.xyzw[0].f[0];
1904   const float y = mach->QuadPos.xyzw[1].f[0];
1905   const float dadx = mach->InterpCoefs[attrib].dadx[chan];
1906   const float dady = mach->InterpCoefs[attrib].dady[chan];
1907   const float a0 = mach->InterpCoefs[attrib].a0[chan] + dadx * x + dady * y;
1908   const float *w = mach->QuadPos.xyzw[3].f;
1909   /* divide by W here */
1910   mach->Inputs[attrib].xyzw[chan].f[0] = a0 / w[0];
1911   mach->Inputs[attrib].xyzw[chan].f[1] = (a0 + dadx) / w[1];
1912   mach->Inputs[attrib].xyzw[chan].f[2] = (a0 + dady) / w[2];
1913   mach->Inputs[attrib].xyzw[chan].f[3] = (a0 + dadx + dady) / w[3];
1914}
1915
1916
/**
 * Signature shared by the eval_*_coef() functions: evaluate channel
 * 'chan' of input attribute 'attrib' for the current quad.
 */
typedef void (*eval_coef_func)(struct tgsi_exec_machine *mach,
                               unsigned attrib,
                               unsigned chan);
1921
1922static void
1923exec_declaration(struct tgsi_exec_machine *mach,
1924                 const struct tgsi_full_declaration *decl)
1925{
1926   if (mach->Processor == TGSI_PROCESSOR_FRAGMENT) {
1927      if (decl->Declaration.File == TGSI_FILE_INPUT ||
1928          decl->Declaration.File == TGSI_FILE_SYSTEM_VALUE) {
1929         uint first, last, mask;
1930
1931         first = decl->Range.First;
1932         last = decl->Range.Last;
1933         mask = decl->Declaration.UsageMask;
1934
1935         /* XXX we could remove this special-case code since
1936          * mach->InterpCoefs[first].a0 should already have the
1937          * front/back-face value.  But we should first update the
1938          * ureg code to emit the right UsageMask value (WRITEMASK_X).
1939          * Then, we could remove the tgsi_exec_machine::Face field.
1940          */
1941         if (decl->Semantic.Name == TGSI_SEMANTIC_FACE) {
1942            uint i;
1943
1944            assert(decl->Semantic.Index == 0);
1945            assert(first == last);
1946
1947            for (i = 0; i < QUAD_SIZE; i++) {
1948               mach->Inputs[first].xyzw[0].f[i] = mach->Face;
1949            }
1950         } else {
1951            eval_coef_func eval;
1952            uint i, j;
1953
1954            switch (decl->Declaration.Interpolate) {
1955            case TGSI_INTERPOLATE_CONSTANT:
1956               eval = eval_constant_coef;
1957               break;
1958
1959            case TGSI_INTERPOLATE_LINEAR:
1960               eval = eval_linear_coef;
1961               break;
1962
1963            case TGSI_INTERPOLATE_PERSPECTIVE:
1964               eval = eval_perspective_coef;
1965               break;
1966
1967            default:
1968               assert(0);
1969               return;
1970            }
1971
1972            for (j = 0; j < NUM_CHANNELS; j++) {
1973               if (mask & (1 << j)) {
1974                  for (i = first; i <= last; i++) {
1975                     eval(mach, i, j);
1976                  }
1977               }
1978            }
1979         }
1980      }
1981   }
1982}
1983
/**
 * Signature of a one-operand channel operation: reads \p src and writes
 * the result to \p dst.
 */
typedef void (*micro_unary_op)(union tgsi_exec_channel *dst,
                               const union tgsi_exec_channel *src);
1986
1987static void
1988exec_scalar_unary(struct tgsi_exec_machine *mach,
1989                  const struct tgsi_full_instruction *inst,
1990                  micro_unary_op op,
1991                  enum tgsi_exec_datatype dst_datatype,
1992                  enum tgsi_exec_datatype src_datatype)
1993{
1994   unsigned int chan;
1995   union tgsi_exec_channel src;
1996   union tgsi_exec_channel dst;
1997
1998   fetch_source(mach, &src, &inst->Src[0], CHAN_X, src_datatype);
1999   op(&dst, &src);
2000   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2001      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2002         store_dest(mach, &dst, &inst->Dst[0], inst, chan, dst_datatype);
2003      }
2004   }
2005}
2006
2007static void
2008exec_vector_unary(struct tgsi_exec_machine *mach,
2009                  const struct tgsi_full_instruction *inst,
2010                  micro_unary_op op,
2011                  enum tgsi_exec_datatype dst_datatype,
2012                  enum tgsi_exec_datatype src_datatype)
2013{
2014   unsigned int chan;
2015   struct tgsi_exec_vector dst;
2016
2017   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2018      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2019         union tgsi_exec_channel src;
2020
2021         fetch_source(mach, &src, &inst->Src[0], chan, src_datatype);
2022         op(&dst.xyzw[chan], &src);
2023      }
2024   }
2025   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2026      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2027         store_dest(mach, &dst.xyzw[chan], &inst->Dst[0], inst, chan, dst_datatype);
2028      }
2029   }
2030}
2031
/**
 * Signature of a two-operand channel operation: reads \p src0 and
 * \p src1 and writes the result to \p dst.
 */
typedef void (*micro_binary_op)(union tgsi_exec_channel *dst,
                                const union tgsi_exec_channel *src0,
                                const union tgsi_exec_channel *src1);
2035
2036static void
2037exec_vector_binary(struct tgsi_exec_machine *mach,
2038                   const struct tgsi_full_instruction *inst,
2039                   micro_binary_op op,
2040                   enum tgsi_exec_datatype dst_datatype,
2041                   enum tgsi_exec_datatype src_datatype)
2042{
2043   unsigned int chan;
2044   struct tgsi_exec_vector dst;
2045
2046   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2047      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2048         union tgsi_exec_channel src[2];
2049
2050         fetch_source(mach, &src[0], &inst->Src[0], chan, src_datatype);
2051         fetch_source(mach, &src[1], &inst->Src[1], chan, src_datatype);
2052         op(&dst.xyzw[chan], &src[0], &src[1]);
2053      }
2054   }
2055   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2056      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2057         store_dest(mach, &dst.xyzw[chan], &inst->Dst[0], inst, chan, dst_datatype);
2058      }
2059   }
2060}
2061
/**
 * Signature of a three-operand channel operation: reads \p src0,
 * \p src1 and \p src2 and writes the result to \p dst.
 */
typedef void (*micro_trinary_op)(union tgsi_exec_channel *dst,
                                 const union tgsi_exec_channel *src0,
                                 const union tgsi_exec_channel *src1,
                                 const union tgsi_exec_channel *src2);
2066
2067static void
2068exec_vector_trinary(struct tgsi_exec_machine *mach,
2069                    const struct tgsi_full_instruction *inst,
2070                    micro_trinary_op op,
2071                    enum tgsi_exec_datatype dst_datatype,
2072                    enum tgsi_exec_datatype src_datatype)
2073{
2074   unsigned int chan;
2075   struct tgsi_exec_vector dst;
2076
2077   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2078      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2079         union tgsi_exec_channel src[3];
2080
2081         fetch_source(mach, &src[0], &inst->Src[0], chan, src_datatype);
2082         fetch_source(mach, &src[1], &inst->Src[1], chan, src_datatype);
2083         fetch_source(mach, &src[2], &inst->Src[2], chan, src_datatype);
2084         op(&dst.xyzw[chan], &src[0], &src[1], &src[2]);
2085      }
2086   }
2087   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2088      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2089         store_dest(mach, &dst.xyzw[chan], &inst->Dst[0], inst, chan, dst_datatype);
2090      }
2091   }
2092}
2093
2094static void
2095exec_dp3(struct tgsi_exec_machine *mach,
2096         const struct tgsi_full_instruction *inst)
2097{
2098   unsigned int chan;
2099   union tgsi_exec_channel arg[3];
2100
2101   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2102   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2103   micro_mul(&arg[2], &arg[0], &arg[1]);
2104
2105   for (chan = CHAN_Y; chan <= CHAN_Z; chan++) {
2106      fetch_source(mach, &arg[0], &inst->Src[0], chan, TGSI_EXEC_DATA_FLOAT);
2107      fetch_source(mach, &arg[1], &inst->Src[1], chan, TGSI_EXEC_DATA_FLOAT);
2108      micro_mad(&arg[2], &arg[0], &arg[1], &arg[2]);
2109   }
2110
2111   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2112      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2113         store_dest(mach, &arg[2], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2114      }
2115   }
2116}
2117
2118static void
2119exec_dp4(struct tgsi_exec_machine *mach,
2120         const struct tgsi_full_instruction *inst)
2121{
2122   unsigned int chan;
2123   union tgsi_exec_channel arg[3];
2124
2125   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2126   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2127   micro_mul(&arg[2], &arg[0], &arg[1]);
2128
2129   for (chan = CHAN_Y; chan <= CHAN_W; chan++) {
2130      fetch_source(mach, &arg[0], &inst->Src[0], chan, TGSI_EXEC_DATA_FLOAT);
2131      fetch_source(mach, &arg[1], &inst->Src[1], chan, TGSI_EXEC_DATA_FLOAT);
2132      micro_mad(&arg[2], &arg[0], &arg[1], &arg[2]);
2133   }
2134
2135   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2136      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2137         store_dest(mach, &arg[2], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2138      }
2139   }
2140}
2141
2142static void
2143exec_dp2a(struct tgsi_exec_machine *mach,
2144          const struct tgsi_full_instruction *inst)
2145{
2146   unsigned int chan;
2147   union tgsi_exec_channel arg[3];
2148
2149   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2150   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2151   micro_mul(&arg[2], &arg[0], &arg[1]);
2152
2153   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2154   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2155   micro_mad(&arg[0], &arg[0], &arg[1], &arg[2]);
2156
2157   fetch_source(mach, &arg[1], &inst->Src[2], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2158   micro_add(&arg[0], &arg[0], &arg[1]);
2159
2160   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2161      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2162         store_dest(mach, &arg[0], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2163      }
2164   }
2165}
2166
2167static void
2168exec_dph(struct tgsi_exec_machine *mach,
2169         const struct tgsi_full_instruction *inst)
2170{
2171   unsigned int chan;
2172   union tgsi_exec_channel arg[3];
2173
2174   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2175   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2176   micro_mul(&arg[2], &arg[0], &arg[1]);
2177
2178   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2179   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2180   micro_mad(&arg[2], &arg[0], &arg[1], &arg[2]);
2181
2182   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2183   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2184   micro_mad(&arg[0], &arg[0], &arg[1], &arg[2]);
2185
2186   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_W, TGSI_EXEC_DATA_FLOAT);
2187   micro_add(&arg[0], &arg[0], &arg[1]);
2188
2189   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2190      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2191         store_dest(mach, &arg[0], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2192      }
2193   }
2194}
2195
2196static void
2197exec_dp2(struct tgsi_exec_machine *mach,
2198         const struct tgsi_full_instruction *inst)
2199{
2200   unsigned int chan;
2201   union tgsi_exec_channel arg[3];
2202
2203   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2204   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2205   micro_mul(&arg[2], &arg[0], &arg[1]);
2206
2207   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2208   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2209   micro_mad(&arg[2], &arg[0], &arg[1], &arg[2]);
2210
2211   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2212      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2213         store_dest(mach, &arg[2], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2214      }
2215   }
2216}
2217
2218static void
2219exec_nrm4(struct tgsi_exec_machine *mach,
2220          const struct tgsi_full_instruction *inst)
2221{
2222   unsigned int chan;
2223   union tgsi_exec_channel arg[4];
2224   union tgsi_exec_channel scale;
2225
2226   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2227   micro_mul(&scale, &arg[0], &arg[0]);
2228
2229   for (chan = CHAN_Y; chan <= CHAN_W; chan++) {
2230      union tgsi_exec_channel product;
2231
2232      fetch_source(mach, &arg[chan], &inst->Src[0], chan, TGSI_EXEC_DATA_FLOAT);
2233      micro_mul(&product, &arg[chan], &arg[chan]);
2234      micro_add(&scale, &scale, &product);
2235   }
2236
2237   micro_rsq(&scale, &scale);
2238
2239   for (chan = CHAN_X; chan <= CHAN_W; chan++) {
2240      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2241         micro_mul(&arg[chan], &arg[chan], &scale);
2242         store_dest(mach, &arg[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2243      }
2244   }
2245}
2246
2247static void
2248exec_nrm3(struct tgsi_exec_machine *mach,
2249          const struct tgsi_full_instruction *inst)
2250{
2251   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_XYZ) {
2252      unsigned int chan;
2253      union tgsi_exec_channel arg[3];
2254      union tgsi_exec_channel scale;
2255
2256      fetch_source(mach, &arg[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2257      micro_mul(&scale, &arg[0], &arg[0]);
2258
2259      for (chan = CHAN_Y; chan <= CHAN_Z; chan++) {
2260         union tgsi_exec_channel product;
2261
2262         fetch_source(mach, &arg[chan], &inst->Src[0], chan, TGSI_EXEC_DATA_FLOAT);
2263         micro_mul(&product, &arg[chan], &arg[chan]);
2264         micro_add(&scale, &scale, &product);
2265      }
2266
2267      micro_rsq(&scale, &scale);
2268
2269      for (chan = CHAN_X; chan <= CHAN_Z; chan++) {
2270         if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2271            micro_mul(&arg[chan], &arg[chan], &scale);
2272            store_dest(mach, &arg[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2273         }
2274      }
2275   }
2276
2277   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
2278      store_dest(mach, &OneVec, &inst->Dst[0], inst, CHAN_W, TGSI_EXEC_DATA_FLOAT);
2279   }
2280}
2281
2282static void
2283exec_break(struct tgsi_exec_machine *mach)
2284{
2285   if (mach->BreakType == TGSI_EXEC_BREAK_INSIDE_LOOP) {
2286      /* turn off loop channels for each enabled exec channel */
2287      mach->LoopMask &= ~mach->ExecMask;
2288      /* Todo: if mach->LoopMask == 0, jump to end of loop */
2289      UPDATE_EXEC_MASK(mach);
2290   } else {
2291      assert(mach->BreakType == TGSI_EXEC_BREAK_INSIDE_SWITCH);
2292
2293      mach->Switch.mask = 0x0;
2294
2295      UPDATE_EXEC_MASK(mach);
2296   }
2297}
2298
2299static void
2300exec_switch(struct tgsi_exec_machine *mach,
2301            const struct tgsi_full_instruction *inst)
2302{
2303   assert(mach->SwitchStackTop < TGSI_EXEC_MAX_SWITCH_NESTING);
2304   assert(mach->BreakStackTop < TGSI_EXEC_MAX_BREAK_STACK);
2305
2306   mach->SwitchStack[mach->SwitchStackTop++] = mach->Switch;
2307   fetch_source(mach, &mach->Switch.selector, &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_UINT);
2308   mach->Switch.mask = 0x0;
2309   mach->Switch.defaultMask = 0x0;
2310
2311   mach->BreakStack[mach->BreakStackTop++] = mach->BreakType;
2312   mach->BreakType = TGSI_EXEC_BREAK_INSIDE_SWITCH;
2313
2314   UPDATE_EXEC_MASK(mach);
2315}
2316
2317static void
2318exec_case(struct tgsi_exec_machine *mach,
2319          const struct tgsi_full_instruction *inst)
2320{
2321   uint prevMask = mach->SwitchStack[mach->SwitchStackTop - 1].mask;
2322   union tgsi_exec_channel src;
2323   uint mask = 0;
2324
2325   fetch_source(mach, &src, &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_UINT);
2326
2327   if (mach->Switch.selector.u[0] == src.u[0]) {
2328      mask |= 0x1;
2329   }
2330   if (mach->Switch.selector.u[1] == src.u[1]) {
2331      mask |= 0x2;
2332   }
2333   if (mach->Switch.selector.u[2] == src.u[2]) {
2334      mask |= 0x4;
2335   }
2336   if (mach->Switch.selector.u[3] == src.u[3]) {
2337      mask |= 0x8;
2338   }
2339
2340   mach->Switch.defaultMask |= mask;
2341
2342   mach->Switch.mask |= mask & prevMask;
2343
2344   UPDATE_EXEC_MASK(mach);
2345}
2346
2347static void
2348exec_default(struct tgsi_exec_machine *mach)
2349{
2350   uint prevMask = mach->SwitchStack[mach->SwitchStackTop - 1].mask;
2351
2352   mach->Switch.mask |= ~mach->Switch.defaultMask & prevMask;
2353
2354   UPDATE_EXEC_MASK(mach);
2355}
2356
2357static void
2358exec_endswitch(struct tgsi_exec_machine *mach)
2359{
2360   mach->Switch = mach->SwitchStack[--mach->SwitchStackTop];
2361   mach->BreakType = mach->BreakStack[--mach->BreakStackTop];
2362
2363   UPDATE_EXEC_MASK(mach);
2364}
2365
2366static void
2367micro_i2f(union tgsi_exec_channel *dst,
2368          const union tgsi_exec_channel *src)
2369{
2370   dst->f[0] = (float)src->i[0];
2371   dst->f[1] = (float)src->i[1];
2372   dst->f[2] = (float)src->i[2];
2373   dst->f[3] = (float)src->i[3];
2374}
2375
2376static void
2377micro_not(union tgsi_exec_channel *dst,
2378          const union tgsi_exec_channel *src)
2379{
2380   dst->u[0] = ~src->u[0];
2381   dst->u[1] = ~src->u[1];
2382   dst->u[2] = ~src->u[2];
2383   dst->u[3] = ~src->u[3];
2384}
2385
2386static void
2387micro_shl(union tgsi_exec_channel *dst,
2388          const union tgsi_exec_channel *src0,
2389          const union tgsi_exec_channel *src1)
2390{
2391   dst->u[0] = src0->u[0] << src1->u[0];
2392   dst->u[1] = src0->u[1] << src1->u[1];
2393   dst->u[2] = src0->u[2] << src1->u[2];
2394   dst->u[3] = src0->u[3] << src1->u[3];
2395}
2396
2397static void
2398micro_and(union tgsi_exec_channel *dst,
2399          const union tgsi_exec_channel *src0,
2400          const union tgsi_exec_channel *src1)
2401{
2402   dst->u[0] = src0->u[0] & src1->u[0];
2403   dst->u[1] = src0->u[1] & src1->u[1];
2404   dst->u[2] = src0->u[2] & src1->u[2];
2405   dst->u[3] = src0->u[3] & src1->u[3];
2406}
2407
2408static void
2409micro_or(union tgsi_exec_channel *dst,
2410         const union tgsi_exec_channel *src0,
2411         const union tgsi_exec_channel *src1)
2412{
2413   dst->u[0] = src0->u[0] | src1->u[0];
2414   dst->u[1] = src0->u[1] | src1->u[1];
2415   dst->u[2] = src0->u[2] | src1->u[2];
2416   dst->u[3] = src0->u[3] | src1->u[3];
2417}
2418
2419static void
2420micro_xor(union tgsi_exec_channel *dst,
2421          const union tgsi_exec_channel *src0,
2422          const union tgsi_exec_channel *src1)
2423{
2424   dst->u[0] = src0->u[0] ^ src1->u[0];
2425   dst->u[1] = src0->u[1] ^ src1->u[1];
2426   dst->u[2] = src0->u[2] ^ src1->u[2];
2427   dst->u[3] = src0->u[3] ^ src1->u[3];
2428}
2429
2430static void
2431micro_f2i(union tgsi_exec_channel *dst,
2432          const union tgsi_exec_channel *src)
2433{
2434   dst->i[0] = (int)src->f[0];
2435   dst->i[1] = (int)src->f[1];
2436   dst->i[2] = (int)src->f[2];
2437   dst->i[3] = (int)src->f[3];
2438}
2439
2440static void
2441micro_idiv(union tgsi_exec_channel *dst,
2442           const union tgsi_exec_channel *src0,
2443           const union tgsi_exec_channel *src1)
2444{
2445   dst->i[0] = src0->i[0] / src1->i[0];
2446   dst->i[1] = src0->i[1] / src1->i[1];
2447   dst->i[2] = src0->i[2] / src1->i[2];
2448   dst->i[3] = src0->i[3] / src1->i[3];
2449}
2450
2451static void
2452micro_imax(union tgsi_exec_channel *dst,
2453           const union tgsi_exec_channel *src0,
2454           const union tgsi_exec_channel *src1)
2455{
2456   dst->i[0] = src0->i[0] > src1->i[0] ? src0->i[0] : src1->i[0];
2457   dst->i[1] = src0->i[1] > src1->i[1] ? src0->i[1] : src1->i[1];
2458   dst->i[2] = src0->i[2] > src1->i[2] ? src0->i[2] : src1->i[2];
2459   dst->i[3] = src0->i[3] > src1->i[3] ? src0->i[3] : src1->i[3];
2460}
2461
2462static void
2463micro_imin(union tgsi_exec_channel *dst,
2464           const union tgsi_exec_channel *src0,
2465           const union tgsi_exec_channel *src1)
2466{
2467   dst->i[0] = src0->i[0] < src1->i[0] ? src0->i[0] : src1->i[0];
2468   dst->i[1] = src0->i[1] < src1->i[1] ? src0->i[1] : src1->i[1];
2469   dst->i[2] = src0->i[2] < src1->i[2] ? src0->i[2] : src1->i[2];
2470   dst->i[3] = src0->i[3] < src1->i[3] ? src0->i[3] : src1->i[3];
2471}
2472
2473static void
2474micro_isge(union tgsi_exec_channel *dst,
2475           const union tgsi_exec_channel *src0,
2476           const union tgsi_exec_channel *src1)
2477{
2478   dst->i[0] = src0->i[0] >= src1->i[0] ? -1 : 0;
2479   dst->i[1] = src0->i[1] >= src1->i[1] ? -1 : 0;
2480   dst->i[2] = src0->i[2] >= src1->i[2] ? -1 : 0;
2481   dst->i[3] = src0->i[3] >= src1->i[3] ? -1 : 0;
2482}
2483
2484static void
2485micro_ishr(union tgsi_exec_channel *dst,
2486           const union tgsi_exec_channel *src0,
2487           const union tgsi_exec_channel *src1)
2488{
2489   dst->i[0] = src0->i[0] >> src1->i[0];
2490   dst->i[1] = src0->i[1] >> src1->i[1];
2491   dst->i[2] = src0->i[2] >> src1->i[2];
2492   dst->i[3] = src0->i[3] >> src1->i[3];
2493}
2494
2495static void
2496micro_islt(union tgsi_exec_channel *dst,
2497           const union tgsi_exec_channel *src0,
2498           const union tgsi_exec_channel *src1)
2499{
2500   dst->i[0] = src0->i[0] < src1->i[0] ? -1 : 0;
2501   dst->i[1] = src0->i[1] < src1->i[1] ? -1 : 0;
2502   dst->i[2] = src0->i[2] < src1->i[2] ? -1 : 0;
2503   dst->i[3] = src0->i[3] < src1->i[3] ? -1 : 0;
2504}
2505
2506static void
2507micro_f2u(union tgsi_exec_channel *dst,
2508          const union tgsi_exec_channel *src)
2509{
2510   dst->u[0] = (uint)src->f[0];
2511   dst->u[1] = (uint)src->f[1];
2512   dst->u[2] = (uint)src->f[2];
2513   dst->u[3] = (uint)src->f[3];
2514}
2515
2516static void
2517micro_u2f(union tgsi_exec_channel *dst,
2518          const union tgsi_exec_channel *src)
2519{
2520   dst->f[0] = (float)src->u[0];
2521   dst->f[1] = (float)src->u[1];
2522   dst->f[2] = (float)src->u[2];
2523   dst->f[3] = (float)src->u[3];
2524}
2525
2526static void
2527micro_uadd(union tgsi_exec_channel *dst,
2528           const union tgsi_exec_channel *src0,
2529           const union tgsi_exec_channel *src1)
2530{
2531   dst->u[0] = src0->u[0] + src1->u[0];
2532   dst->u[1] = src0->u[1] + src1->u[1];
2533   dst->u[2] = src0->u[2] + src1->u[2];
2534   dst->u[3] = src0->u[3] + src1->u[3];
2535}
2536
2537static void
2538micro_udiv(union tgsi_exec_channel *dst,
2539           const union tgsi_exec_channel *src0,
2540           const union tgsi_exec_channel *src1)
2541{
2542   dst->u[0] = src0->u[0] / src1->u[0];
2543   dst->u[1] = src0->u[1] / src1->u[1];
2544   dst->u[2] = src0->u[2] / src1->u[2];
2545   dst->u[3] = src0->u[3] / src1->u[3];
2546}
2547
2548static void
2549micro_umad(union tgsi_exec_channel *dst,
2550           const union tgsi_exec_channel *src0,
2551           const union tgsi_exec_channel *src1,
2552           const union tgsi_exec_channel *src2)
2553{
2554   dst->u[0] = src0->u[0] * src1->u[0] + src2->u[0];
2555   dst->u[1] = src0->u[1] * src1->u[1] + src2->u[1];
2556   dst->u[2] = src0->u[2] * src1->u[2] + src2->u[2];
2557   dst->u[3] = src0->u[3] * src1->u[3] + src2->u[3];
2558}
2559
2560static void
2561micro_umax(union tgsi_exec_channel *dst,
2562           const union tgsi_exec_channel *src0,
2563           const union tgsi_exec_channel *src1)
2564{
2565   dst->u[0] = src0->u[0] > src1->u[0] ? src0->u[0] : src1->u[0];
2566   dst->u[1] = src0->u[1] > src1->u[1] ? src0->u[1] : src1->u[1];
2567   dst->u[2] = src0->u[2] > src1->u[2] ? src0->u[2] : src1->u[2];
2568   dst->u[3] = src0->u[3] > src1->u[3] ? src0->u[3] : src1->u[3];
2569}
2570
2571static void
2572micro_umin(union tgsi_exec_channel *dst,
2573           const union tgsi_exec_channel *src0,
2574           const union tgsi_exec_channel *src1)
2575{
2576   dst->u[0] = src0->u[0] < src1->u[0] ? src0->u[0] : src1->u[0];
2577   dst->u[1] = src0->u[1] < src1->u[1] ? src0->u[1] : src1->u[1];
2578   dst->u[2] = src0->u[2] < src1->u[2] ? src0->u[2] : src1->u[2];
2579   dst->u[3] = src0->u[3] < src1->u[3] ? src0->u[3] : src1->u[3];
2580}
2581
2582static void
2583micro_umod(union tgsi_exec_channel *dst,
2584           const union tgsi_exec_channel *src0,
2585           const union tgsi_exec_channel *src1)
2586{
2587   dst->u[0] = src0->u[0] % src1->u[0];
2588   dst->u[1] = src0->u[1] % src1->u[1];
2589   dst->u[2] = src0->u[2] % src1->u[2];
2590   dst->u[3] = src0->u[3] % src1->u[3];
2591}
2592
2593static void
2594micro_umul(union tgsi_exec_channel *dst,
2595           const union tgsi_exec_channel *src0,
2596           const union tgsi_exec_channel *src1)
2597{
2598   dst->u[0] = src0->u[0] * src1->u[0];
2599   dst->u[1] = src0->u[1] * src1->u[1];
2600   dst->u[2] = src0->u[2] * src1->u[2];
2601   dst->u[3] = src0->u[3] * src1->u[3];
2602}
2603
2604static void
2605micro_useq(union tgsi_exec_channel *dst,
2606           const union tgsi_exec_channel *src0,
2607           const union tgsi_exec_channel *src1)
2608{
2609   dst->u[0] = src0->u[0] == src1->u[0] ? ~0 : 0;
2610   dst->u[1] = src0->u[1] == src1->u[1] ? ~0 : 0;
2611   dst->u[2] = src0->u[2] == src1->u[2] ? ~0 : 0;
2612   dst->u[3] = src0->u[3] == src1->u[3] ? ~0 : 0;
2613}
2614
2615static void
2616micro_usge(union tgsi_exec_channel *dst,
2617           const union tgsi_exec_channel *src0,
2618           const union tgsi_exec_channel *src1)
2619{
2620   dst->u[0] = src0->u[0] >= src1->u[0] ? ~0 : 0;
2621   dst->u[1] = src0->u[1] >= src1->u[1] ? ~0 : 0;
2622   dst->u[2] = src0->u[2] >= src1->u[2] ? ~0 : 0;
2623   dst->u[3] = src0->u[3] >= src1->u[3] ? ~0 : 0;
2624}
2625
2626static void
2627micro_ushr(union tgsi_exec_channel *dst,
2628           const union tgsi_exec_channel *src0,
2629           const union tgsi_exec_channel *src1)
2630{
2631   dst->u[0] = src0->u[0] >> src1->u[0];
2632   dst->u[1] = src0->u[1] >> src1->u[1];
2633   dst->u[2] = src0->u[2] >> src1->u[2];
2634   dst->u[3] = src0->u[3] >> src1->u[3];
2635}
2636
2637static void
2638micro_uslt(union tgsi_exec_channel *dst,
2639           const union tgsi_exec_channel *src0,
2640           const union tgsi_exec_channel *src1)
2641{
2642   dst->u[0] = src0->u[0] < src1->u[0] ? ~0 : 0;
2643   dst->u[1] = src0->u[1] < src1->u[1] ? ~0 : 0;
2644   dst->u[2] = src0->u[2] < src1->u[2] ? ~0 : 0;
2645   dst->u[3] = src0->u[3] < src1->u[3] ? ~0 : 0;
2646}
2647
2648static void
2649micro_usne(union tgsi_exec_channel *dst,
2650           const union tgsi_exec_channel *src0,
2651           const union tgsi_exec_channel *src1)
2652{
2653   dst->u[0] = src0->u[0] != src1->u[0] ? ~0 : 0;
2654   dst->u[1] = src0->u[1] != src1->u[1] ? ~0 : 0;
2655   dst->u[2] = src0->u[2] != src1->u[2] ? ~0 : 0;
2656   dst->u[3] = src0->u[3] != src1->u[3] ? ~0 : 0;
2657}
2658
2659static void
2660exec_instruction(
2661   struct tgsi_exec_machine *mach,
2662   const struct tgsi_full_instruction *inst,
2663   int *pc )
2664{
2665   uint chan_index;
2666   union tgsi_exec_channel r[10];
2667   union tgsi_exec_channel d[8];
2668
2669   (*pc)++;
2670
2671   switch (inst->Instruction.Opcode) {
2672   case TGSI_OPCODE_ARL:
2673      exec_vector_unary(mach, inst, micro_arl, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_FLOAT);
2674      break;
2675
2676   case TGSI_OPCODE_MOV:
2677      exec_vector_unary(mach, inst, micro_mov, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_FLOAT);
2678      break;
2679
2680   case TGSI_OPCODE_LIT:
2681      if (IS_CHANNEL_ENABLED( *inst, CHAN_Y ) || IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
2682         FETCH( &r[0], 0, CHAN_X );
2683         if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
2684            micro_max(&d[CHAN_Y], &r[0], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C]);
2685         }
2686
2687         if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
2688            FETCH( &r[1], 0, CHAN_Y );
2689            micro_max( &r[1], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] );
2690
2691            FETCH( &r[2], 0, CHAN_W );
2692            micro_min( &r[2], &r[2], &mach->Temps[TEMP_128_I].xyzw[TEMP_128_C] );
2693            micro_max( &r[2], &r[2], &mach->Temps[TEMP_M128_I].xyzw[TEMP_M128_C] );
2694            micro_pow( &r[1], &r[1], &r[2] );
2695            micro_lt(&d[CHAN_Z], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &r[0], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C]);
2696         }
2697
2698         if (IS_CHANNEL_ENABLED(*inst, CHAN_Y)) {
2699            STORE(&d[CHAN_Y], 0, CHAN_Y);
2700         }
2701         if (IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2702            STORE(&d[CHAN_Z], 0, CHAN_Z);
2703         }
2704      }
2705      if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) {
2706         STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_X );
2707      }
2708      if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
2709         STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
2710      }
2711      break;
2712
2713   case TGSI_OPCODE_RCP:
2714      exec_scalar_unary(mach, inst, micro_rcp, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2715      break;
2716
2717   case TGSI_OPCODE_RSQ:
2718      exec_scalar_unary(mach, inst, micro_rsq, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2719      break;
2720
2721   case TGSI_OPCODE_EXP:
2722      FETCH( &r[0], 0, CHAN_X );
2723      micro_flr( &r[1], &r[0] );  /* r1 = floor(r0) */
2724      if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) {
2725         micro_exp2( &r[2], &r[1] );       /* r2 = 2 ^ r1 */
2726         STORE( &r[2], 0, CHAN_X );        /* store r2 */
2727      }
2728      if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
2729         micro_sub( &r[2], &r[0], &r[1] ); /* r2 = r0 - r1 */
2730         STORE( &r[2], 0, CHAN_Y );        /* store r2 */
2731      }
2732      if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
2733         micro_exp2( &r[2], &r[0] );       /* r2 = 2 ^ r0 */
2734         STORE( &r[2], 0, CHAN_Z );        /* store r2 */
2735      }
2736      if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
2737         STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
2738      }
2739      break;
2740
2741   case TGSI_OPCODE_LOG:
2742      FETCH( &r[0], 0, CHAN_X );
2743      micro_abs( &r[2], &r[0] );  /* r2 = abs(r0) */
2744      micro_lg2( &r[1], &r[2] );  /* r1 = lg2(r2) */
2745      micro_flr( &r[0], &r[1] );  /* r0 = floor(r1) */
2746      if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) {
2747         STORE( &r[0], 0, CHAN_X );
2748      }
2749      if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
2750         micro_exp2( &r[0], &r[0] );       /* r0 = 2 ^ r0 */
2751         micro_div( &r[0], &r[2], &r[0] ); /* r0 = r2 / r0 */
2752         STORE( &r[0], 0, CHAN_Y );
2753      }
2754      if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
2755         STORE( &r[1], 0, CHAN_Z );
2756      }
2757      if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
2758         STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
2759      }
2760      break;
2761
2762   case TGSI_OPCODE_MUL:
2763      exec_vector_binary(mach, inst, micro_mul, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2764      break;
2765
2766   case TGSI_OPCODE_ADD:
2767      exec_vector_binary(mach, inst, micro_add, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2768      break;
2769
2770   case TGSI_OPCODE_DP3:
2771      exec_dp3(mach, inst);
2772      break;
2773
2774   case TGSI_OPCODE_DP4:
2775      exec_dp4(mach, inst);
2776      break;
2777
2778   case TGSI_OPCODE_DST:
2779      if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
2780         FETCH( &r[0], 0, CHAN_Y );
2781         FETCH( &r[1], 1, CHAN_Y);
2782         micro_mul(&d[CHAN_Y], &r[0], &r[1]);
2783      }
2784      if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
2785         FETCH(&d[CHAN_Z], 0, CHAN_Z);
2786      }
2787      if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
2788         FETCH(&d[CHAN_W], 1, CHAN_W);
2789      }
2790
2791      if (IS_CHANNEL_ENABLED(*inst, CHAN_X)) {
2792         STORE(&mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_X);
2793      }
2794      if (IS_CHANNEL_ENABLED(*inst, CHAN_Y)) {
2795         STORE(&d[CHAN_Y], 0, CHAN_Y);
2796      }
2797      if (IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2798         STORE(&d[CHAN_Z], 0, CHAN_Z);
2799      }
2800      if (IS_CHANNEL_ENABLED(*inst, CHAN_W)) {
2801         STORE(&d[CHAN_W], 0, CHAN_W);
2802      }
2803      break;
2804
2805   case TGSI_OPCODE_MIN:
2806      exec_vector_binary(mach, inst, micro_min, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2807      break;
2808
2809   case TGSI_OPCODE_MAX:
2810      exec_vector_binary(mach, inst, micro_max, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2811      break;
2812
2813   case TGSI_OPCODE_SLT:
2814      exec_vector_binary(mach, inst, micro_slt, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2815      break;
2816
2817   case TGSI_OPCODE_SGE:
2818      exec_vector_binary(mach, inst, micro_sge, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2819      break;
2820
2821   case TGSI_OPCODE_MAD:
2822      exec_vector_trinary(mach, inst, micro_mad, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2823      break;
2824
2825   case TGSI_OPCODE_SUB:
2826      exec_vector_binary(mach, inst, micro_sub, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2827      break;
2828
2829   case TGSI_OPCODE_LRP:
2830      exec_vector_trinary(mach, inst, micro_lrp, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2831      break;
2832
2833   case TGSI_OPCODE_CND:
2834      exec_vector_trinary(mach, inst, micro_cnd, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2835      break;
2836
2837   case TGSI_OPCODE_DP2A:
2838      exec_dp2a(mach, inst);
2839      break;
2840
2841   case TGSI_OPCODE_FRC:
2842      exec_vector_unary(mach, inst, micro_frc, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2843      break;
2844
2845   case TGSI_OPCODE_CLAMP:
2846      exec_vector_trinary(mach, inst, micro_clamp, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2847      break;
2848
2849   case TGSI_OPCODE_FLR:
2850      exec_vector_unary(mach, inst, micro_flr, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2851      break;
2852
2853   case TGSI_OPCODE_ROUND:
2854      exec_vector_unary(mach, inst, micro_rnd, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2855      break;
2856
2857   case TGSI_OPCODE_EX2:
2858      exec_scalar_unary(mach, inst, micro_exp2, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2859      break;
2860
2861   case TGSI_OPCODE_LG2:
2862      exec_scalar_unary(mach, inst, micro_lg2, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2863      break;
2864
2865   case TGSI_OPCODE_POW:
2866      FETCH(&r[0], 0, CHAN_X);
2867      FETCH(&r[1], 1, CHAN_X);
2868
2869      micro_pow( &r[0], &r[0], &r[1] );
2870
2871      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2872         STORE( &r[0], 0, chan_index );
2873      }
2874      break;
2875
2876   case TGSI_OPCODE_XPD:
2877      FETCH(&r[0], 0, CHAN_Y);
2878      FETCH(&r[1], 1, CHAN_Z);
2879
2880      micro_mul( &r[2], &r[0], &r[1] );
2881
2882      FETCH(&r[3], 0, CHAN_Z);
2883      FETCH(&r[4], 1, CHAN_Y);
2884
2885      micro_mul( &r[5], &r[3], &r[4] );
2886      micro_sub(&d[CHAN_X], &r[2], &r[5]);
2887
2888      FETCH(&r[2], 1, CHAN_X);
2889
2890      micro_mul( &r[3], &r[3], &r[2] );
2891
2892      FETCH(&r[5], 0, CHAN_X);
2893
2894      micro_mul( &r[1], &r[1], &r[5] );
2895      micro_sub(&d[CHAN_Y], &r[3], &r[1]);
2896
2897      micro_mul( &r[5], &r[5], &r[4] );
2898      micro_mul( &r[0], &r[0], &r[2] );
2899      micro_sub(&d[CHAN_Z], &r[5], &r[0]);
2900
2901      if (IS_CHANNEL_ENABLED(*inst, CHAN_X)) {
2902         STORE(&d[CHAN_X], 0, CHAN_X);
2903      }
2904      if (IS_CHANNEL_ENABLED(*inst, CHAN_Y)) {
2905         STORE(&d[CHAN_Y], 0, CHAN_Y);
2906      }
2907      if (IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2908         STORE(&d[CHAN_Z], 0, CHAN_Z);
2909      }
2910      if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
2911         STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
2912      }
2913      break;
2914
2915   case TGSI_OPCODE_ABS:
2916      exec_vector_unary(mach, inst, micro_abs, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2917      break;
2918
2919   case TGSI_OPCODE_RCC:
2920      FETCH(&r[0], 0, CHAN_X);
2921      micro_div(&r[0], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &r[0]);
2922      micro_float_clamp(&r[0], &r[0]);
2923      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2924         STORE(&r[0], 0, chan_index);
2925      }
2926      break;
2927
2928   case TGSI_OPCODE_DPH:
2929      exec_dph(mach, inst);
2930      break;
2931
2932   case TGSI_OPCODE_COS:
2933      exec_scalar_unary(mach, inst, micro_cos, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2934      break;
2935
2936   case TGSI_OPCODE_DDX:
2937      exec_vector_unary(mach, inst, micro_ddx, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2938      break;
2939
2940   case TGSI_OPCODE_DDY:
2941      exec_vector_unary(mach, inst, micro_ddy, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2942      break;
2943
2944   case TGSI_OPCODE_KILP:
2945      exec_kilp (mach, inst);
2946      break;
2947
2948   case TGSI_OPCODE_KIL:
2949      exec_kil (mach, inst);
2950      break;
2951
2952   case TGSI_OPCODE_PK2H:
2953      assert (0);
2954      break;
2955
2956   case TGSI_OPCODE_PK2US:
2957      assert (0);
2958      break;
2959
2960   case TGSI_OPCODE_PK4B:
2961      assert (0);
2962      break;
2963
2964   case TGSI_OPCODE_PK4UB:
2965      assert (0);
2966      break;
2967
2968   case TGSI_OPCODE_RFL:
2969      if (IS_CHANNEL_ENABLED(*inst, CHAN_X) ||
2970          IS_CHANNEL_ENABLED(*inst, CHAN_Y) ||
2971          IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2972         /* r0 = dp3(src0, src0) */
2973         FETCH(&r[2], 0, CHAN_X);
2974         micro_mul(&r[0], &r[2], &r[2]);
2975         FETCH(&r[4], 0, CHAN_Y);
2976         micro_mul(&r[8], &r[4], &r[4]);
2977         micro_add(&r[0], &r[0], &r[8]);
2978         FETCH(&r[6], 0, CHAN_Z);
2979         micro_mul(&r[8], &r[6], &r[6]);
2980         micro_add(&r[0], &r[0], &r[8]);
2981
2982         /* r1 = dp3(src0, src1) */
2983         FETCH(&r[3], 1, CHAN_X);
2984         micro_mul(&r[1], &r[2], &r[3]);
2985         FETCH(&r[5], 1, CHAN_Y);
2986         micro_mul(&r[8], &r[4], &r[5]);
2987         micro_add(&r[1], &r[1], &r[8]);
2988         FETCH(&r[7], 1, CHAN_Z);
2989         micro_mul(&r[8], &r[6], &r[7]);
2990         micro_add(&r[1], &r[1], &r[8]);
2991
2992         /* r1 = 2 * r1 / r0 */
2993         micro_add(&r[1], &r[1], &r[1]);
2994         micro_div(&r[1], &r[1], &r[0]);
2995
2996         if (IS_CHANNEL_ENABLED(*inst, CHAN_X)) {
2997            micro_mul(&r[2], &r[2], &r[1]);
2998            micro_sub(&r[2], &r[2], &r[3]);
2999            STORE(&r[2], 0, CHAN_X);
3000         }
3001         if (IS_CHANNEL_ENABLED(*inst, CHAN_Y)) {
3002            micro_mul(&r[4], &r[4], &r[1]);
3003            micro_sub(&r[4], &r[4], &r[5]);
3004            STORE(&r[4], 0, CHAN_Y);
3005         }
3006         if (IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
3007            micro_mul(&r[6], &r[6], &r[1]);
3008            micro_sub(&r[6], &r[6], &r[7]);
3009            STORE(&r[6], 0, CHAN_Z);
3010         }
3011      }
3012      if (IS_CHANNEL_ENABLED(*inst, CHAN_W)) {
3013         STORE(&mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W);
3014      }
3015      break;
3016
3017   case TGSI_OPCODE_SEQ:
3018      exec_vector_binary(mach, inst, micro_seq, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3019      break;
3020
3021   case TGSI_OPCODE_SFL:
3022      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
3023         STORE(&mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], 0, chan_index);
3024      }
3025      break;
3026
3027   case TGSI_OPCODE_SGT:
3028      exec_vector_binary(mach, inst, micro_sgt, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3029      break;
3030
3031   case TGSI_OPCODE_SIN:
3032      exec_scalar_unary(mach, inst, micro_sin, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3033      break;
3034
3035   case TGSI_OPCODE_SLE:
3036      exec_vector_binary(mach, inst, micro_sle, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3037      break;
3038
3039   case TGSI_OPCODE_SNE:
3040      exec_vector_binary(mach, inst, micro_sne, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3041      break;
3042
3043   case TGSI_OPCODE_STR:
3044      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
3045         STORE(&mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, chan_index);
3046      }
3047      break;
3048
3049   case TGSI_OPCODE_TEX:
3050      /* simple texture lookup */
3051      /* src[0] = texcoord */
3052      /* src[1] = sampler unit */
3053      exec_tex(mach, inst, TEX_MODIFIER_NONE);
3054      break;
3055
3056   case TGSI_OPCODE_TXB:
3057      /* Texture lookup with lod bias */
3058      /* src[0] = texcoord (src[0].w = LOD bias) */
3059      /* src[1] = sampler unit */
3060      exec_tex(mach, inst, TEX_MODIFIER_LOD_BIAS);
3061      break;
3062
3063   case TGSI_OPCODE_TXD:
3064      /* Texture lookup with explict partial derivatives */
3065      /* src[0] = texcoord */
3066      /* src[1] = d[strq]/dx */
3067      /* src[2] = d[strq]/dy */
3068      /* src[3] = sampler unit */
3069      exec_txd(mach, inst);
3070      break;
3071
3072   case TGSI_OPCODE_TXL:
3073      /* Texture lookup with explit LOD */
3074      /* src[0] = texcoord (src[0].w = LOD) */
3075      /* src[1] = sampler unit */
3076      exec_tex(mach, inst, TEX_MODIFIER_EXPLICIT_LOD);
3077      break;
3078
3079   case TGSI_OPCODE_TXP:
3080      /* Texture lookup with projection */
3081      /* src[0] = texcoord (src[0].w = projection) */
3082      /* src[1] = sampler unit */
3083      exec_tex(mach, inst, TEX_MODIFIER_PROJECTED);
3084      break;
3085
3086   case TGSI_OPCODE_UP2H:
3087      assert (0);
3088      break;
3089
3090   case TGSI_OPCODE_UP2US:
3091      assert (0);
3092      break;
3093
3094   case TGSI_OPCODE_UP4B:
3095      assert (0);
3096      break;
3097
3098   case TGSI_OPCODE_UP4UB:
3099      assert (0);
3100      break;
3101
3102   case TGSI_OPCODE_X2D:
3103      FETCH(&r[0], 1, CHAN_X);
3104      FETCH(&r[1], 1, CHAN_Y);
3105      if (IS_CHANNEL_ENABLED(*inst, CHAN_X) ||
3106          IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
3107         FETCH(&r[2], 2, CHAN_X);
3108         micro_mul(&r[2], &r[2], &r[0]);
3109         FETCH(&r[3], 2, CHAN_Y);
3110         micro_mul(&r[3], &r[3], &r[1]);
3111         micro_add(&r[2], &r[2], &r[3]);
3112         FETCH(&r[3], 0, CHAN_X);
3113         micro_add(&d[CHAN_X], &r[2], &r[3]);
3114
3115      }
3116      if (IS_CHANNEL_ENABLED(*inst, CHAN_Y) ||
3117          IS_CHANNEL_ENABLED(*inst, CHAN_W)) {
3118         FETCH(&r[2], 2, CHAN_Z);
3119         micro_mul(&r[2], &r[2], &r[0]);
3120         FETCH(&r[3], 2, CHAN_W);
3121         micro_mul(&r[3], &r[3], &r[1]);
3122         micro_add(&r[2], &r[2], &r[3]);
3123         FETCH(&r[3], 0, CHAN_Y);
3124         micro_add(&d[CHAN_Y], &r[2], &r[3]);
3125
3126      }
3127      if (IS_CHANNEL_ENABLED(*inst, CHAN_X)) {
3128         STORE(&d[CHAN_X], 0, CHAN_X);
3129      }
3130      if (IS_CHANNEL_ENABLED(*inst, CHAN_Y)) {
3131         STORE(&d[CHAN_Y], 0, CHAN_Y);
3132      }
3133      if (IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
3134         STORE(&d[CHAN_X], 0, CHAN_Z);
3135      }
3136      if (IS_CHANNEL_ENABLED(*inst, CHAN_W)) {
3137         STORE(&d[CHAN_Y], 0, CHAN_W);
3138      }
3139      break;
3140
3141   case TGSI_OPCODE_ARA:
3142      assert (0);
3143      break;
3144
3145   case TGSI_OPCODE_ARR:
3146      exec_vector_unary(mach, inst, micro_arr, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_FLOAT);
3147      break;
3148
3149   case TGSI_OPCODE_BRA:
3150      assert (0);
3151      break;
3152
3153   case TGSI_OPCODE_CAL:
3154      /* skip the call if no execution channels are enabled */
3155      if (mach->ExecMask) {
3156         /* do the call */
3157
3158         /* First, record the depths of the execution stacks.
3159          * This is important for deeply nested/looped return statements.
3160          * We have to unwind the stacks by the correct amount.  For a
3161          * real code generator, we could determine the number of entries
3162          * to pop off each stack with simple static analysis and avoid
3163          * implementing this data structure at run time.
3164          */
3165         mach->CallStack[mach->CallStackTop].CondStackTop = mach->CondStackTop;
3166         mach->CallStack[mach->CallStackTop].LoopStackTop = mach->LoopStackTop;
3167         mach->CallStack[mach->CallStackTop].ContStackTop = mach->ContStackTop;
3168         mach->CallStack[mach->CallStackTop].SwitchStackTop = mach->SwitchStackTop;
3169         mach->CallStack[mach->CallStackTop].BreakStackTop = mach->BreakStackTop;
3170         /* note that PC was already incremented above */
3171         mach->CallStack[mach->CallStackTop].ReturnAddr = *pc;
3172
3173         mach->CallStackTop++;
3174
3175         /* Second, push the Cond, Loop, Cont, Func stacks */
3176         assert(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING);
3177         assert(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
3178         assert(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
3179         assert(mach->SwitchStackTop < TGSI_EXEC_MAX_SWITCH_NESTING);
3180         assert(mach->BreakStackTop < TGSI_EXEC_MAX_BREAK_STACK);
3181         assert(mach->FuncStackTop < TGSI_EXEC_MAX_CALL_NESTING);
3182
3183         mach->CondStack[mach->CondStackTop++] = mach->CondMask;
3184         mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask;
3185         mach->ContStack[mach->ContStackTop++] = mach->ContMask;
3186         mach->SwitchStack[mach->SwitchStackTop++] = mach->Switch;
3187         mach->BreakStack[mach->BreakStackTop++] = mach->BreakType;
3188         mach->FuncStack[mach->FuncStackTop++] = mach->FuncMask;
3189
3190         /* Finally, jump to the subroutine */
3191         *pc = inst->Label.Label;
3192      }
3193      break;
3194
3195   case TGSI_OPCODE_RET:
3196      mach->FuncMask &= ~mach->ExecMask;
3197      UPDATE_EXEC_MASK(mach);
3198
3199      if (mach->FuncMask == 0x0) {
3200         /* really return now (otherwise, keep executing */
3201
3202         if (mach->CallStackTop == 0) {
3203            /* returning from main() */
3204            *pc = -1;
3205            return;
3206         }
3207
3208         assert(mach->CallStackTop > 0);
3209         mach->CallStackTop--;
3210
3211         mach->CondStackTop = mach->CallStack[mach->CallStackTop].CondStackTop;
3212         mach->CondMask = mach->CondStack[mach->CondStackTop];
3213
3214         mach->LoopStackTop = mach->CallStack[mach->CallStackTop].LoopStackTop;
3215         mach->LoopMask = mach->LoopStack[mach->LoopStackTop];
3216
3217         mach->ContStackTop = mach->CallStack[mach->CallStackTop].ContStackTop;
3218         mach->ContMask = mach->ContStack[mach->ContStackTop];
3219
3220         mach->SwitchStackTop = mach->CallStack[mach->CallStackTop].SwitchStackTop;
3221         mach->Switch = mach->SwitchStack[mach->SwitchStackTop];
3222
3223         mach->BreakStackTop = mach->CallStack[mach->CallStackTop].BreakStackTop;
3224         mach->BreakType = mach->BreakStack[mach->BreakStackTop];
3225
3226         assert(mach->FuncStackTop > 0);
3227         mach->FuncMask = mach->FuncStack[--mach->FuncStackTop];
3228
3229         *pc = mach->CallStack[mach->CallStackTop].ReturnAddr;
3230
3231         UPDATE_EXEC_MASK(mach);
3232      }
3233      break;
3234
3235   case TGSI_OPCODE_SSG:
3236      exec_vector_unary(mach, inst, micro_sgn, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3237      break;
3238
3239   case TGSI_OPCODE_CMP:
3240      exec_vector_trinary(mach, inst, micro_cmp, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3241      break;
3242
3243   case TGSI_OPCODE_SCS:
3244      if( IS_CHANNEL_ENABLED( *inst, CHAN_X ) || IS_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
3245         FETCH( &r[0], 0, CHAN_X );
3246         if (IS_CHANNEL_ENABLED(*inst, CHAN_X)) {
3247            micro_cos(&r[1], &r[0]);
3248            STORE(&r[1], 0, CHAN_X);
3249         }
3250         if (IS_CHANNEL_ENABLED(*inst, CHAN_Y)) {
3251            micro_sin(&r[1], &r[0]);
3252            STORE(&r[1], 0, CHAN_Y);
3253         }
3254      }
3255      if( IS_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
3256         STORE( &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], 0, CHAN_Z );
3257      }
3258      if( IS_CHANNEL_ENABLED( *inst, CHAN_W ) ) {
3259         STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
3260      }
3261      break;
3262
3263   case TGSI_OPCODE_NRM:
3264      exec_nrm3(mach, inst);
3265      break;
3266
3267   case TGSI_OPCODE_NRM4:
3268      exec_nrm4(mach, inst);
3269      break;
3270
3271   case TGSI_OPCODE_DIV:
3272      exec_vector_binary(mach, inst, micro_div, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3273      break;
3274
3275   case TGSI_OPCODE_DP2:
3276      exec_dp2(mach, inst);
3277      break;
3278
3279   case TGSI_OPCODE_IF:
3280      /* push CondMask */
3281      assert(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING);
3282      mach->CondStack[mach->CondStackTop++] = mach->CondMask;
3283      FETCH( &r[0], 0, CHAN_X );
3284      /* update CondMask */
3285      if( ! r[0].u[0] ) {
3286         mach->CondMask &= ~0x1;
3287      }
3288      if( ! r[0].u[1] ) {
3289         mach->CondMask &= ~0x2;
3290      }
3291      if( ! r[0].u[2] ) {
3292         mach->CondMask &= ~0x4;
3293      }
3294      if( ! r[0].u[3] ) {
3295         mach->CondMask &= ~0x8;
3296      }
3297      UPDATE_EXEC_MASK(mach);
3298      /* Todo: If CondMask==0, jump to ELSE */
3299      break;
3300
3301   case TGSI_OPCODE_ELSE:
3302      /* invert CondMask wrt previous mask */
3303      {
3304         uint prevMask;
3305         assert(mach->CondStackTop > 0);
3306         prevMask = mach->CondStack[mach->CondStackTop - 1];
3307         mach->CondMask = ~mach->CondMask & prevMask;
3308         UPDATE_EXEC_MASK(mach);
3309         /* Todo: If CondMask==0, jump to ENDIF */
3310      }
3311      break;
3312
3313   case TGSI_OPCODE_ENDIF:
3314      /* pop CondMask */
3315      assert(mach->CondStackTop > 0);
3316      mach->CondMask = mach->CondStack[--mach->CondStackTop];
3317      UPDATE_EXEC_MASK(mach);
3318      break;
3319
3320   case TGSI_OPCODE_END:
3321      /* make sure we end primitives which haven't
3322       * been explicitly emitted */
3323      conditional_emit_primitive(mach);
3324      /* halt execution */
3325      *pc = -1;
3326      break;
3327
3328   case TGSI_OPCODE_PUSHA:
3329      assert (0);
3330      break;
3331
3332   case TGSI_OPCODE_POPA:
3333      assert (0);
3334      break;
3335
3336   case TGSI_OPCODE_CEIL:
3337      exec_vector_unary(mach, inst, micro_ceil, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3338      break;
3339
3340   case TGSI_OPCODE_I2F:
3341      exec_vector_unary(mach, inst, micro_i2f, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_INT);
3342      break;
3343
3344   case TGSI_OPCODE_NOT:
3345      exec_vector_unary(mach, inst, micro_not, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3346      break;
3347
3348   case TGSI_OPCODE_TRUNC:
3349      exec_vector_unary(mach, inst, micro_trunc, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3350      break;
3351
3352   case TGSI_OPCODE_SHL:
3353      exec_vector_binary(mach, inst, micro_shl, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3354      break;
3355
3356   case TGSI_OPCODE_AND:
3357      exec_vector_binary(mach, inst, micro_and, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3358      break;
3359
3360   case TGSI_OPCODE_OR:
3361      exec_vector_binary(mach, inst, micro_or, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3362      break;
3363
3364   case TGSI_OPCODE_MOD:
3365      assert (0);
3366      break;
3367
3368   case TGSI_OPCODE_XOR:
3369      exec_vector_binary(mach, inst, micro_xor, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3370      break;
3371
3372   case TGSI_OPCODE_SAD:
3373      assert (0);
3374      break;
3375
3376   case TGSI_OPCODE_TXF:
3377      assert (0);
3378      break;
3379
3380   case TGSI_OPCODE_TXQ:
3381      assert (0);
3382      break;
3383
3384   case TGSI_OPCODE_EMIT:
3385      emit_vertex(mach);
3386      break;
3387
3388   case TGSI_OPCODE_ENDPRIM:
3389      emit_primitive(mach);
3390      break;
3391
3392   case TGSI_OPCODE_BGNLOOP:
3393      /* push LoopMask and ContMasks */
3394      assert(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
3395      assert(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
3396      assert(mach->LoopLabelStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
3397      assert(mach->BreakStackTop < TGSI_EXEC_MAX_BREAK_STACK);
3398
3399      mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask;
3400      mach->ContStack[mach->ContStackTop++] = mach->ContMask;
3401      mach->LoopLabelStack[mach->LoopLabelStackTop++] = *pc - 1;
3402      mach->BreakStack[mach->BreakStackTop++] = mach->BreakType;
3403      mach->BreakType = TGSI_EXEC_BREAK_INSIDE_LOOP;
3404      break;
3405
3406   case TGSI_OPCODE_ENDLOOP:
3407      /* Restore ContMask, but don't pop */
3408      assert(mach->ContStackTop > 0);
3409      mach->ContMask = mach->ContStack[mach->ContStackTop - 1];
3410      UPDATE_EXEC_MASK(mach);
3411      if (mach->ExecMask) {
3412         /* repeat loop: jump to instruction just past BGNLOOP */
3413         assert(mach->LoopLabelStackTop > 0);
3414         *pc = mach->LoopLabelStack[mach->LoopLabelStackTop - 1] + 1;
3415      }
3416      else {
3417         /* exit loop: pop LoopMask */
3418         assert(mach->LoopStackTop > 0);
3419         mach->LoopMask = mach->LoopStack[--mach->LoopStackTop];
3420         /* pop ContMask */
3421         assert(mach->ContStackTop > 0);
3422         mach->ContMask = mach->ContStack[--mach->ContStackTop];
3423         assert(mach->LoopLabelStackTop > 0);
3424         --mach->LoopLabelStackTop;
3425
3426         mach->BreakType = mach->BreakStack[--mach->BreakStackTop];
3427      }
3428      UPDATE_EXEC_MASK(mach);
3429      break;
3430
3431   case TGSI_OPCODE_BRK:
3432      exec_break(mach);
3433      break;
3434
3435   case TGSI_OPCODE_CONT:
3436      /* turn off cont channels for each enabled exec channel */
3437      mach->ContMask &= ~mach->ExecMask;
3438      /* Todo: if mach->LoopMask == 0, jump to end of loop */
3439      UPDATE_EXEC_MASK(mach);
3440      break;
3441
3442   case TGSI_OPCODE_BGNSUB:
3443      /* no-op */
3444      break;
3445
3446   case TGSI_OPCODE_ENDSUB:
3447      /*
3448       * XXX: This really should be a no-op. We should never reach this opcode.
3449       */
3450
3451      assert(mach->CallStackTop > 0);
3452      mach->CallStackTop--;
3453
3454      mach->CondStackTop = mach->CallStack[mach->CallStackTop].CondStackTop;
3455      mach->CondMask = mach->CondStack[mach->CondStackTop];
3456
3457      mach->LoopStackTop = mach->CallStack[mach->CallStackTop].LoopStackTop;
3458      mach->LoopMask = mach->LoopStack[mach->LoopStackTop];
3459
3460      mach->ContStackTop = mach->CallStack[mach->CallStackTop].ContStackTop;
3461      mach->ContMask = mach->ContStack[mach->ContStackTop];
3462
3463      mach->SwitchStackTop = mach->CallStack[mach->CallStackTop].SwitchStackTop;
3464      mach->Switch = mach->SwitchStack[mach->SwitchStackTop];
3465
3466      mach->BreakStackTop = mach->CallStack[mach->CallStackTop].BreakStackTop;
3467      mach->BreakType = mach->BreakStack[mach->BreakStackTop];
3468
3469      assert(mach->FuncStackTop > 0);
3470      mach->FuncMask = mach->FuncStack[--mach->FuncStackTop];
3471
3472      *pc = mach->CallStack[mach->CallStackTop].ReturnAddr;
3473
3474      UPDATE_EXEC_MASK(mach);
3475      break;
3476
3477   case TGSI_OPCODE_NOP:
3478      break;
3479
3480   case TGSI_OPCODE_BREAKC:
3481      FETCH(&r[0], 0, CHAN_X);
3482      /* update CondMask */
3483      if (r[0].u[0] && (mach->ExecMask & 0x1)) {
3484         mach->LoopMask &= ~0x1;
3485      }
3486      if (r[0].u[1] && (mach->ExecMask & 0x2)) {
3487         mach->LoopMask &= ~0x2;
3488      }
3489      if (r[0].u[2] && (mach->ExecMask & 0x4)) {
3490         mach->LoopMask &= ~0x4;
3491      }
3492      if (r[0].u[3] && (mach->ExecMask & 0x8)) {
3493         mach->LoopMask &= ~0x8;
3494      }
3495      /* Todo: if mach->LoopMask == 0, jump to end of loop */
3496      UPDATE_EXEC_MASK(mach);
3497      break;
3498
3499   case TGSI_OPCODE_F2I:
3500      exec_vector_unary(mach, inst, micro_f2i, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_FLOAT);
3501      break;
3502
3503   case TGSI_OPCODE_IDIV:
3504      exec_vector_binary(mach, inst, micro_idiv, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
3505      break;
3506
3507   case TGSI_OPCODE_IMAX:
3508      exec_vector_binary(mach, inst, micro_imax, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
3509      break;
3510
3511   case TGSI_OPCODE_IMIN:
3512      exec_vector_binary(mach, inst, micro_imin, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
3513      break;
3514
3515   case TGSI_OPCODE_INEG:
3516      exec_vector_unary(mach, inst, micro_ineg, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
3517      break;
3518
3519   case TGSI_OPCODE_ISGE:
3520      exec_vector_binary(mach, inst, micro_isge, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
3521      break;
3522
3523   case TGSI_OPCODE_ISHR:
3524      exec_vector_binary(mach, inst, micro_ishr, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
3525      break;
3526
3527   case TGSI_OPCODE_ISLT:
3528      exec_vector_binary(mach, inst, micro_islt, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
3529      break;
3530
3531   case TGSI_OPCODE_F2U:
3532      exec_vector_unary(mach, inst, micro_f2u, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_FLOAT);
3533      break;
3534
3535   case TGSI_OPCODE_U2F:
3536      exec_vector_unary(mach, inst, micro_u2f, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_UINT);
3537      break;
3538
3539   case TGSI_OPCODE_UADD:
3540      exec_vector_binary(mach, inst, micro_uadd, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3541      break;
3542
3543   case TGSI_OPCODE_UDIV:
3544      exec_vector_binary(mach, inst, micro_udiv, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3545      break;
3546
3547   case TGSI_OPCODE_UMAD:
3548      exec_vector_trinary(mach, inst, micro_umad, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3549      break;
3550
3551   case TGSI_OPCODE_UMAX:
3552      exec_vector_binary(mach, inst, micro_umax, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3553      break;
3554
3555   case TGSI_OPCODE_UMIN:
3556      exec_vector_binary(mach, inst, micro_umin, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3557      break;
3558
3559   case TGSI_OPCODE_UMOD:
3560      exec_vector_binary(mach, inst, micro_umod, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3561      break;
3562
3563   case TGSI_OPCODE_UMUL:
3564      exec_vector_binary(mach, inst, micro_umul, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3565      break;
3566
3567   case TGSI_OPCODE_USEQ:
3568      exec_vector_binary(mach, inst, micro_useq, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3569      break;
3570
3571   case TGSI_OPCODE_USGE:
3572      exec_vector_binary(mach, inst, micro_usge, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3573      break;
3574
3575   case TGSI_OPCODE_USHR:
3576      exec_vector_binary(mach, inst, micro_ushr, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3577      break;
3578
3579   case TGSI_OPCODE_USLT:
3580      exec_vector_binary(mach, inst, micro_uslt, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3581      break;
3582
3583   case TGSI_OPCODE_USNE:
3584      exec_vector_binary(mach, inst, micro_usne, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3585      break;
3586
3587   case TGSI_OPCODE_SWITCH:
3588      exec_switch(mach, inst);
3589      break;
3590
3591   case TGSI_OPCODE_CASE:
3592      exec_case(mach, inst);
3593      break;
3594
3595   case TGSI_OPCODE_DEFAULT:
3596      exec_default(mach);
3597      break;
3598
3599   case TGSI_OPCODE_ENDSWITCH:
3600      exec_endswitch(mach);
3601      break;
3602
3603   default:
3604      assert( 0 );
3605   }
3606}
3607
3608
/*
 * Compile-time switch for interpreter tracing.  When non-zero,
 * tgsi_exec_machine_run() dumps every instruction before executing it and
 * afterwards prints each TEMP and OUT register whose contents changed.
 */
#define DEBUG_EXECUTION 0
3610
3611
3612/**
3613 * Run TGSI interpreter.
3614 * \return bitmask of "alive" quad components
3615 */
3616uint
3617tgsi_exec_machine_run( struct tgsi_exec_machine *mach )
3618{
3619   uint i;
3620   int pc = 0;
3621
3622   mach->CondMask = 0xf;
3623   mach->LoopMask = 0xf;
3624   mach->ContMask = 0xf;
3625   mach->FuncMask = 0xf;
3626   mach->ExecMask = 0xf;
3627
3628   mach->Switch.mask = 0xf;
3629
3630   assert(mach->CondStackTop == 0);
3631   assert(mach->LoopStackTop == 0);
3632   assert(mach->ContStackTop == 0);
3633   assert(mach->SwitchStackTop == 0);
3634   assert(mach->BreakStackTop == 0);
3635   assert(mach->CallStackTop == 0);
3636
3637   mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] = 0;
3638   mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] = 0;
3639
3640   if( mach->Processor == TGSI_PROCESSOR_GEOMETRY ) {
3641      mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0] = 0;
3642      mach->Primitives[0] = 0;
3643   }
3644
3645   for (i = 0; i < QUAD_SIZE; i++) {
3646      mach->Temps[TEMP_CC_I].xyzw[TEMP_CC_C].u[i] =
3647         (TGSI_EXEC_CC_EQ << TGSI_EXEC_CC_X_SHIFT) |
3648         (TGSI_EXEC_CC_EQ << TGSI_EXEC_CC_Y_SHIFT) |
3649         (TGSI_EXEC_CC_EQ << TGSI_EXEC_CC_Z_SHIFT) |
3650         (TGSI_EXEC_CC_EQ << TGSI_EXEC_CC_W_SHIFT);
3651   }
3652
3653   /* execute declarations (interpolants) */
3654   for (i = 0; i < mach->NumDeclarations; i++) {
3655      exec_declaration( mach, mach->Declarations+i );
3656   }
3657
3658   {
3659#if DEBUG_EXECUTION
3660      struct tgsi_exec_vector temps[TGSI_EXEC_NUM_TEMPS + TGSI_EXEC_NUM_TEMP_EXTRAS];
3661      struct tgsi_exec_vector outputs[PIPE_MAX_ATTRIBS];
3662      uint inst = 1;
3663
3664      memcpy(temps, mach->Temps, sizeof(temps));
3665      memcpy(outputs, mach->Outputs, sizeof(outputs));
3666#endif
3667
3668      /* execute instructions, until pc is set to -1 */
3669      while (pc != -1) {
3670
3671#if DEBUG_EXECUTION
3672         uint i;
3673
3674         tgsi_dump_instruction(&mach->Instructions[pc], inst++);
3675#endif
3676
3677         assert(pc < (int) mach->NumInstructions);
3678         exec_instruction(mach, mach->Instructions + pc, &pc);
3679
3680#if DEBUG_EXECUTION
3681         for (i = 0; i < TGSI_EXEC_NUM_TEMPS + TGSI_EXEC_NUM_TEMP_EXTRAS; i++) {
3682            if (memcmp(&temps[i], &mach->Temps[i], sizeof(temps[i]))) {
3683               uint j;
3684
3685               memcpy(&temps[i], &mach->Temps[i], sizeof(temps[i]));
3686               debug_printf("TEMP[%2u] = ", i);
3687               for (j = 0; j < 4; j++) {
3688                  if (j > 0) {
3689                     debug_printf("           ");
3690                  }
3691                  debug_printf("(%6f %u, %6f %u, %6f %u, %6f %u)\n",
3692                               temps[i].xyzw[0].f[j], temps[i].xyzw[0].u[j],
3693                               temps[i].xyzw[1].f[j], temps[i].xyzw[1].u[j],
3694                               temps[i].xyzw[2].f[j], temps[i].xyzw[2].u[j],
3695                               temps[i].xyzw[3].f[j], temps[i].xyzw[3].u[j]);
3696               }
3697            }
3698         }
3699         for (i = 0; i < PIPE_MAX_ATTRIBS; i++) {
3700            if (memcmp(&outputs[i], &mach->Outputs[i], sizeof(outputs[i]))) {
3701               uint j;
3702
3703               memcpy(&outputs[i], &mach->Outputs[i], sizeof(outputs[i]));
3704               debug_printf("OUT[%2u] =  ", i);
3705               for (j = 0; j < 4; j++) {
3706                  if (j > 0) {
3707                     debug_printf("           ");
3708                  }
3709                  debug_printf("(%6f %u, %6f %u, %6f %u, %6f %u)\n",
3710                               outputs[i].xyzw[0].f[j], outputs[i].xyzw[0].u[j],
3711                               outputs[i].xyzw[1].f[j], outputs[i].xyzw[1].u[j],
3712                               outputs[i].xyzw[2].f[j], outputs[i].xyzw[2].u[j],
3713                               outputs[i].xyzw[3].f[j], outputs[i].xyzw[3].u[j]);
3714               }
3715            }
3716         }
3717#endif
3718      }
3719   }
3720
3721#if 0
3722   /* we scale from floats in [0,1] to Zbuffer ints in sp_quad_depth_test.c */
3723   if (mach->Processor == TGSI_PROCESSOR_FRAGMENT) {
3724      /*
3725       * Scale back depth component.
3726       */
3727      for (i = 0; i < 4; i++)
3728         mach->Outputs[0].xyzw[2].f[i] *= ctx->DrawBuffer->_DepthMaxF;
3729   }
3730#endif
3731
3732   assert(mach->CondStackTop == 0);
3733   assert(mach->LoopStackTop == 0);
3734   assert(mach->ContStackTop == 0);
3735   assert(mach->SwitchStackTop == 0);
3736   assert(mach->BreakStackTop == 0);
3737   assert(mach->CallStackTop == 0);
3738
3739   return ~mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
3740}
3741