tgsi_exec.c revision d562f97bef99e051842ae0cec8f5ac46a10a73c4
1/**************************************************************************
2 *
3 * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas.
4 * All Rights Reserved.
5 * Copyright 2009-2010 VMware, Inc.  All rights Reserved.
6 *
7 * Permission is hereby granted, free of charge, to any person obtaining a
8 * copy of this software and associated documentation files (the
9 * "Software"), to deal in the Software without restriction, including
10 * without limitation the rights to use, copy, modify, merge, publish,
11 * distribute, sub license, and/or sell copies of the Software, and to
12 * permit persons to whom the Software is furnished to do so, subject to
13 * the following conditions:
14 *
15 * The above copyright notice and this permission notice (including the
16 * next paragraph) shall be included in all copies or substantial portions
17 * of the Software.
18 *
19 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
20 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
21 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
22 * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
23 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
24 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
25 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 *
27 **************************************************************************/
28
29/**
30 * TGSI interpreter/executor.
31 *
32 * Flow control information:
33 *
34 * Since we operate on 'quads' (4 pixels or 4 vertices in parallel)
35 * flow control statements (IF/ELSE/ENDIF, LOOP/ENDLOOP) require special
36 * care since a condition may be true for some quad components but false
37 * for other components.
38 *
39 * We basically execute all statements (even if they're in the part of
40 * an IF/ELSE clause that's "not taken") and use a special mask to
41 * control writing to destination registers.  This is the ExecMask.
42 * See store_dest().
43 *
 * The ExecMask is computed by ANDing several other masks (CondMask,
 * LoopMask, ContMask, Switch.mask and FuncMask; see UPDATE_EXEC_MASK())
 * which are controlled by the flow control instructions (for example
 * IF/ELSE/ENDIF, LOOP/ENDLOOP and CONT).
47 *
48 *
49 * Authors:
50 *   Michal Krol
51 *   Brian Paul
52 */
53
54#include "pipe/p_compiler.h"
55#include "pipe/p_state.h"
56#include "pipe/p_shader_tokens.h"
57#include "tgsi/tgsi_dump.h"
58#include "tgsi/tgsi_parse.h"
59#include "tgsi/tgsi_util.h"
60#include "tgsi_exec.h"
61#include "util/u_memory.h"
62#include "util/u_math.h"
63
64
65#define FAST_MATH 1
66
67#define TILE_TOP_LEFT     0
68#define TILE_TOP_RIGHT    1
69#define TILE_BOTTOM_LEFT  2
70#define TILE_BOTTOM_RIGHT 3
71
72static void
73micro_abs(union tgsi_exec_channel *dst,
74          const union tgsi_exec_channel *src)
75{
76   dst->f[0] = fabsf(src->f[0]);
77   dst->f[1] = fabsf(src->f[1]);
78   dst->f[2] = fabsf(src->f[2]);
79   dst->f[3] = fabsf(src->f[3]);
80}
81
82static void
83micro_arl(union tgsi_exec_channel *dst,
84          const union tgsi_exec_channel *src)
85{
86   dst->i[0] = (int)floorf(src->f[0]);
87   dst->i[1] = (int)floorf(src->f[1]);
88   dst->i[2] = (int)floorf(src->f[2]);
89   dst->i[3] = (int)floorf(src->f[3]);
90}
91
92static void
93micro_arr(union tgsi_exec_channel *dst,
94          const union tgsi_exec_channel *src)
95{
96   dst->i[0] = (int)floorf(src->f[0] + 0.5f);
97   dst->i[1] = (int)floorf(src->f[1] + 0.5f);
98   dst->i[2] = (int)floorf(src->f[2] + 0.5f);
99   dst->i[3] = (int)floorf(src->f[3] + 0.5f);
100}
101
102static void
103micro_ceil(union tgsi_exec_channel *dst,
104           const union tgsi_exec_channel *src)
105{
106   dst->f[0] = ceilf(src->f[0]);
107   dst->f[1] = ceilf(src->f[1]);
108   dst->f[2] = ceilf(src->f[2]);
109   dst->f[3] = ceilf(src->f[3]);
110}
111
112static void
113micro_clamp(union tgsi_exec_channel *dst,
114            const union tgsi_exec_channel *src0,
115            const union tgsi_exec_channel *src1,
116            const union tgsi_exec_channel *src2)
117{
118   dst->f[0] = src0->f[0] < src1->f[0] ? src1->f[0] : src0->f[0] > src2->f[0] ? src2->f[0] : src0->f[0];
119   dst->f[1] = src0->f[1] < src1->f[1] ? src1->f[1] : src0->f[1] > src2->f[1] ? src2->f[1] : src0->f[1];
120   dst->f[2] = src0->f[2] < src1->f[2] ? src1->f[2] : src0->f[2] > src2->f[2] ? src2->f[2] : src0->f[2];
121   dst->f[3] = src0->f[3] < src1->f[3] ? src1->f[3] : src0->f[3] > src2->f[3] ? src2->f[3] : src0->f[3];
122}
123
124static void
125micro_cmp(union tgsi_exec_channel *dst,
126          const union tgsi_exec_channel *src0,
127          const union tgsi_exec_channel *src1,
128          const union tgsi_exec_channel *src2)
129{
130   dst->f[0] = src0->f[0] < 0.0f ? src1->f[0] : src2->f[0];
131   dst->f[1] = src0->f[1] < 0.0f ? src1->f[1] : src2->f[1];
132   dst->f[2] = src0->f[2] < 0.0f ? src1->f[2] : src2->f[2];
133   dst->f[3] = src0->f[3] < 0.0f ? src1->f[3] : src2->f[3];
134}
135
136static void
137micro_cnd(union tgsi_exec_channel *dst,
138          const union tgsi_exec_channel *src0,
139          const union tgsi_exec_channel *src1,
140          const union tgsi_exec_channel *src2)
141{
142   dst->f[0] = src2->f[0] > 0.5f ? src0->f[0] : src1->f[0];
143   dst->f[1] = src2->f[1] > 0.5f ? src0->f[1] : src1->f[1];
144   dst->f[2] = src2->f[2] > 0.5f ? src0->f[2] : src1->f[2];
145   dst->f[3] = src2->f[3] > 0.5f ? src0->f[3] : src1->f[3];
146}
147
148static void
149micro_cos(union tgsi_exec_channel *dst,
150          const union tgsi_exec_channel *src)
151{
152   dst->f[0] = cosf(src->f[0]);
153   dst->f[1] = cosf(src->f[1]);
154   dst->f[2] = cosf(src->f[2]);
155   dst->f[3] = cosf(src->f[3]);
156}
157
158static void
159micro_ddx(union tgsi_exec_channel *dst,
160          const union tgsi_exec_channel *src)
161{
162   dst->f[0] =
163   dst->f[1] =
164   dst->f[2] =
165   dst->f[3] = src->f[TILE_BOTTOM_RIGHT] - src->f[TILE_BOTTOM_LEFT];
166}
167
168static void
169micro_ddy(union tgsi_exec_channel *dst,
170          const union tgsi_exec_channel *src)
171{
172   dst->f[0] =
173   dst->f[1] =
174   dst->f[2] =
175   dst->f[3] = src->f[TILE_BOTTOM_LEFT] - src->f[TILE_TOP_LEFT];
176}
177
178static void
179micro_exp2(union tgsi_exec_channel *dst,
180           const union tgsi_exec_channel *src)
181{
182#if FAST_MATH
183   dst->f[0] = util_fast_exp2(src->f[0]);
184   dst->f[1] = util_fast_exp2(src->f[1]);
185   dst->f[2] = util_fast_exp2(src->f[2]);
186   dst->f[3] = util_fast_exp2(src->f[3]);
187#else
188#if DEBUG
189   /* Inf is okay for this instruction, so clamp it to silence assertions. */
190   uint i;
191   union tgsi_exec_channel clamped;
192
193   for (i = 0; i < 4; i++) {
194      if (src->f[i] > 127.99999f) {
195         clamped.f[i] = 127.99999f;
196      } else if (src->f[i] < -126.99999f) {
197         clamped.f[i] = -126.99999f;
198      } else {
199         clamped.f[i] = src->f[i];
200      }
201   }
202   src = &clamped;
203#endif /* DEBUG */
204
205   dst->f[0] = powf(2.0f, src->f[0]);
206   dst->f[1] = powf(2.0f, src->f[1]);
207   dst->f[2] = powf(2.0f, src->f[2]);
208   dst->f[3] = powf(2.0f, src->f[3]);
209#endif /* FAST_MATH */
210}
211
212static void
213micro_flr(union tgsi_exec_channel *dst,
214          const union tgsi_exec_channel *src)
215{
216   dst->f[0] = floorf(src->f[0]);
217   dst->f[1] = floorf(src->f[1]);
218   dst->f[2] = floorf(src->f[2]);
219   dst->f[3] = floorf(src->f[3]);
220}
221
222static void
223micro_frc(union tgsi_exec_channel *dst,
224          const union tgsi_exec_channel *src)
225{
226   dst->f[0] = src->f[0] - floorf(src->f[0]);
227   dst->f[1] = src->f[1] - floorf(src->f[1]);
228   dst->f[2] = src->f[2] - floorf(src->f[2]);
229   dst->f[3] = src->f[3] - floorf(src->f[3]);
230}
231
232static void
233micro_iabs(union tgsi_exec_channel *dst,
234           const union tgsi_exec_channel *src)
235{
236   dst->i[0] = src->i[0] >= 0 ? src->i[0] : -src->i[0];
237   dst->i[1] = src->i[1] >= 0 ? src->i[1] : -src->i[1];
238   dst->i[2] = src->i[2] >= 0 ? src->i[2] : -src->i[2];
239   dst->i[3] = src->i[3] >= 0 ? src->i[3] : -src->i[3];
240}
241
242static void
243micro_ineg(union tgsi_exec_channel *dst,
244           const union tgsi_exec_channel *src)
245{
246   dst->i[0] = -src->i[0];
247   dst->i[1] = -src->i[1];
248   dst->i[2] = -src->i[2];
249   dst->i[3] = -src->i[3];
250}
251
252static void
253micro_lg2(union tgsi_exec_channel *dst,
254          const union tgsi_exec_channel *src)
255{
256#if FAST_MATH
257   dst->f[0] = util_fast_log2(src->f[0]);
258   dst->f[1] = util_fast_log2(src->f[1]);
259   dst->f[2] = util_fast_log2(src->f[2]);
260   dst->f[3] = util_fast_log2(src->f[3]);
261#else
262   dst->f[0] = logf(src->f[0]) * 1.442695f;
263   dst->f[1] = logf(src->f[1]) * 1.442695f;
264   dst->f[2] = logf(src->f[2]) * 1.442695f;
265   dst->f[3] = logf(src->f[3]) * 1.442695f;
266#endif
267}
268
269static void
270micro_lrp(union tgsi_exec_channel *dst,
271          const union tgsi_exec_channel *src0,
272          const union tgsi_exec_channel *src1,
273          const union tgsi_exec_channel *src2)
274{
275   dst->f[0] = src0->f[0] * (src1->f[0] - src2->f[0]) + src2->f[0];
276   dst->f[1] = src0->f[1] * (src1->f[1] - src2->f[1]) + src2->f[1];
277   dst->f[2] = src0->f[2] * (src1->f[2] - src2->f[2]) + src2->f[2];
278   dst->f[3] = src0->f[3] * (src1->f[3] - src2->f[3]) + src2->f[3];
279}
280
281static void
282micro_mad(union tgsi_exec_channel *dst,
283          const union tgsi_exec_channel *src0,
284          const union tgsi_exec_channel *src1,
285          const union tgsi_exec_channel *src2)
286{
287   dst->f[0] = src0->f[0] * src1->f[0] + src2->f[0];
288   dst->f[1] = src0->f[1] * src1->f[1] + src2->f[1];
289   dst->f[2] = src0->f[2] * src1->f[2] + src2->f[2];
290   dst->f[3] = src0->f[3] * src1->f[3] + src2->f[3];
291}
292
293static void
294micro_mov(union tgsi_exec_channel *dst,
295          const union tgsi_exec_channel *src)
296{
297   dst->u[0] = src->u[0];
298   dst->u[1] = src->u[1];
299   dst->u[2] = src->u[2];
300   dst->u[3] = src->u[3];
301}
302
303static void
304micro_rcp(union tgsi_exec_channel *dst,
305          const union tgsi_exec_channel *src)
306{
307#if 0 /* for debugging */
308   assert(src->f[0] != 0.0f);
309   assert(src->f[1] != 0.0f);
310   assert(src->f[2] != 0.0f);
311   assert(src->f[3] != 0.0f);
312#endif
313   dst->f[0] = 1.0f / src->f[0];
314   dst->f[1] = 1.0f / src->f[1];
315   dst->f[2] = 1.0f / src->f[2];
316   dst->f[3] = 1.0f / src->f[3];
317}
318
319static void
320micro_rnd(union tgsi_exec_channel *dst,
321          const union tgsi_exec_channel *src)
322{
323   dst->f[0] = floorf(src->f[0] + 0.5f);
324   dst->f[1] = floorf(src->f[1] + 0.5f);
325   dst->f[2] = floorf(src->f[2] + 0.5f);
326   dst->f[3] = floorf(src->f[3] + 0.5f);
327}
328
329static void
330micro_rsq(union tgsi_exec_channel *dst,
331          const union tgsi_exec_channel *src)
332{
333#if 0 /* for debugging */
334   assert(src->f[0] != 0.0f);
335   assert(src->f[1] != 0.0f);
336   assert(src->f[2] != 0.0f);
337   assert(src->f[3] != 0.0f);
338#endif
339   dst->f[0] = 1.0f / sqrtf(fabsf(src->f[0]));
340   dst->f[1] = 1.0f / sqrtf(fabsf(src->f[1]));
341   dst->f[2] = 1.0f / sqrtf(fabsf(src->f[2]));
342   dst->f[3] = 1.0f / sqrtf(fabsf(src->f[3]));
343}
344
345static void
346micro_seq(union tgsi_exec_channel *dst,
347          const union tgsi_exec_channel *src0,
348          const union tgsi_exec_channel *src1)
349{
350   dst->f[0] = src0->f[0] == src1->f[0] ? 1.0f : 0.0f;
351   dst->f[1] = src0->f[1] == src1->f[1] ? 1.0f : 0.0f;
352   dst->f[2] = src0->f[2] == src1->f[2] ? 1.0f : 0.0f;
353   dst->f[3] = src0->f[3] == src1->f[3] ? 1.0f : 0.0f;
354}
355
356static void
357micro_sge(union tgsi_exec_channel *dst,
358          const union tgsi_exec_channel *src0,
359          const union tgsi_exec_channel *src1)
360{
361   dst->f[0] = src0->f[0] >= src1->f[0] ? 1.0f : 0.0f;
362   dst->f[1] = src0->f[1] >= src1->f[1] ? 1.0f : 0.0f;
363   dst->f[2] = src0->f[2] >= src1->f[2] ? 1.0f : 0.0f;
364   dst->f[3] = src0->f[3] >= src1->f[3] ? 1.0f : 0.0f;
365}
366
367static void
368micro_sgn(union tgsi_exec_channel *dst,
369          const union tgsi_exec_channel *src)
370{
371   dst->f[0] = src->f[0] < 0.0f ? -1.0f : src->f[0] > 0.0f ? 1.0f : 0.0f;
372   dst->f[1] = src->f[1] < 0.0f ? -1.0f : src->f[1] > 0.0f ? 1.0f : 0.0f;
373   dst->f[2] = src->f[2] < 0.0f ? -1.0f : src->f[2] > 0.0f ? 1.0f : 0.0f;
374   dst->f[3] = src->f[3] < 0.0f ? -1.0f : src->f[3] > 0.0f ? 1.0f : 0.0f;
375}
376
377static void
378micro_sgt(union tgsi_exec_channel *dst,
379          const union tgsi_exec_channel *src0,
380          const union tgsi_exec_channel *src1)
381{
382   dst->f[0] = src0->f[0] > src1->f[0] ? 1.0f : 0.0f;
383   dst->f[1] = src0->f[1] > src1->f[1] ? 1.0f : 0.0f;
384   dst->f[2] = src0->f[2] > src1->f[2] ? 1.0f : 0.0f;
385   dst->f[3] = src0->f[3] > src1->f[3] ? 1.0f : 0.0f;
386}
387
388static void
389micro_sin(union tgsi_exec_channel *dst,
390          const union tgsi_exec_channel *src)
391{
392   dst->f[0] = sinf(src->f[0]);
393   dst->f[1] = sinf(src->f[1]);
394   dst->f[2] = sinf(src->f[2]);
395   dst->f[3] = sinf(src->f[3]);
396}
397
398static void
399micro_sle(union tgsi_exec_channel *dst,
400          const union tgsi_exec_channel *src0,
401          const union tgsi_exec_channel *src1)
402{
403   dst->f[0] = src0->f[0] <= src1->f[0] ? 1.0f : 0.0f;
404   dst->f[1] = src0->f[1] <= src1->f[1] ? 1.0f : 0.0f;
405   dst->f[2] = src0->f[2] <= src1->f[2] ? 1.0f : 0.0f;
406   dst->f[3] = src0->f[3] <= src1->f[3] ? 1.0f : 0.0f;
407}
408
409static void
410micro_slt(union tgsi_exec_channel *dst,
411          const union tgsi_exec_channel *src0,
412          const union tgsi_exec_channel *src1)
413{
414   dst->f[0] = src0->f[0] < src1->f[0] ? 1.0f : 0.0f;
415   dst->f[1] = src0->f[1] < src1->f[1] ? 1.0f : 0.0f;
416   dst->f[2] = src0->f[2] < src1->f[2] ? 1.0f : 0.0f;
417   dst->f[3] = src0->f[3] < src1->f[3] ? 1.0f : 0.0f;
418}
419
420static void
421micro_sne(union tgsi_exec_channel *dst,
422          const union tgsi_exec_channel *src0,
423          const union tgsi_exec_channel *src1)
424{
425   dst->f[0] = src0->f[0] != src1->f[0] ? 1.0f : 0.0f;
426   dst->f[1] = src0->f[1] != src1->f[1] ? 1.0f : 0.0f;
427   dst->f[2] = src0->f[2] != src1->f[2] ? 1.0f : 0.0f;
428   dst->f[3] = src0->f[3] != src1->f[3] ? 1.0f : 0.0f;
429}
430
431static void
432micro_sfl(union tgsi_exec_channel *dst)
433{
434   dst->f[0] = 0.0f;
435   dst->f[1] = 0.0f;
436   dst->f[2] = 0.0f;
437   dst->f[3] = 0.0f;
438}
439
440static void
441micro_str(union tgsi_exec_channel *dst)
442{
443   dst->f[0] = 1.0f;
444   dst->f[1] = 1.0f;
445   dst->f[2] = 1.0f;
446   dst->f[3] = 1.0f;
447}
448
449static void
450micro_trunc(union tgsi_exec_channel *dst,
451            const union tgsi_exec_channel *src)
452{
453   dst->f[0] = (float)(int)src->f[0];
454   dst->f[1] = (float)(int)src->f[1];
455   dst->f[2] = (float)(int)src->f[2];
456   dst->f[3] = (float)(int)src->f[3];
457}
458
459
/* Numeric indices for the X, Y, Z and W channels. */
#define CHAN_X  0
#define CHAN_Y  1
#define CHAN_Z  2
#define CHAN_W  3

/* Tags naming a float, int or uint data type. */
enum tgsi_exec_datatype {
   TGSI_EXEC_DATA_FLOAT,
   TGSI_EXEC_DATA_INT,
   TGSI_EXEC_DATA_UINT
};

/*
 * Shorthand locations of various utility registers (_I = Index, _C = Channel).
 * Each name simply forwards to the TGSI_EXEC_TEMP_* macro of the same suffix.
 */
#define TEMP_KILMASK_I     TGSI_EXEC_TEMP_KILMASK_I
#define TEMP_KILMASK_C     TGSI_EXEC_TEMP_KILMASK_C
#define TEMP_OUTPUT_I      TGSI_EXEC_TEMP_OUTPUT_I
#define TEMP_OUTPUT_C      TGSI_EXEC_TEMP_OUTPUT_C
#define TEMP_PRIMITIVE_I   TGSI_EXEC_TEMP_PRIMITIVE_I
#define TEMP_PRIMITIVE_C   TGSI_EXEC_TEMP_PRIMITIVE_C
480
481
/**
 * Recompute the execution mask as the bitwise AND of the conditional,
 * loop, continue, switch and function masks.
 *
 * MACH is parenthesized so any pointer expression may be passed; note
 * that it is evaluated several times, so it must have no side effects.
 */
#define UPDATE_EXEC_MASK(MACH) \
      ((MACH)->ExecMask = (MACH)->CondMask & (MACH)->LoopMask & \
                          (MACH)->ContMask & (MACH)->Switch.mask & \
                          (MACH)->FuncMask)
485
486
487static const union tgsi_exec_channel ZeroVec =
488   { { 0.0, 0.0, 0.0, 0.0 } };
489
490static const union tgsi_exec_channel OneVec = {
491   {1.0f, 1.0f, 1.0f, 1.0f}
492};
493
494static const union tgsi_exec_channel P128Vec = {
495   {128.0f, 128.0f, 128.0f, 128.0f}
496};
497
498static const union tgsi_exec_channel M128Vec = {
499   {-128.0f, -128.0f, -128.0f, -128.0f}
500};
501
502
503/**
504 * Assert that none of the float values in 'chan' are infinite or NaN.
505 * NaN and Inf may occur normally during program execution and should
506 * not lead to crashes, etc.  But when debugging, it's helpful to catch
507 * them.
508 */
509static INLINE void
510check_inf_or_nan(const union tgsi_exec_channel *chan)
511{
512   assert(!util_is_inf_or_nan((chan)->f[0]));
513   assert(!util_is_inf_or_nan((chan)->f[1]));
514   assert(!util_is_inf_or_nan((chan)->f[2]));
515   assert(!util_is_inf_or_nan((chan)->f[3]));
516}
517
518
#ifdef DEBUG
/**
 * Debug helper: print the four float lanes of 'chan', prefixed by 'msg'.
 */
static void
print_chan(const char *msg, const union tgsi_exec_channel *chan)
{
   const float *lanes = chan->f;

   debug_printf("%s = {%f, %f, %f, %f}\n",
                msg, lanes[0], lanes[1], lanes[2], lanes[3]);
}
#endif
527
528
#ifdef DEBUG
/**
 * Debug helper: print the X, Y, Z and W channels (four float lanes each)
 * of temporary register 'index'.
 */
static void
print_temp(const struct tgsi_exec_machine *mach, uint index)
{
   unsigned comp;

   debug_printf("Temp[%u] =\n", index);
   for (comp = 0; comp < 4; comp++) {
      const float *lanes = mach->Temps[index].xyzw[comp].f;

      debug_printf("  %c: { %f, %f, %f, %f }\n",
                   "XYZW"[comp],
                   lanes[0], lanes[1], lanes[2], lanes[3]);
   }
}
#endif
546
547
548void
549tgsi_exec_set_constant_buffers(struct tgsi_exec_machine *mach,
550                               unsigned num_bufs,
551                               const void **bufs,
552                               const unsigned *buf_sizes)
553{
554   unsigned i;
555
556   for (i = 0; i < num_bufs; i++) {
557      mach->Consts[i] = bufs[i];
558      mach->ConstsSize[i] = buf_sizes[i];
559   }
560}
561
562
563/**
564 * Check if there's a potential src/dst register data dependency when
565 * using SOA execution.
566 * Example:
567 *   MOV T, T.yxwz;
568 * This would expand into:
569 *   MOV t0, t1;
570 *   MOV t1, t0;
571 *   MOV t2, t3;
572 *   MOV t3, t2;
573 * The second instruction will have the wrong value for t0 if executed as-is.
574 */
575boolean
576tgsi_check_soa_dependencies(const struct tgsi_full_instruction *inst)
577{
578   uint i, chan;
579
580   uint writemask = inst->Dst[0].Register.WriteMask;
581   if (writemask == TGSI_WRITEMASK_X ||
582       writemask == TGSI_WRITEMASK_Y ||
583       writemask == TGSI_WRITEMASK_Z ||
584       writemask == TGSI_WRITEMASK_W ||
585       writemask == TGSI_WRITEMASK_NONE) {
586      /* no chance of data dependency */
587      return FALSE;
588   }
589
590   /* loop over src regs */
591   for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
592      if ((inst->Src[i].Register.File ==
593           inst->Dst[0].Register.File) &&
594          ((inst->Src[i].Register.Index ==
595            inst->Dst[0].Register.Index) ||
596           inst->Src[i].Register.Indirect ||
597           inst->Dst[0].Register.Indirect)) {
598         /* loop over dest channels */
599         uint channelsWritten = 0x0;
600         for (chan = 0; chan < NUM_CHANNELS; chan++) {
601            if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
602               /* check if we're reading a channel that's been written */
603               uint swizzle = tgsi_util_get_full_src_register_swizzle(&inst->Src[i], chan);
604               if (channelsWritten & (1 << swizzle)) {
605                  return TRUE;
606               }
607
608               channelsWritten |= (1 << chan);
609            }
610         }
611      }
612   }
613   return FALSE;
614}
615
616
617/**
618 * Initialize machine state by expanding tokens to full instructions,
619 * allocating temporary storage, setting up constants, etc.
620 * After this, we can call tgsi_exec_machine_run() many times.
621 */
622void
623tgsi_exec_machine_bind_shader(
624   struct tgsi_exec_machine *mach,
625   const struct tgsi_token *tokens,
626   uint numSamplers,
627   struct tgsi_sampler **samplers)
628{
629   uint k;
630   struct tgsi_parse_context parse;
631   struct tgsi_full_instruction *instructions;
632   struct tgsi_full_declaration *declarations;
633   uint maxInstructions = 10, numInstructions = 0;
634   uint maxDeclarations = 10, numDeclarations = 0;
635
636#if 0
637   tgsi_dump(tokens, 0);
638#endif
639
640   util_init_math();
641
642   if (numSamplers) {
643      assert(samplers);
644   }
645
646   mach->Tokens = tokens;
647   mach->Samplers = samplers;
648
649   if (!tokens) {
650      /* unbind and free all */
651      if (mach->Declarations) {
652         FREE( mach->Declarations );
653      }
654      mach->Declarations = NULL;
655      mach->NumDeclarations = 0;
656
657      if (mach->Instructions) {
658         FREE( mach->Instructions );
659      }
660      mach->Instructions = NULL;
661      mach->NumInstructions = 0;
662
663      return;
664   }
665
666   k = tgsi_parse_init (&parse, mach->Tokens);
667   if (k != TGSI_PARSE_OK) {
668      debug_printf( "Problem parsing!\n" );
669      return;
670   }
671
672   mach->Processor = parse.FullHeader.Processor.Processor;
673   mach->ImmLimit = 0;
674
675   if (mach->Processor == TGSI_PROCESSOR_GEOMETRY &&
676       !mach->UsedGeometryShader) {
677      struct tgsi_exec_vector *inputs =
678         align_malloc(sizeof(struct tgsi_exec_vector) *
679                      TGSI_MAX_PRIM_VERTICES * PIPE_MAX_ATTRIBS,
680                      16);
681      struct tgsi_exec_vector *outputs =
682         align_malloc(sizeof(struct tgsi_exec_vector) *
683                      TGSI_MAX_TOTAL_VERTICES, 16);
684
685      if (!inputs)
686         return;
687      if (!outputs) {
688         align_free(inputs);
689         return;
690      }
691
692      align_free(mach->Inputs);
693      align_free(mach->Outputs);
694
695      mach->Inputs = inputs;
696      mach->Outputs = outputs;
697      mach->UsedGeometryShader = TRUE;
698   }
699
700   declarations = (struct tgsi_full_declaration *)
701      MALLOC( maxDeclarations * sizeof(struct tgsi_full_declaration) );
702
703   if (!declarations) {
704      return;
705   }
706
707   instructions = (struct tgsi_full_instruction *)
708      MALLOC( maxInstructions * sizeof(struct tgsi_full_instruction) );
709
710   if (!instructions) {
711      FREE( declarations );
712      return;
713   }
714
715   while( !tgsi_parse_end_of_tokens( &parse ) ) {
716      uint i;
717
718      tgsi_parse_token( &parse );
719      switch( parse.FullToken.Token.Type ) {
720      case TGSI_TOKEN_TYPE_DECLARATION:
721         /* save expanded declaration */
722         if (numDeclarations == maxDeclarations) {
723            declarations = REALLOC(declarations,
724                                   maxDeclarations
725                                   * sizeof(struct tgsi_full_declaration),
726                                   (maxDeclarations + 10)
727                                   * sizeof(struct tgsi_full_declaration));
728            maxDeclarations += 10;
729         }
730         if (parse.FullToken.FullDeclaration.Declaration.File == TGSI_FILE_OUTPUT) {
731            unsigned reg;
732            for (reg = parse.FullToken.FullDeclaration.Range.First;
733                 reg <= parse.FullToken.FullDeclaration.Range.Last;
734                 ++reg) {
735               ++mach->NumOutputs;
736            }
737         }
738         if (parse.FullToken.FullDeclaration.Declaration.File ==
739             TGSI_FILE_IMMEDIATE_ARRAY) {
740            unsigned reg;
741            struct tgsi_full_declaration *decl =
742               &parse.FullToken.FullDeclaration;
743            debug_assert(decl->Range.Last < TGSI_EXEC_NUM_IMMEDIATES);
744            for (reg = decl->Range.First; reg <= decl->Range.Last; ++reg) {
745               for( i = 0; i < 4; i++ ) {
746                  int idx = reg * 4 + i;
747                  mach->ImmArray[reg][i] = decl->ImmediateData.u[idx].Float;
748               }
749            }
750         }
751         memcpy(declarations + numDeclarations,
752                &parse.FullToken.FullDeclaration,
753                sizeof(declarations[0]));
754         numDeclarations++;
755         break;
756
757      case TGSI_TOKEN_TYPE_IMMEDIATE:
758         {
759            uint size = parse.FullToken.FullImmediate.Immediate.NrTokens - 1;
760            assert( size <= 4 );
761            assert( mach->ImmLimit + 1 <= TGSI_EXEC_NUM_IMMEDIATES );
762
763            for( i = 0; i < size; i++ ) {
764               mach->Imms[mach->ImmLimit][i] =
765		  parse.FullToken.FullImmediate.u[i].Float;
766            }
767            mach->ImmLimit += 1;
768         }
769         break;
770
771      case TGSI_TOKEN_TYPE_INSTRUCTION:
772
773         /* save expanded instruction */
774         if (numInstructions == maxInstructions) {
775            instructions = REALLOC(instructions,
776                                   maxInstructions
777                                   * sizeof(struct tgsi_full_instruction),
778                                   (maxInstructions + 10)
779                                   * sizeof(struct tgsi_full_instruction));
780            maxInstructions += 10;
781         }
782
783         memcpy(instructions + numInstructions,
784                &parse.FullToken.FullInstruction,
785                sizeof(instructions[0]));
786
787         numInstructions++;
788         break;
789
790      case TGSI_TOKEN_TYPE_PROPERTY:
791         break;
792
793      default:
794         assert( 0 );
795      }
796   }
797   tgsi_parse_free (&parse);
798
799   if (mach->Declarations) {
800      FREE( mach->Declarations );
801   }
802   mach->Declarations = declarations;
803   mach->NumDeclarations = numDeclarations;
804
805   if (mach->Instructions) {
806      FREE( mach->Instructions );
807   }
808   mach->Instructions = instructions;
809   mach->NumInstructions = numInstructions;
810}
811
812
813struct tgsi_exec_machine *
814tgsi_exec_machine_create( void )
815{
816   struct tgsi_exec_machine *mach;
817   uint i;
818
819   mach = align_malloc( sizeof *mach, 16 );
820   if (!mach)
821      goto fail;
822
823   memset(mach, 0, sizeof(*mach));
824
825   mach->Addrs = &mach->Temps[TGSI_EXEC_TEMP_ADDR];
826   mach->MaxGeometryShaderOutputs = TGSI_MAX_TOTAL_VERTICES;
827   mach->Predicates = &mach->Temps[TGSI_EXEC_TEMP_P0];
828
829   mach->Inputs = align_malloc(sizeof(struct tgsi_exec_vector) * PIPE_MAX_ATTRIBS, 16);
830   mach->Outputs = align_malloc(sizeof(struct tgsi_exec_vector) * PIPE_MAX_ATTRIBS, 16);
831   if (!mach->Inputs || !mach->Outputs)
832      goto fail;
833
834   /* Setup constants needed by the SSE2 executor. */
835   for( i = 0; i < 4; i++ ) {
836      mach->Temps[TGSI_EXEC_TEMP_00000000_I].xyzw[TGSI_EXEC_TEMP_00000000_C].u[i] = 0x00000000;
837      mach->Temps[TGSI_EXEC_TEMP_7FFFFFFF_I].xyzw[TGSI_EXEC_TEMP_7FFFFFFF_C].u[i] = 0x7FFFFFFF;
838      mach->Temps[TGSI_EXEC_TEMP_80000000_I].xyzw[TGSI_EXEC_TEMP_80000000_C].u[i] = 0x80000000;
839      mach->Temps[TGSI_EXEC_TEMP_FFFFFFFF_I].xyzw[TGSI_EXEC_TEMP_FFFFFFFF_C].u[i] = 0xFFFFFFFF;    /* not used */
840      mach->Temps[TGSI_EXEC_TEMP_ONE_I].xyzw[TGSI_EXEC_TEMP_ONE_C].f[i] = 1.0f;
841      mach->Temps[TGSI_EXEC_TEMP_TWO_I].xyzw[TGSI_EXEC_TEMP_TWO_C].f[i] = 2.0f;    /* not used */
842      mach->Temps[TGSI_EXEC_TEMP_128_I].xyzw[TGSI_EXEC_TEMP_128_C].f[i] = 128.0f;
843      mach->Temps[TGSI_EXEC_TEMP_MINUS_128_I].xyzw[TGSI_EXEC_TEMP_MINUS_128_C].f[i] = -128.0f;
844      mach->Temps[TGSI_EXEC_TEMP_THREE_I].xyzw[TGSI_EXEC_TEMP_THREE_C].f[i] = 3.0f;
845      mach->Temps[TGSI_EXEC_TEMP_HALF_I].xyzw[TGSI_EXEC_TEMP_HALF_C].f[i] = 0.5f;
846   }
847
848#ifdef DEBUG
849   /* silence warnings */
850   (void) print_chan;
851   (void) print_temp;
852#endif
853
854   return mach;
855
856fail:
857   if (mach) {
858      align_free(mach->Inputs);
859      align_free(mach->Outputs);
860      align_free(mach);
861   }
862   return NULL;
863}
864
865
866void
867tgsi_exec_machine_destroy(struct tgsi_exec_machine *mach)
868{
869   if (mach) {
870      if (mach->Instructions)
871         FREE(mach->Instructions);
872      if (mach->Declarations)
873         FREE(mach->Declarations);
874
875      align_free(mach->Inputs);
876      align_free(mach->Outputs);
877
878      align_free(mach);
879   }
880}
881
882static void
883micro_add(union tgsi_exec_channel *dst,
884          const union tgsi_exec_channel *src0,
885          const union tgsi_exec_channel *src1)
886{
887   dst->f[0] = src0->f[0] + src1->f[0];
888   dst->f[1] = src0->f[1] + src1->f[1];
889   dst->f[2] = src0->f[2] + src1->f[2];
890   dst->f[3] = src0->f[3] + src1->f[3];
891}
892
893static void
894micro_div(
895   union tgsi_exec_channel *dst,
896   const union tgsi_exec_channel *src0,
897   const union tgsi_exec_channel *src1 )
898{
899   if (src1->f[0] != 0) {
900      dst->f[0] = src0->f[0] / src1->f[0];
901   }
902   if (src1->f[1] != 0) {
903      dst->f[1] = src0->f[1] / src1->f[1];
904   }
905   if (src1->f[2] != 0) {
906      dst->f[2] = src0->f[2] / src1->f[2];
907   }
908   if (src1->f[3] != 0) {
909      dst->f[3] = src0->f[3] / src1->f[3];
910   }
911}
912
913static void
914micro_rcc(union tgsi_exec_channel *dst,
915          const union tgsi_exec_channel *src)
916{
917   uint i;
918
919   for (i = 0; i < 4; i++) {
920      float recip = 1.0f / src->f[i];
921
922      if (recip > 0.0f) {
923         if (recip > 1.884467e+019f) {
924            dst->f[i] = 1.884467e+019f;
925         }
926         else if (recip < 5.42101e-020f) {
927            dst->f[i] = 5.42101e-020f;
928         }
929         else {
930            dst->f[i] = recip;
931         }
932      }
933      else {
934         if (recip < -1.884467e+019f) {
935            dst->f[i] = -1.884467e+019f;
936         }
937         else if (recip > -5.42101e-020f) {
938            dst->f[i] = -5.42101e-020f;
939         }
940         else {
941            dst->f[i] = recip;
942         }
943      }
944   }
945}
946
947static void
948micro_lt(
949   union tgsi_exec_channel *dst,
950   const union tgsi_exec_channel *src0,
951   const union tgsi_exec_channel *src1,
952   const union tgsi_exec_channel *src2,
953   const union tgsi_exec_channel *src3 )
954{
955   dst->f[0] = src0->f[0] < src1->f[0] ? src2->f[0] : src3->f[0];
956   dst->f[1] = src0->f[1] < src1->f[1] ? src2->f[1] : src3->f[1];
957   dst->f[2] = src0->f[2] < src1->f[2] ? src2->f[2] : src3->f[2];
958   dst->f[3] = src0->f[3] < src1->f[3] ? src2->f[3] : src3->f[3];
959}
960
961static void
962micro_max(union tgsi_exec_channel *dst,
963          const union tgsi_exec_channel *src0,
964          const union tgsi_exec_channel *src1)
965{
966   dst->f[0] = src0->f[0] > src1->f[0] ? src0->f[0] : src1->f[0];
967   dst->f[1] = src0->f[1] > src1->f[1] ? src0->f[1] : src1->f[1];
968   dst->f[2] = src0->f[2] > src1->f[2] ? src0->f[2] : src1->f[2];
969   dst->f[3] = src0->f[3] > src1->f[3] ? src0->f[3] : src1->f[3];
970}
971
972static void
973micro_min(union tgsi_exec_channel *dst,
974          const union tgsi_exec_channel *src0,
975          const union tgsi_exec_channel *src1)
976{
977   dst->f[0] = src0->f[0] < src1->f[0] ? src0->f[0] : src1->f[0];
978   dst->f[1] = src0->f[1] < src1->f[1] ? src0->f[1] : src1->f[1];
979   dst->f[2] = src0->f[2] < src1->f[2] ? src0->f[2] : src1->f[2];
980   dst->f[3] = src0->f[3] < src1->f[3] ? src0->f[3] : src1->f[3];
981}
982
983static void
984micro_mul(union tgsi_exec_channel *dst,
985          const union tgsi_exec_channel *src0,
986          const union tgsi_exec_channel *src1)
987{
988   dst->f[0] = src0->f[0] * src1->f[0];
989   dst->f[1] = src0->f[1] * src1->f[1];
990   dst->f[2] = src0->f[2] * src1->f[2];
991   dst->f[3] = src0->f[3] * src1->f[3];
992}
993
994static void
995micro_neg(
996   union tgsi_exec_channel *dst,
997   const union tgsi_exec_channel *src )
998{
999   dst->f[0] = -src->f[0];
1000   dst->f[1] = -src->f[1];
1001   dst->f[2] = -src->f[2];
1002   dst->f[3] = -src->f[3];
1003}
1004
1005static void
1006micro_pow(
1007   union tgsi_exec_channel *dst,
1008   const union tgsi_exec_channel *src0,
1009   const union tgsi_exec_channel *src1 )
1010{
1011#if FAST_MATH
1012   dst->f[0] = util_fast_pow( src0->f[0], src1->f[0] );
1013   dst->f[1] = util_fast_pow( src0->f[1], src1->f[1] );
1014   dst->f[2] = util_fast_pow( src0->f[2], src1->f[2] );
1015   dst->f[3] = util_fast_pow( src0->f[3], src1->f[3] );
1016#else
1017   dst->f[0] = powf( src0->f[0], src1->f[0] );
1018   dst->f[1] = powf( src0->f[1], src1->f[1] );
1019   dst->f[2] = powf( src0->f[2], src1->f[2] );
1020   dst->f[3] = powf( src0->f[3], src1->f[3] );
1021#endif
1022}
1023
1024static void
1025micro_sub(union tgsi_exec_channel *dst,
1026          const union tgsi_exec_channel *src0,
1027          const union tgsi_exec_channel *src1)
1028{
1029   dst->f[0] = src0->f[0] - src1->f[0];
1030   dst->f[1] = src0->f[1] - src1->f[1];
1031   dst->f[2] = src0->f[2] - src1->f[2];
1032   dst->f[3] = src0->f[3] - src1->f[3];
1033}
1034
1035static void
1036fetch_src_file_channel(const struct tgsi_exec_machine *mach,
1037                       const uint file,
1038                       const uint swizzle,
1039                       const union tgsi_exec_channel *index,
1040                       const union tgsi_exec_channel *index2D,
1041                       union tgsi_exec_channel *chan)
1042{
1043   uint i;
1044
1045   assert(swizzle < 4);
1046
1047   switch (file) {
1048   case TGSI_FILE_CONSTANT:
1049      for (i = 0; i < QUAD_SIZE; i++) {
1050         assert(index2D->i[i] >= 0 && index2D->i[i] < PIPE_MAX_CONSTANT_BUFFERS);
1051         assert(mach->Consts[index2D->i[i]]);
1052
1053         if (index->i[i] < 0) {
1054            chan->u[i] = 0;
1055         } else {
1056            /* NOTE: copying the const value as a uint instead of float */
1057            const uint constbuf = index2D->i[i];
1058            const uint *buf = (const uint *)mach->Consts[constbuf];
1059            const int pos = index->i[i] * 4 + swizzle;
1060            /* const buffer bounds check */
1061            if (pos < 0 || pos >= mach->ConstsSize[constbuf]) {
1062               if (0) {
1063                  /* Debug: print warning */
1064                  static int count = 0;
1065                  if (count++ < 100)
1066                     debug_printf("TGSI Exec: const buffer index %d"
1067                                  " out of bounds\n", pos);
1068               }
1069               chan->u[i] = 0;
1070            }
1071            else
1072               chan->u[i] = buf[pos];
1073         }
1074      }
1075      break;
1076
1077   case TGSI_FILE_INPUT:
1078      for (i = 0; i < QUAD_SIZE; i++) {
1079         /*
1080         if (TGSI_PROCESSOR_GEOMETRY == mach->Processor) {
1081            debug_printf("Fetching Input[%d] (2d=%d, 1d=%d)\n",
1082                         index2D->i[i] * TGSI_EXEC_MAX_INPUT_ATTRIBS + index->i[i],
1083                         index2D->i[i], index->i[i]);
1084                         }*/
1085         int pos = index2D->i[i] * TGSI_EXEC_MAX_INPUT_ATTRIBS + index->i[i];
1086         assert(pos >= 0);
1087         assert(pos < TGSI_MAX_PRIM_VERTICES * PIPE_MAX_ATTRIBS);
1088         chan->u[i] = mach->Inputs[pos].xyzw[swizzle].u[i];
1089      }
1090      break;
1091
1092   case TGSI_FILE_SYSTEM_VALUE:
1093      /* XXX no swizzling at this point.  Will be needed if we put
1094       * gl_FragCoord, for example, in a sys value register.
1095       */
1096      for (i = 0; i < QUAD_SIZE; i++) {
1097         chan->f[i] = mach->SystemValue[index->i[i]][0];
1098      }
1099      break;
1100
1101   case TGSI_FILE_TEMPORARY:
1102      for (i = 0; i < QUAD_SIZE; i++) {
1103         assert(index->i[i] < TGSI_EXEC_NUM_TEMPS);
1104         assert(index2D->i[i] == 0);
1105
1106         chan->u[i] = mach->Temps[index->i[i]].xyzw[swizzle].u[i];
1107      }
1108      break;
1109
1110   case TGSI_FILE_TEMPORARY_ARRAY:
1111      for (i = 0; i < QUAD_SIZE; i++) {
1112         assert(index->i[i] < TGSI_EXEC_NUM_TEMPS);
1113         assert(index2D->i[i] < TGSI_EXEC_NUM_TEMP_ARRAYS);
1114
1115         chan->u[i] =
1116            mach->TempArray[index2D->i[i]][index->i[i]].xyzw[swizzle].u[i];
1117      }
1118      break;
1119
1120   case TGSI_FILE_IMMEDIATE:
1121      for (i = 0; i < QUAD_SIZE; i++) {
1122         assert(index->i[i] >= 0 && index->i[i] < (int)mach->ImmLimit);
1123         assert(index2D->i[i] == 0);
1124
1125         chan->f[i] = mach->Imms[index->i[i]][swizzle];
1126      }
1127      break;
1128
1129   case TGSI_FILE_IMMEDIATE_ARRAY:
1130      for (i = 0; i < QUAD_SIZE; i++) {
1131         assert(index2D->i[i] == 0);
1132
1133         chan->f[i] = mach->ImmArray[index->i[i]][swizzle];
1134      }
1135      break;
1136
1137   case TGSI_FILE_ADDRESS:
1138      for (i = 0; i < QUAD_SIZE; i++) {
1139         assert(index->i[i] >= 0);
1140         assert(index2D->i[i] == 0);
1141
1142         chan->u[i] = mach->Addrs[index->i[i]].xyzw[swizzle].u[i];
1143      }
1144      break;
1145
1146   case TGSI_FILE_PREDICATE:
1147      for (i = 0; i < QUAD_SIZE; i++) {
1148         assert(index->i[i] >= 0 && index->i[i] < TGSI_EXEC_NUM_PREDS);
1149         assert(index2D->i[i] == 0);
1150
1151         chan->u[i] = mach->Predicates[0].xyzw[swizzle].u[i];
1152      }
1153      break;
1154
1155   case TGSI_FILE_OUTPUT:
1156      /* vertex/fragment output vars can be read too */
1157      for (i = 0; i < QUAD_SIZE; i++) {
1158         assert(index->i[i] >= 0);
1159         assert(index2D->i[i] == 0);
1160
1161         chan->u[i] = mach->Outputs[index->i[i]].xyzw[swizzle].u[i];
1162      }
1163      break;
1164
1165   default:
1166      assert(0);
1167      for (i = 0; i < QUAD_SIZE; i++) {
1168         chan->u[i] = 0;
1169      }
1170   }
1171}
1172
/**
 * Fetch one component of a source operand for all four quad elements.
 *
 * The register index starts as reg->Register.Index and, for indirectly
 * addressed operands, has the selected address register component added
 * to it.  An optional second (dimension) index is resolved the same way.
 * The component is then read through fetch_src_file_channel() using the
 * operand's own swizzle, and finally the operand's Absolute and Negate
 * modifiers are applied.
 *
 * mach          interpreter state to read from
 * chan          receives the fetched (and modified) values
 * reg           source operand description
 * chan_index    component being fetched, before the operand swizzle
 * src_datatype  TGSI_EXEC_DATA_FLOAT selects the float abs/neg helpers,
 *               any other value selects the integer ones
 */
static void
fetch_source(const struct tgsi_exec_machine *mach,
             union tgsi_exec_channel *chan,
             const struct tgsi_full_src_register *reg,
             const uint chan_index,
             enum tgsi_exec_datatype src_datatype)
{
   union tgsi_exec_channel index;
   union tgsi_exec_channel index2D;
   uint swizzle;   /* reused: address swizzle first, operand swizzle last */

   /* We start with a direct index into a register file.
    *
    *    file[1],
    *    where:
    *       file = Register.File
    *       [1] = Register.Index
    */
   index.i[0] =
   index.i[1] =
   index.i[2] =
   index.i[3] = reg->Register.Index;

   /* There is an extra source register that indirectly subscripts
    * a register file. The direct index now becomes an offset
    * that is being added to the indirect register.
    *
    *    file[ind[2].x+1],
    *    where:
    *       ind = Indirect.File
    *       [2] = Indirect.Index
    *       .x = Indirect.SwizzleX
    */
   if (reg->Register.Indirect) {
      union tgsi_exec_channel index2;
      union tgsi_exec_channel indir_index;
      const uint execmask = mach->ExecMask;
      uint i;

      /* which address register (always zero now) */
      index2.i[0] =
      index2.i[1] =
      index2.i[2] =
      index2.i[3] = reg->Indirect.Index;
      assert(reg->Indirect.File == TGSI_FILE_ADDRESS);
      /* get current value of address register[swizzle] */
      swizzle = tgsi_util_get_src_register_swizzle( &reg->Indirect, CHAN_X );
      fetch_src_file_channel(mach,
                             reg->Indirect.File,
                             swizzle,
                             &index2,
                             &ZeroVec,
                             &indir_index);

      /* add value of address register to the offset */
      index.i[0] += indir_index.i[0];
      index.i[1] += indir_index.i[1];
      index.i[2] += indir_index.i[2];
      index.i[3] += indir_index.i[3];

      /* for disabled execution channels, zero-out the index to
       * avoid using a potential garbage value.
       */
      for (i = 0; i < QUAD_SIZE; i++) {
         if ((execmask & (1 << i)) == 0)
            index.i[i] = 0;
      }
   }

   /* There is an extra source register that is a second
    * subscript to a register file. Effectively it means that
    * the register file is actually a 2D array of registers.
    *
    *    file[3][1],
    *    where:
    *       [3] = Dimension.Index
    */
   if (reg->Register.Dimension) {
      index2D.i[0] =
      index2D.i[1] =
      index2D.i[2] =
      index2D.i[3] = reg->Dimension.Index;

      /* Again, the second subscript index can be addressed indirectly
       * identically to the first one.
       * Nothing stops us from indirectly addressing the indirect register,
       * but there is no need for that, so we won't exercise it.
       *
       *    file[ind[4].y+3][1],
       *    where:
       *       ind = DimIndirect.File
       *       [4] = DimIndirect.Index
       *       .y = DimIndirect.SwizzleX
       */
      if (reg->Dimension.Indirect) {
         union tgsi_exec_channel index2;
         union tgsi_exec_channel indir_index;
         const uint execmask = mach->ExecMask;
         uint i;

         index2.i[0] =
         index2.i[1] =
         index2.i[2] =
         index2.i[3] = reg->DimIndirect.Index;

         /* unlike the first-dimension case above, the file of the
          * indirection register is not asserted here
          */
         swizzle = tgsi_util_get_src_register_swizzle( &reg->DimIndirect, CHAN_X );
         fetch_src_file_channel(mach,
                                reg->DimIndirect.File,
                                swizzle,
                                &index2,
                                &ZeroVec,
                                &indir_index);

         index2D.i[0] += indir_index.i[0];
         index2D.i[1] += indir_index.i[1];
         index2D.i[2] += indir_index.i[2];
         index2D.i[3] += indir_index.i[3];

         /* for disabled execution channels, zero-out the index to
          * avoid using a potential garbage value.
          */
         for (i = 0; i < QUAD_SIZE; i++) {
            if ((execmask & (1 << i)) == 0) {
               index2D.i[i] = 0;
            }
         }
      }

      /* If by any chance there was a need for a 3D array of register
       * files, we would have to check whether Dimension is followed
       * by a dimension register and continue the saga.
       */
   } else {
      /* no second dimension: every element uses row 0 */
      index2D.i[0] =
      index2D.i[1] =
      index2D.i[2] =
      index2D.i[3] = 0;
   }

   /* map the requested component through the operand's swizzle and
    * read it from the register file
    */
   swizzle = tgsi_util_get_full_src_register_swizzle( reg, chan_index );
   fetch_src_file_channel(mach,
                          reg->Register.File,
                          swizzle,
                          &index,
                          &index2D,
                          chan);

   /* Source modifiers, applied in place.  Absolute runs before Negate,
    * so an operand with both set ends up as the negated absolute value.
    */
   if (reg->Register.Absolute) {
      if (src_datatype == TGSI_EXEC_DATA_FLOAT) {
         micro_abs(chan, chan);
      } else {
         micro_iabs(chan, chan);
      }
   }

   if (reg->Register.Negate) {
      if (src_datatype == TGSI_EXEC_DATA_FLOAT) {
         micro_neg(chan, chan);
      } else {
         micro_ineg(chan, chan);
      }
   }
}
1336
/**
 * Write one component of an instruction result to its destination
 * register, honouring the execution mask, the instruction's predicate
 * and its saturate mode.
 *
 * mach          interpreter state to write to
 * chan          values to store, one per quad element
 * reg           destination operand description
 * inst          instruction being executed (supplies Predicate and
 *               Saturate settings)
 * chan_index    destination component to write
 * dst_datatype  only consulted by the (currently disabled) inf/nan
 *               debug check; the saturate modes always compare the
 *               values as floats
 *
 * Only quad elements whose bit is set in the execution mask, and which
 * pass the predicate test if the instruction is predicated, are written.
 * An unsupported destination file asserts and returns without storing.
 */
static void
store_dest(struct tgsi_exec_machine *mach,
           const union tgsi_exec_channel *chan,
           const struct tgsi_full_dst_register *reg,
           const struct tgsi_full_instruction *inst,
           uint chan_index,
           enum tgsi_exec_datatype dst_datatype)
{
   uint i;
   union tgsi_exec_channel null;   /* scratch target for TGSI_FILE_NULL */
   union tgsi_exec_channel *dst;
   union tgsi_exec_channel index2D;
   uint execmask = mach->ExecMask;
   int offset = 0;  /* indirection offset */
   int index;

   /* for debugging */
   if (0 && dst_datatype == TGSI_EXEC_DATA_FLOAT) {
      check_inf_or_nan(chan);
   }

   /* There is an extra source register that indirectly subscripts
    * a register file. The direct index now becomes an offset
    * that is being added to the indirect register.
    *
    *    file[ind[2].x+1],
    *    where:
    *       ind = Indirect.File
    *       [2] = Indirect.Index
    *       .x = Indirect.SwizzleX
    */
   if (reg->Register.Indirect) {
      /* NOTE: this 'index' shadows the function-level int of the same name */
      union tgsi_exec_channel index;
      union tgsi_exec_channel indir_index;
      uint swizzle;

      /* which address register (always zero for now) */
      index.i[0] =
      index.i[1] =
      index.i[2] =
      index.i[3] = reg->Indirect.Index;

      /* get current value of address register[swizzle] */
      swizzle = tgsi_util_get_src_register_swizzle( &reg->Indirect, CHAN_X );

      /* fetch values from the address/indirection register */
      fetch_src_file_channel(mach,
                             reg->Indirect.File,
                             swizzle,
                             &index,
                             &ZeroVec,
                             &indir_index);

      /* save indirection offset */
      /* NOTE: only quad element 0's address value is used; the same
       * offset is applied to all four elements, and it is taken even if
       * element 0 is disabled in the execution mask.
       */
      offset = indir_index.i[0];
   }

   /* There is an extra source register that is a second
    * subscript to a register file. Effectively it means that
    * the register file is actually a 2D array of registers.
    *
    *    file[3][1],
    *    where:
    *       [3] = Dimension.Index
    */
   if (reg->Register.Dimension) {
      index2D.i[0] =
      index2D.i[1] =
      index2D.i[2] =
      index2D.i[3] = reg->Dimension.Index;

      /* Again, the second subscript index can be addressed indirectly
       * identically to the first one.
       * Nothing stops us from indirectly addressing the indirect register,
       * but there is no need for that, so we won't exercise it.
       *
       *    file[ind[4].y+3][1],
       *    where:
       *       ind = DimIndirect.File
       *       [4] = DimIndirect.Index
       *       .y = DimIndirect.SwizzleX
       */
      if (reg->Dimension.Indirect) {
         /* NOTE: 'execmask' and 'i' below shadow the function-level
          * variables; the outer execmask is not modified here.
          */
         union tgsi_exec_channel index2;
         union tgsi_exec_channel indir_index;
         const uint execmask = mach->ExecMask;
         unsigned swizzle;
         uint i;

         index2.i[0] =
         index2.i[1] =
         index2.i[2] =
         index2.i[3] = reg->DimIndirect.Index;

         swizzle = tgsi_util_get_src_register_swizzle( &reg->DimIndirect, CHAN_X );
         fetch_src_file_channel(mach,
                                reg->DimIndirect.File,
                                swizzle,
                                &index2,
                                &ZeroVec,
                                &indir_index);

         index2D.i[0] += indir_index.i[0];
         index2D.i[1] += indir_index.i[1];
         index2D.i[2] += indir_index.i[2];
         index2D.i[3] += indir_index.i[3];

         /* for disabled execution channels, zero-out the index to
          * avoid using a potential garbage value.
          */
         for (i = 0; i < QUAD_SIZE; i++) {
            if ((execmask & (1 << i)) == 0) {
               index2D.i[i] = 0;
            }
         }
      }

      /* If by any chance there was a need for a 3D array of register
       * files, we would have to check whether Dimension is followed
       * by a dimension register and continue the saga.
       */
   } else {
      index2D.i[0] =
      index2D.i[1] =
      index2D.i[2] =
      index2D.i[3] = 0;
   }

   /* Pick the destination channel.  The indirection offset is applied to
    * the OUTPUT, TEMPORARY and TEMPORARY_ARRAY files only; ADDRESS and
    * PREDICATE destinations use the direct register index.
    */
   switch (reg->Register.File) {
   case TGSI_FILE_NULL:
      /* result is discarded: write into the local scratch channel */
      dst = &null;
      break;

   case TGSI_FILE_OUTPUT:
      /* outputs are additionally offset by the base stored in the
       * TEMP_OUTPUT temp, which emit_vertex() advances by NumOutputs
       */
      index = mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0]
         + reg->Register.Index;
      dst = &mach->Outputs[offset + index].xyzw[chan_index];
#if 0
      if (TGSI_PROCESSOR_GEOMETRY == mach->Processor) {
         fprintf(stderr, "STORING OUT[%d] mask(%d), = (", offset + index, execmask);
         for (i = 0; i < QUAD_SIZE; i++)
            if (execmask & (1 << i))
               fprintf(stderr, "%f, ", chan->f[i]);
         fprintf(stderr, ")\n");
      }
#endif
      break;

   case TGSI_FILE_TEMPORARY:
      index = reg->Register.Index;
      /* NOTE: the assert covers the direct index only, not offset + index */
      assert( index < TGSI_EXEC_NUM_TEMPS );
      dst = &mach->Temps[offset + index].xyzw[chan_index];
      break;

   case TGSI_FILE_TEMPORARY_ARRAY:
      index = reg->Register.Index;
      assert( index < TGSI_EXEC_NUM_TEMPS );
      assert( index2D.i[0] < TGSI_EXEC_NUM_TEMP_ARRAYS );
      /* XXX we use index2D.i[0] here but somehow we might
       * end up with someone trying to store indirectly in
       * different buffers */
      dst = &mach->TempArray[index2D.i[0]][offset + index].xyzw[chan_index];
      break;

   case TGSI_FILE_ADDRESS:
      index = reg->Register.Index;
      dst = &mach->Addrs[index].xyzw[chan_index];
      break;

   case TGSI_FILE_PREDICATE:
      index = reg->Register.Index;
      assert(index < TGSI_EXEC_NUM_PREDS);
      dst = &mach->Predicates[index].xyzw[chan_index];
      break;

   default:
      assert( 0 );
      return;
   }

   /* Predicated instruction: remove from the write mask every quad
    * element that fails the predicate test for this component.
    */
   if (inst->Instruction.Predicate) {
      uint swizzle;
      union tgsi_exec_channel *pred;

      /* the predicate has its own swizzle per destination component */
      switch (chan_index) {
      case CHAN_X:
         swizzle = inst->Predicate.SwizzleX;
         break;
      case CHAN_Y:
         swizzle = inst->Predicate.SwizzleY;
         break;
      case CHAN_Z:
         swizzle = inst->Predicate.SwizzleZ;
         break;
      case CHAN_W:
         swizzle = inst->Predicate.SwizzleW;
         break;
      default:
         assert(0);
         return;
      }

      assert(inst->Predicate.Index == 0);

      pred = &mach->Predicates[inst->Predicate.Index].xyzw[swizzle];

      if (inst->Predicate.Negate) {
         /* negated: elements with a non-zero predicate are skipped */
         for (i = 0; i < QUAD_SIZE; i++) {
            if (pred->u[i]) {
               execmask &= ~(1 << i);
            }
         }
      } else {
         /* normal: elements with a zero predicate are skipped */
         for (i = 0; i < QUAD_SIZE; i++) {
            if (!pred->u[i]) {
               execmask &= ~(1 << i);
            }
         }
      }
   }

   /* Store the enabled elements.  Unclamped values are copied through
    * the integer view of the channel, i.e. bit-for-bit.
    */
   switch (inst->Instruction.Saturate) {
   case TGSI_SAT_NONE:
      for (i = 0; i < QUAD_SIZE; i++)
         if (execmask & (1 << i))
            dst->i[i] = chan->i[i];
      break;

   case TGSI_SAT_ZERO_ONE:
      /* clamp to [0, 1]; a value for which both comparisons are false
       * (in range, or NaN) is stored unchanged
       */
      for (i = 0; i < QUAD_SIZE; i++)
         if (execmask & (1 << i)) {
            if (chan->f[i] < 0.0f)
               dst->f[i] = 0.0f;
            else if (chan->f[i] > 1.0f)
               dst->f[i] = 1.0f;
            else
               dst->i[i] = chan->i[i];
         }
      break;

   case TGSI_SAT_MINUS_PLUS_ONE:
      /* clamp to [-1, 1]; a value for which both comparisons are false
       * (in range, or NaN) is stored unchanged
       */
      for (i = 0; i < QUAD_SIZE; i++)
         if (execmask & (1 << i)) {
            if (chan->f[i] < -1.0f)
               dst->f[i] = -1.0f;
            else if (chan->f[i] > 1.0f)
               dst->f[i] = 1.0f;
            else
               dst->i[i] = chan->i[i];
         }
      break;

   default:
      assert( 0 );
   }
}
1593
/*
 * Shorthands for fetch_source() on source operand INDEX of the current
 * instruction.  Both expand to code that refers to variables named
 * 'mach' and 'inst', so those must be in scope wherever the macros are
 * used.  FETCH passes TGSI_EXEC_DATA_FLOAT as the operand datatype,
 * IFETCH passes TGSI_EXEC_DATA_INT.
 */
#define FETCH(VAL,INDEX,CHAN)\
    fetch_source(mach, VAL, &inst->Src[INDEX], CHAN, TGSI_EXEC_DATA_FLOAT)

#define IFETCH(VAL,INDEX,CHAN)\
    fetch_source(mach, VAL, &inst->Src[INDEX], CHAN, TGSI_EXEC_DATA_INT)
1599
1600
1601/**
1602 * Execute ARB-style KIL which is predicated by a src register.
1603 * Kill fragment if any of the four values is less than zero.
1604 */
1605static void
1606exec_kil(struct tgsi_exec_machine *mach,
1607         const struct tgsi_full_instruction *inst)
1608{
1609   uint uniquemask;
1610   uint chan_index;
1611   uint kilmask = 0; /* bit 0 = pixel 0, bit 1 = pixel 1, etc */
1612   union tgsi_exec_channel r[1];
1613
1614   /* This mask stores component bits that were already tested. */
1615   uniquemask = 0;
1616
1617   for (chan_index = 0; chan_index < 4; chan_index++)
1618   {
1619      uint swizzle;
1620      uint i;
1621
1622      /* unswizzle channel */
1623      swizzle = tgsi_util_get_full_src_register_swizzle (
1624                        &inst->Src[0],
1625                        chan_index);
1626
1627      /* check if the component has not been already tested */
1628      if (uniquemask & (1 << swizzle))
1629         continue;
1630      uniquemask |= 1 << swizzle;
1631
1632      FETCH(&r[0], 0, chan_index);
1633      for (i = 0; i < 4; i++)
1634         if (r[0].f[i] < 0.0f)
1635            kilmask |= 1 << i;
1636   }
1637
1638   mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] |= kilmask;
1639}
1640
1641/**
1642 * Execute NVIDIA-style KIL which is predicated by a condition code.
1643 * Kill fragment if the condition code is TRUE.
1644 */
1645static void
1646exec_kilp(struct tgsi_exec_machine *mach,
1647          const struct tgsi_full_instruction *inst)
1648{
1649   uint kilmask; /* bit 0 = pixel 0, bit 1 = pixel 1, etc */
1650
1651   /* "unconditional" kil */
1652   kilmask = mach->ExecMask;
1653   mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] |= kilmask;
1654}
1655
1656static void
1657emit_vertex(struct tgsi_exec_machine *mach)
1658{
1659   /* FIXME: check for exec mask correctly
1660   unsigned i;
1661   for (i = 0; i < QUAD_SIZE; ++i) {
1662         if ((mach->ExecMask & (1 << i)))
1663   */
1664   if (mach->ExecMask) {
1665      mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] += mach->NumOutputs;
1666      mach->Primitives[mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]]++;
1667   }
1668}
1669
1670static void
1671emit_primitive(struct tgsi_exec_machine *mach)
1672{
1673   unsigned *prim_count = &mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0];
1674   /* FIXME: check for exec mask correctly
1675   unsigned i;
1676   for (i = 0; i < QUAD_SIZE; ++i) {
1677         if ((mach->ExecMask & (1 << i)))
1678   */
1679   if (mach->ExecMask) {
1680      ++(*prim_count);
1681      debug_assert((*prim_count * mach->NumOutputs) < mach->MaxGeometryShaderOutputs);
1682      mach->Primitives[*prim_count] = 0;
1683   }
1684}
1685
1686static void
1687conditional_emit_primitive(struct tgsi_exec_machine *mach)
1688{
1689   if (TGSI_PROCESSOR_GEOMETRY == mach->Processor) {
1690      int emitted_verts =
1691         mach->Primitives[mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]];
1692      if (emitted_verts) {
1693         emit_primitive(mach);
1694      }
1695   }
1696}
1697
1698
1699/*
1700 * Fetch four texture samples using STR texture coordinates.
1701 */
1702static void
1703fetch_texel( struct tgsi_sampler *sampler,
1704             const union tgsi_exec_channel *s,
1705             const union tgsi_exec_channel *t,
1706             const union tgsi_exec_channel *p,
1707             const union tgsi_exec_channel *c0,
1708             enum tgsi_sampler_control control,
1709             union tgsi_exec_channel *r,
1710             union tgsi_exec_channel *g,
1711             union tgsi_exec_channel *b,
1712             union tgsi_exec_channel *a )
1713{
1714   uint j;
1715   float rgba[NUM_CHANNELS][QUAD_SIZE];
1716
1717   sampler->get_samples(sampler, s->f, t->f, p->f, c0->f, control, rgba);
1718
1719   for (j = 0; j < 4; j++) {
1720      r->f[j] = rgba[0][j];
1721      g->f[j] = rgba[1][j];
1722      b->f[j] = rgba[2][j];
1723      a->f[j] = rgba[3][j];
1724   }
1725}
1726
1727
/*
 * Values for the 'modifier' argument of exec_tex():
 *   NONE          plain lookup, LOD argument is zero
 *   PROJECTED     coordinates are divided by Src[0].w before sampling
 *   LOD_BIAS      Src[0].w is passed to the sampler as the LOD argument
 *                 with tgsi_sampler_lod_bias control
 *   EXPLICIT_LOD  Src[0].w is passed to the sampler as the LOD argument
 *                 with tgsi_sampler_lod_explicit control
 */
#define TEX_MODIFIER_NONE           0
#define TEX_MODIFIER_PROJECTED      1
#define TEX_MODIFIER_LOD_BIAS       2
#define TEX_MODIFIER_EXPLICIT_LOD   3
1732
1733
1734static void
1735exec_tex(struct tgsi_exec_machine *mach,
1736         const struct tgsi_full_instruction *inst,
1737         uint modifier)
1738{
1739   const uint unit = inst->Src[1].Register.Index;
1740   union tgsi_exec_channel r[4];
1741   const union tgsi_exec_channel *lod = &ZeroVec;
1742   enum tgsi_sampler_control control;
1743   uint chan;
1744
1745   if (modifier != TEX_MODIFIER_NONE) {
1746      FETCH(&r[3], 0, CHAN_W);
1747      if (modifier != TEX_MODIFIER_PROJECTED) {
1748         lod = &r[3];
1749      }
1750   }
1751
1752   if (modifier == TEX_MODIFIER_EXPLICIT_LOD) {
1753      control = tgsi_sampler_lod_explicit;
1754   } else {
1755      control = tgsi_sampler_lod_bias;
1756   }
1757
1758   switch (inst->Texture.Texture) {
1759   case TGSI_TEXTURE_1D:
1760   case TGSI_TEXTURE_SHADOW1D:
1761      FETCH(&r[0], 0, CHAN_X);
1762
1763      if (modifier == TEX_MODIFIER_PROJECTED) {
1764         micro_div(&r[0], &r[0], &r[3]);
1765      }
1766
1767      fetch_texel(mach->Samplers[unit],
1768                  &r[0], &ZeroVec, &ZeroVec, lod,  /* S, T, P, LOD */
1769                  control,
1770                  &r[0], &r[1], &r[2], &r[3]);     /* R, G, B, A */
1771      break;
1772
1773   case TGSI_TEXTURE_2D:
1774   case TGSI_TEXTURE_RECT:
1775   case TGSI_TEXTURE_SHADOW2D:
1776   case TGSI_TEXTURE_SHADOWRECT:
1777      FETCH(&r[0], 0, CHAN_X);
1778      FETCH(&r[1], 0, CHAN_Y);
1779      FETCH(&r[2], 0, CHAN_Z);
1780
1781      if (modifier == TEX_MODIFIER_PROJECTED) {
1782         micro_div(&r[0], &r[0], &r[3]);
1783         micro_div(&r[1], &r[1], &r[3]);
1784         micro_div(&r[2], &r[2], &r[3]);
1785      }
1786
1787      fetch_texel(mach->Samplers[unit],
1788                  &r[0], &r[1], &r[2], lod,     /* S, T, P, LOD */
1789                  control,
1790                  &r[0], &r[1], &r[2], &r[3]);  /* outputs */
1791      break;
1792
1793   case TGSI_TEXTURE_1D_ARRAY:
1794      FETCH(&r[0], 0, CHAN_X);
1795      FETCH(&r[1], 0, CHAN_Y);
1796
1797      if (modifier == TEX_MODIFIER_PROJECTED) {
1798         micro_div(&r[0], &r[0], &r[3]);
1799      }
1800
1801      fetch_texel(mach->Samplers[unit],
1802                  &r[0], &r[1], &r[2], lod,     /* S, T, P, LOD */
1803                  control,
1804                  &r[0], &r[1], &r[2], &r[3]);  /* outputs */
1805      break;
1806
1807   case TGSI_TEXTURE_2D_ARRAY:
1808      FETCH(&r[0], 0, CHAN_X);
1809      FETCH(&r[1], 0, CHAN_Y);
1810      FETCH(&r[2], 0, CHAN_Z);
1811
1812      if (modifier == TEX_MODIFIER_PROJECTED) {
1813         micro_div(&r[0], &r[0], &r[3]);
1814         micro_div(&r[1], &r[1], &r[3]);
1815      }
1816
1817      fetch_texel(mach->Samplers[unit],
1818                  &r[0], &r[1], &r[2], lod,     /* S, T, P, LOD */
1819                  control,
1820                  &r[0], &r[1], &r[2], &r[3]);  /* outputs */
1821      break;
1822
1823   case TGSI_TEXTURE_3D:
1824   case TGSI_TEXTURE_CUBE:
1825      FETCH(&r[0], 0, CHAN_X);
1826      FETCH(&r[1], 0, CHAN_Y);
1827      FETCH(&r[2], 0, CHAN_Z);
1828
1829      if (modifier == TEX_MODIFIER_PROJECTED) {
1830         micro_div(&r[0], &r[0], &r[3]);
1831         micro_div(&r[1], &r[1], &r[3]);
1832         micro_div(&r[2], &r[2], &r[3]);
1833      }
1834
1835      fetch_texel(mach->Samplers[unit],
1836                  &r[0], &r[1], &r[2], lod,
1837                  control,
1838                  &r[0], &r[1], &r[2], &r[3]);
1839      break;
1840
1841   default:
1842      assert(0);
1843   }
1844
1845#if 0
1846   debug_printf("fetch r: %g %g %g %g\n",
1847         r[0].f[0], r[0].f[1], r[0].f[2], r[0].f[3]);
1848   debug_printf("fetch g: %g %g %g %g\n",
1849         r[1].f[0], r[1].f[1], r[1].f[2], r[1].f[3]);
1850   debug_printf("fetch b: %g %g %g %g\n",
1851         r[2].f[0], r[2].f[1], r[2].f[2], r[2].f[3]);
1852   debug_printf("fetch a: %g %g %g %g\n",
1853         r[3].f[0], r[3].f[1], r[3].f[2], r[3].f[3]);
1854#endif
1855
1856   for (chan = 0; chan < NUM_CHANNELS; chan++) {
1857      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
1858         store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
1859      }
1860   }
1861}
1862
1863static void
1864exec_txd(struct tgsi_exec_machine *mach,
1865         const struct tgsi_full_instruction *inst)
1866{
1867   const uint unit = inst->Src[3].Register.Index;
1868   union tgsi_exec_channel r[4];
1869   uint chan;
1870
1871   /*
1872    * XXX: This is fake TXD -- the derivatives are not taken into account, yet.
1873    */
1874
1875   switch (inst->Texture.Texture) {
1876   case TGSI_TEXTURE_1D:
1877   case TGSI_TEXTURE_SHADOW1D:
1878
1879      FETCH(&r[0], 0, CHAN_X);
1880
1881      fetch_texel(mach->Samplers[unit],
1882                  &r[0], &ZeroVec, &ZeroVec, &ZeroVec,   /* S, T, P, BIAS */
1883                  tgsi_sampler_lod_bias,
1884                  &r[0], &r[1], &r[2], &r[3]);           /* R, G, B, A */
1885      break;
1886
1887   case TGSI_TEXTURE_2D:
1888   case TGSI_TEXTURE_RECT:
1889   case TGSI_TEXTURE_SHADOW2D:
1890   case TGSI_TEXTURE_SHADOWRECT:
1891
1892      FETCH(&r[0], 0, CHAN_X);
1893      FETCH(&r[1], 0, CHAN_Y);
1894      FETCH(&r[2], 0, CHAN_Z);
1895
1896      fetch_texel(mach->Samplers[unit],
1897                  &r[0], &r[1], &r[2], &ZeroVec,   /* inputs */
1898                  tgsi_sampler_lod_bias,
1899                  &r[0], &r[1], &r[2], &r[3]);     /* outputs */
1900      break;
1901
1902   case TGSI_TEXTURE_3D:
1903   case TGSI_TEXTURE_CUBE:
1904
1905      FETCH(&r[0], 0, CHAN_X);
1906      FETCH(&r[1], 0, CHAN_Y);
1907      FETCH(&r[2], 0, CHAN_Z);
1908
1909      fetch_texel(mach->Samplers[unit],
1910                  &r[0], &r[1], &r[2], &ZeroVec,
1911                  tgsi_sampler_lod_bias,
1912                  &r[0], &r[1], &r[2], &r[3]);
1913      break;
1914
1915   default:
1916      assert(0);
1917   }
1918
1919   for (chan = 0; chan < NUM_CHANNELS; chan++) {
1920      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
1921         store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
1922      }
1923   }
1924}
1925
1926
1927static void
1928exec_txf(struct tgsi_exec_machine *mach,
1929	 const struct tgsi_full_instruction *inst)
1930{
1931   struct tgsi_sampler *sampler;
1932   const uint unit = inst->Src[1].Register.Index;
1933   union tgsi_exec_channel r[4];
1934   uint chan;
1935   float rgba[NUM_CHANNELS][QUAD_SIZE];
1936   int j;
1937
1938   IFETCH(&r[3], 0, CHAN_W);
1939
1940   switch(inst->Texture.Texture) {
1941   case TGSI_TEXTURE_3D:
1942   case TGSI_TEXTURE_2D_ARRAY:
1943      IFETCH(&r[2], 0, CHAN_Z);
1944      /* fallthrough */
1945   case TGSI_TEXTURE_2D:
1946   case TGSI_TEXTURE_RECT:
1947   case TGSI_TEXTURE_SHADOW2D:
1948   case TGSI_TEXTURE_SHADOWRECT:
1949   case TGSI_TEXTURE_1D_ARRAY:
1950      IFETCH(&r[1], 0, CHAN_Y);
1951      /* fallthrough */
1952   case TGSI_TEXTURE_1D:
1953   case TGSI_TEXTURE_SHADOW1D:
1954      IFETCH(&r[0], 0, CHAN_X);
1955      break;
1956   default:
1957      assert(0);
1958      break;
1959   }
1960
1961   sampler = mach->Samplers[unit];
1962   sampler->get_texel(sampler, r[0].i, r[1].i, r[2].i, r[3].i, rgba);
1963
1964   for (j = 0; j < QUAD_SIZE; j++) {
1965      r[0].f[j] = rgba[0][j];
1966      r[1].f[j] = rgba[1][j];
1967      r[2].f[j] = rgba[2][j];
1968      r[3].f[j] = rgba[3][j];
1969   }
1970
1971   for (chan = 0; chan < NUM_CHANNELS; chan++) {
1972      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
1973         store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
1974      }
1975   }
1976}
1977
1978static void
1979exec_txq(struct tgsi_exec_machine *mach,
1980         const struct tgsi_full_instruction *inst)
1981{
1982   struct tgsi_sampler *sampler;
1983   const uint unit = inst->Src[1].Register.Index;
1984   int result[4];
1985   union tgsi_exec_channel r[4], src;
1986   uint chan;
1987   int i,j;
1988
1989   fetch_source(mach, &src, &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_INT);
1990   sampler = mach->Samplers[unit];
1991
1992   sampler->get_dims(sampler, src.i[0], result);
1993
1994   for (i = 0; i < QUAD_SIZE; i++) {
1995      for (j = 0; j < 4; j++) {
1996	 r[j].i[i] = result[j];
1997      }
1998   }
1999
2000   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2001      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2002	 store_dest(mach, &r[chan], &inst->Dst[0], inst, chan,
2003		    TGSI_EXEC_DATA_INT);
2004      }
2005   }
2006}
2007
2008static void
2009exec_sample(struct tgsi_exec_machine *mach,
2010            const struct tgsi_full_instruction *inst,
2011            uint modifier)
2012{
2013   const uint resource_unit = inst->Src[1].Register.Index;
2014   const uint sampler_unit = inst->Src[2].Register.Index;
2015   union tgsi_exec_channel r[4];
2016   const union tgsi_exec_channel *lod = &ZeroVec;
2017   enum tgsi_sampler_control control;
2018   uint chan;
2019
2020   if (modifier != TEX_MODIFIER_NONE) {
2021      if (modifier == TEX_MODIFIER_LOD_BIAS)
2022         FETCH(&r[3], 3, CHAN_X);
2023      else /*TEX_MODIFIER_LOD*/
2024         FETCH(&r[3], 0, CHAN_W);
2025
2026      if (modifier != TEX_MODIFIER_PROJECTED) {
2027         lod = &r[3];
2028      }
2029   }
2030
2031   if (modifier == TEX_MODIFIER_EXPLICIT_LOD) {
2032      control = tgsi_sampler_lod_explicit;
2033   } else {
2034      control = tgsi_sampler_lod_bias;
2035   }
2036
2037   switch (mach->Resources[resource_unit].Resource) {
2038   case TGSI_TEXTURE_1D:
2039   case TGSI_TEXTURE_SHADOW1D:
2040      FETCH(&r[0], 0, CHAN_X);
2041
2042      if (modifier == TEX_MODIFIER_PROJECTED) {
2043         micro_div(&r[0], &r[0], &r[3]);
2044      }
2045
2046      fetch_texel(mach->Samplers[sampler_unit],
2047                  &r[0], &ZeroVec, &ZeroVec, lod,  /* S, T, P, LOD */
2048                  control,
2049                  &r[0], &r[1], &r[2], &r[3]);     /* R, G, B, A */
2050      break;
2051
2052   case TGSI_TEXTURE_2D:
2053   case TGSI_TEXTURE_RECT:
2054   case TGSI_TEXTURE_SHADOW2D:
2055   case TGSI_TEXTURE_SHADOWRECT:
2056      FETCH(&r[0], 0, CHAN_X);
2057      FETCH(&r[1], 0, CHAN_Y);
2058      FETCH(&r[2], 0, CHAN_Z);
2059
2060      if (modifier == TEX_MODIFIER_PROJECTED) {
2061         micro_div(&r[0], &r[0], &r[3]);
2062         micro_div(&r[1], &r[1], &r[3]);
2063         micro_div(&r[2], &r[2], &r[3]);
2064      }
2065
2066      fetch_texel(mach->Samplers[sampler_unit],
2067                  &r[0], &r[1], &r[2], lod,     /* S, T, P, LOD */
2068                  control,
2069                  &r[0], &r[1], &r[2], &r[3]);  /* outputs */
2070      break;
2071
2072   case TGSI_TEXTURE_3D:
2073   case TGSI_TEXTURE_CUBE:
2074      FETCH(&r[0], 0, CHAN_X);
2075      FETCH(&r[1], 0, CHAN_Y);
2076      FETCH(&r[2], 0, CHAN_Z);
2077
2078      if (modifier == TEX_MODIFIER_PROJECTED) {
2079         micro_div(&r[0], &r[0], &r[3]);
2080         micro_div(&r[1], &r[1], &r[3]);
2081         micro_div(&r[2], &r[2], &r[3]);
2082      }
2083
2084      fetch_texel(mach->Samplers[sampler_unit],
2085                  &r[0], &r[1], &r[2], lod,
2086                  control,
2087                  &r[0], &r[1], &r[2], &r[3]);
2088      break;
2089
2090   default:
2091      assert(0);
2092   }
2093
2094   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2095      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2096         store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2097      }
2098   }
2099}
2100
2101static void
2102exec_sample_d(struct tgsi_exec_machine *mach,
2103              const struct tgsi_full_instruction *inst)
2104{
2105   const uint resource_unit = inst->Src[1].Register.Index;
2106   const uint sampler_unit = inst->Src[2].Register.Index;
2107   union tgsi_exec_channel r[4];
2108   uint chan;
2109   /*
2110    * XXX: This is fake SAMPLE_D -- the derivatives are not taken into account, yet.
2111    */
2112
2113   switch (mach->Resources[resource_unit].Resource) {
2114   case TGSI_TEXTURE_1D:
2115   case TGSI_TEXTURE_SHADOW1D:
2116
2117      FETCH(&r[0], 0, CHAN_X);
2118
2119      fetch_texel(mach->Samplers[sampler_unit],
2120                  &r[0], &ZeroVec, &ZeroVec, &ZeroVec,   /* S, T, P, BIAS */
2121                  tgsi_sampler_lod_bias,
2122                  &r[0], &r[1], &r[2], &r[3]);           /* R, G, B, A */
2123      break;
2124
2125   case TGSI_TEXTURE_2D:
2126   case TGSI_TEXTURE_RECT:
2127   case TGSI_TEXTURE_SHADOW2D:
2128   case TGSI_TEXTURE_SHADOWRECT:
2129
2130      FETCH(&r[0], 0, CHAN_X);
2131      FETCH(&r[1], 0, CHAN_Y);
2132      FETCH(&r[2], 0, CHAN_Z);
2133
2134      fetch_texel(mach->Samplers[sampler_unit],
2135                  &r[0], &r[1], &r[2], &ZeroVec,   /* inputs */
2136                  tgsi_sampler_lod_bias,
2137                  &r[0], &r[1], &r[2], &r[3]);     /* outputs */
2138      break;
2139
2140   case TGSI_TEXTURE_3D:
2141   case TGSI_TEXTURE_CUBE:
2142
2143      FETCH(&r[0], 0, CHAN_X);
2144      FETCH(&r[1], 0, CHAN_Y);
2145      FETCH(&r[2], 0, CHAN_Z);
2146
2147      fetch_texel(mach->Samplers[sampler_unit],
2148                  &r[0], &r[1], &r[2], &ZeroVec,
2149                  tgsi_sampler_lod_bias,
2150                  &r[0], &r[1], &r[2], &r[3]);
2151      break;
2152
2153   default:
2154      assert(0);
2155   }
2156
2157   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2158      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2159         store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2160      }
2161   }
2162}
2163
2164
2165/**
2166 * Evaluate a constant-valued coefficient at the position of the
2167 * current quad.
2168 */
2169static void
2170eval_constant_coef(
2171   struct tgsi_exec_machine *mach,
2172   unsigned attrib,
2173   unsigned chan )
2174{
2175   unsigned i;
2176
2177   for( i = 0; i < QUAD_SIZE; i++ ) {
2178      mach->Inputs[attrib].xyzw[chan].f[i] = mach->InterpCoefs[attrib].a0[chan];
2179   }
2180}
2181
2182/**
2183 * Evaluate a linear-valued coefficient at the position of the
2184 * current quad.
2185 */
2186static void
2187eval_linear_coef(
2188   struct tgsi_exec_machine *mach,
2189   unsigned attrib,
2190   unsigned chan )
2191{
2192   const float x = mach->QuadPos.xyzw[0].f[0];
2193   const float y = mach->QuadPos.xyzw[1].f[0];
2194   const float dadx = mach->InterpCoefs[attrib].dadx[chan];
2195   const float dady = mach->InterpCoefs[attrib].dady[chan];
2196   const float a0 = mach->InterpCoefs[attrib].a0[chan] + dadx * x + dady * y;
2197   mach->Inputs[attrib].xyzw[chan].f[0] = a0;
2198   mach->Inputs[attrib].xyzw[chan].f[1] = a0 + dadx;
2199   mach->Inputs[attrib].xyzw[chan].f[2] = a0 + dady;
2200   mach->Inputs[attrib].xyzw[chan].f[3] = a0 + dadx + dady;
2201}
2202
2203/**
2204 * Evaluate a perspective-valued coefficient at the position of the
2205 * current quad.
2206 */
2207static void
2208eval_perspective_coef(
2209   struct tgsi_exec_machine *mach,
2210   unsigned attrib,
2211   unsigned chan )
2212{
2213   const float x = mach->QuadPos.xyzw[0].f[0];
2214   const float y = mach->QuadPos.xyzw[1].f[0];
2215   const float dadx = mach->InterpCoefs[attrib].dadx[chan];
2216   const float dady = mach->InterpCoefs[attrib].dady[chan];
2217   const float a0 = mach->InterpCoefs[attrib].a0[chan] + dadx * x + dady * y;
2218   const float *w = mach->QuadPos.xyzw[3].f;
2219   /* divide by W here */
2220   mach->Inputs[attrib].xyzw[chan].f[0] = a0 / w[0];
2221   mach->Inputs[attrib].xyzw[chan].f[1] = (a0 + dadx) / w[1];
2222   mach->Inputs[attrib].xyzw[chan].f[2] = (a0 + dady) / w[2];
2223   mach->Inputs[attrib].xyzw[chan].f[3] = (a0 + dadx + dady) / w[3];
2224}
2225
2226
/**
 * Signature shared by the eval_*_coef() routines: evaluate one channel
 * of one input attribute for the current quad.
 */
typedef void (*eval_coef_func)(struct tgsi_exec_machine *mach,
                               unsigned attrib,
                               unsigned chan);
2231
2232static void
2233exec_declaration(struct tgsi_exec_machine *mach,
2234                 const struct tgsi_full_declaration *decl)
2235{
2236   if (decl->Declaration.File == TGSI_FILE_RESOURCE) {
2237      mach->Resources[decl->Range.First] = decl->Resource;
2238      return;
2239   }
2240
2241   if (mach->Processor == TGSI_PROCESSOR_FRAGMENT) {
2242      if (decl->Declaration.File == TGSI_FILE_INPUT) {
2243         uint first, last, mask;
2244
2245         first = decl->Range.First;
2246         last = decl->Range.Last;
2247         mask = decl->Declaration.UsageMask;
2248
2249         /* XXX we could remove this special-case code since
2250          * mach->InterpCoefs[first].a0 should already have the
2251          * front/back-face value.  But we should first update the
2252          * ureg code to emit the right UsageMask value (WRITEMASK_X).
2253          * Then, we could remove the tgsi_exec_machine::Face field.
2254          */
2255         /* XXX make FACE a system value */
2256         if (decl->Semantic.Name == TGSI_SEMANTIC_FACE) {
2257            uint i;
2258
2259            assert(decl->Semantic.Index == 0);
2260            assert(first == last);
2261
2262            for (i = 0; i < QUAD_SIZE; i++) {
2263               mach->Inputs[first].xyzw[0].f[i] = mach->Face;
2264            }
2265         } else {
2266            eval_coef_func eval;
2267            uint i, j;
2268
2269            switch (decl->Declaration.Interpolate) {
2270            case TGSI_INTERPOLATE_CONSTANT:
2271               eval = eval_constant_coef;
2272               break;
2273
2274            case TGSI_INTERPOLATE_LINEAR:
2275               eval = eval_linear_coef;
2276               break;
2277
2278            case TGSI_INTERPOLATE_PERSPECTIVE:
2279               eval = eval_perspective_coef;
2280               break;
2281
2282            default:
2283               assert(0);
2284               return;
2285            }
2286
2287            for (j = 0; j < NUM_CHANNELS; j++) {
2288               if (mask & (1 << j)) {
2289                  for (i = first; i <= last; i++) {
2290                     eval(mach, i, j);
2291                  }
2292               }
2293            }
2294         }
2295      }
2296   }
2297
2298   if (decl->Declaration.File == TGSI_FILE_SYSTEM_VALUE) {
2299      mach->SysSemanticToIndex[decl->Declaration.Semantic] = decl->Range.First;
2300   }
2301}
2302
2303
/** Micro-operation that takes no sources and only produces a result. */
typedef void (*micro_op)(union tgsi_exec_channel *dst);
2305
2306static void
2307exec_vector(struct tgsi_exec_machine *mach,
2308            const struct tgsi_full_instruction *inst,
2309            micro_op op,
2310            enum tgsi_exec_datatype dst_datatype)
2311{
2312   unsigned int chan;
2313
2314   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2315      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2316         union tgsi_exec_channel dst;
2317
2318         op(&dst);
2319         store_dest(mach, &dst, &inst->Dst[0], inst, chan, dst_datatype);
2320      }
2321   }
2322}
2323
/** Micro-operation that computes \p dst from a single source channel. */
typedef void (*micro_unary_op)(union tgsi_exec_channel *dst,
                               const union tgsi_exec_channel *src);
2326
2327static void
2328exec_scalar_unary(struct tgsi_exec_machine *mach,
2329                  const struct tgsi_full_instruction *inst,
2330                  micro_unary_op op,
2331                  enum tgsi_exec_datatype dst_datatype,
2332                  enum tgsi_exec_datatype src_datatype)
2333{
2334   unsigned int chan;
2335   union tgsi_exec_channel src;
2336   union tgsi_exec_channel dst;
2337
2338   fetch_source(mach, &src, &inst->Src[0], CHAN_X, src_datatype);
2339   op(&dst, &src);
2340   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2341      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2342         store_dest(mach, &dst, &inst->Dst[0], inst, chan, dst_datatype);
2343      }
2344   }
2345}
2346
2347static void
2348exec_vector_unary(struct tgsi_exec_machine *mach,
2349                  const struct tgsi_full_instruction *inst,
2350                  micro_unary_op op,
2351                  enum tgsi_exec_datatype dst_datatype,
2352                  enum tgsi_exec_datatype src_datatype)
2353{
2354   unsigned int chan;
2355   struct tgsi_exec_vector dst;
2356
2357   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2358      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2359         union tgsi_exec_channel src;
2360
2361         fetch_source(mach, &src, &inst->Src[0], chan, src_datatype);
2362         op(&dst.xyzw[chan], &src);
2363      }
2364   }
2365   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2366      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2367         store_dest(mach, &dst.xyzw[chan], &inst->Dst[0], inst, chan, dst_datatype);
2368      }
2369   }
2370}
2371
/** Micro-operation that computes \p dst from two source channels. */
typedef void (*micro_binary_op)(union tgsi_exec_channel *dst,
                                const union tgsi_exec_channel *src0,
                                const union tgsi_exec_channel *src1);
2375
2376static void
2377exec_scalar_binary(struct tgsi_exec_machine *mach,
2378                   const struct tgsi_full_instruction *inst,
2379                   micro_binary_op op,
2380                   enum tgsi_exec_datatype dst_datatype,
2381                   enum tgsi_exec_datatype src_datatype)
2382{
2383   unsigned int chan;
2384   union tgsi_exec_channel src[2];
2385   union tgsi_exec_channel dst;
2386
2387   fetch_source(mach, &src[0], &inst->Src[0], CHAN_X, src_datatype);
2388   fetch_source(mach, &src[1], &inst->Src[1], CHAN_Y, src_datatype);
2389   op(&dst, &src[0], &src[1]);
2390   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2391      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2392         store_dest(mach, &dst, &inst->Dst[0], inst, chan, dst_datatype);
2393      }
2394   }
2395}
2396
2397static void
2398exec_vector_binary(struct tgsi_exec_machine *mach,
2399                   const struct tgsi_full_instruction *inst,
2400                   micro_binary_op op,
2401                   enum tgsi_exec_datatype dst_datatype,
2402                   enum tgsi_exec_datatype src_datatype)
2403{
2404   unsigned int chan;
2405   struct tgsi_exec_vector dst;
2406
2407   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2408      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2409         union tgsi_exec_channel src[2];
2410
2411         fetch_source(mach, &src[0], &inst->Src[0], chan, src_datatype);
2412         fetch_source(mach, &src[1], &inst->Src[1], chan, src_datatype);
2413         op(&dst.xyzw[chan], &src[0], &src[1]);
2414      }
2415   }
2416   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2417      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2418         store_dest(mach, &dst.xyzw[chan], &inst->Dst[0], inst, chan, dst_datatype);
2419      }
2420   }
2421}
2422
/** Micro-operation that computes \p dst from three source channels. */
typedef void (*micro_trinary_op)(union tgsi_exec_channel *dst,
                                 const union tgsi_exec_channel *src0,
                                 const union tgsi_exec_channel *src1,
                                 const union tgsi_exec_channel *src2);
2427
2428static void
2429exec_vector_trinary(struct tgsi_exec_machine *mach,
2430                    const struct tgsi_full_instruction *inst,
2431                    micro_trinary_op op,
2432                    enum tgsi_exec_datatype dst_datatype,
2433                    enum tgsi_exec_datatype src_datatype)
2434{
2435   unsigned int chan;
2436   struct tgsi_exec_vector dst;
2437
2438   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2439      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2440         union tgsi_exec_channel src[3];
2441
2442         fetch_source(mach, &src[0], &inst->Src[0], chan, src_datatype);
2443         fetch_source(mach, &src[1], &inst->Src[1], chan, src_datatype);
2444         fetch_source(mach, &src[2], &inst->Src[2], chan, src_datatype);
2445         op(&dst.xyzw[chan], &src[0], &src[1], &src[2]);
2446      }
2447   }
2448   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2449      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2450         store_dest(mach, &dst.xyzw[chan], &inst->Dst[0], inst, chan, dst_datatype);
2451      }
2452   }
2453}
2454
2455static void
2456exec_dp3(struct tgsi_exec_machine *mach,
2457         const struct tgsi_full_instruction *inst)
2458{
2459   unsigned int chan;
2460   union tgsi_exec_channel arg[3];
2461
2462   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2463   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2464   micro_mul(&arg[2], &arg[0], &arg[1]);
2465
2466   for (chan = CHAN_Y; chan <= CHAN_Z; chan++) {
2467      fetch_source(mach, &arg[0], &inst->Src[0], chan, TGSI_EXEC_DATA_FLOAT);
2468      fetch_source(mach, &arg[1], &inst->Src[1], chan, TGSI_EXEC_DATA_FLOAT);
2469      micro_mad(&arg[2], &arg[0], &arg[1], &arg[2]);
2470   }
2471
2472   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2473      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2474         store_dest(mach, &arg[2], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2475      }
2476   }
2477}
2478
2479static void
2480exec_dp4(struct tgsi_exec_machine *mach,
2481         const struct tgsi_full_instruction *inst)
2482{
2483   unsigned int chan;
2484   union tgsi_exec_channel arg[3];
2485
2486   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2487   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2488   micro_mul(&arg[2], &arg[0], &arg[1]);
2489
2490   for (chan = CHAN_Y; chan <= CHAN_W; chan++) {
2491      fetch_source(mach, &arg[0], &inst->Src[0], chan, TGSI_EXEC_DATA_FLOAT);
2492      fetch_source(mach, &arg[1], &inst->Src[1], chan, TGSI_EXEC_DATA_FLOAT);
2493      micro_mad(&arg[2], &arg[0], &arg[1], &arg[2]);
2494   }
2495
2496   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2497      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2498         store_dest(mach, &arg[2], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2499      }
2500   }
2501}
2502
2503static void
2504exec_dp2a(struct tgsi_exec_machine *mach,
2505          const struct tgsi_full_instruction *inst)
2506{
2507   unsigned int chan;
2508   union tgsi_exec_channel arg[3];
2509
2510   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2511   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2512   micro_mul(&arg[2], &arg[0], &arg[1]);
2513
2514   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2515   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2516   micro_mad(&arg[0], &arg[0], &arg[1], &arg[2]);
2517
2518   fetch_source(mach, &arg[1], &inst->Src[2], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2519   micro_add(&arg[0], &arg[0], &arg[1]);
2520
2521   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2522      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2523         store_dest(mach, &arg[0], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2524      }
2525   }
2526}
2527
2528static void
2529exec_dph(struct tgsi_exec_machine *mach,
2530         const struct tgsi_full_instruction *inst)
2531{
2532   unsigned int chan;
2533   union tgsi_exec_channel arg[3];
2534
2535   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2536   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2537   micro_mul(&arg[2], &arg[0], &arg[1]);
2538
2539   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2540   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2541   micro_mad(&arg[2], &arg[0], &arg[1], &arg[2]);
2542
2543   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2544   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2545   micro_mad(&arg[0], &arg[0], &arg[1], &arg[2]);
2546
2547   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_W, TGSI_EXEC_DATA_FLOAT);
2548   micro_add(&arg[0], &arg[0], &arg[1]);
2549
2550   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2551      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2552         store_dest(mach, &arg[0], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2553      }
2554   }
2555}
2556
2557static void
2558exec_dp2(struct tgsi_exec_machine *mach,
2559         const struct tgsi_full_instruction *inst)
2560{
2561   unsigned int chan;
2562   union tgsi_exec_channel arg[3];
2563
2564   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2565   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2566   micro_mul(&arg[2], &arg[0], &arg[1]);
2567
2568   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2569   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2570   micro_mad(&arg[2], &arg[0], &arg[1], &arg[2]);
2571
2572   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2573      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2574         store_dest(mach, &arg[2], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2575      }
2576   }
2577}
2578
2579static void
2580exec_nrm4(struct tgsi_exec_machine *mach,
2581          const struct tgsi_full_instruction *inst)
2582{
2583   unsigned int chan;
2584   union tgsi_exec_channel arg[4];
2585   union tgsi_exec_channel scale;
2586
2587   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2588   micro_mul(&scale, &arg[0], &arg[0]);
2589
2590   for (chan = CHAN_Y; chan <= CHAN_W; chan++) {
2591      union tgsi_exec_channel product;
2592
2593      fetch_source(mach, &arg[chan], &inst->Src[0], chan, TGSI_EXEC_DATA_FLOAT);
2594      micro_mul(&product, &arg[chan], &arg[chan]);
2595      micro_add(&scale, &scale, &product);
2596   }
2597
2598   micro_rsq(&scale, &scale);
2599
2600   for (chan = CHAN_X; chan <= CHAN_W; chan++) {
2601      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2602         micro_mul(&arg[chan], &arg[chan], &scale);
2603         store_dest(mach, &arg[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2604      }
2605   }
2606}
2607
2608static void
2609exec_nrm3(struct tgsi_exec_machine *mach,
2610          const struct tgsi_full_instruction *inst)
2611{
2612   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_XYZ) {
2613      unsigned int chan;
2614      union tgsi_exec_channel arg[3];
2615      union tgsi_exec_channel scale;
2616
2617      fetch_source(mach, &arg[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2618      micro_mul(&scale, &arg[0], &arg[0]);
2619
2620      for (chan = CHAN_Y; chan <= CHAN_Z; chan++) {
2621         union tgsi_exec_channel product;
2622
2623         fetch_source(mach, &arg[chan], &inst->Src[0], chan, TGSI_EXEC_DATA_FLOAT);
2624         micro_mul(&product, &arg[chan], &arg[chan]);
2625         micro_add(&scale, &scale, &product);
2626      }
2627
2628      micro_rsq(&scale, &scale);
2629
2630      for (chan = CHAN_X; chan <= CHAN_Z; chan++) {
2631         if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2632            micro_mul(&arg[chan], &arg[chan], &scale);
2633            store_dest(mach, &arg[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2634         }
2635      }
2636   }
2637
2638   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
2639      store_dest(mach, &OneVec, &inst->Dst[0], inst, CHAN_W, TGSI_EXEC_DATA_FLOAT);
2640   }
2641}
2642
2643static void
2644exec_scs(struct tgsi_exec_machine *mach,
2645         const struct tgsi_full_instruction *inst)
2646{
2647   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_XY) {
2648      union tgsi_exec_channel arg;
2649      union tgsi_exec_channel result;
2650
2651      fetch_source(mach, &arg, &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2652
2653      if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
2654         micro_cos(&result, &arg);
2655         store_dest(mach, &result, &inst->Dst[0], inst, CHAN_X, TGSI_EXEC_DATA_FLOAT);
2656      }
2657      if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
2658         micro_sin(&result, &arg);
2659         store_dest(mach, &result, &inst->Dst[0], inst, CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2660      }
2661   }
2662   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
2663      store_dest(mach, &ZeroVec, &inst->Dst[0], inst, CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2664   }
2665   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
2666      store_dest(mach, &OneVec, &inst->Dst[0], inst, CHAN_W, TGSI_EXEC_DATA_FLOAT);
2667   }
2668}
2669
2670static void
2671exec_x2d(struct tgsi_exec_machine *mach,
2672         const struct tgsi_full_instruction *inst)
2673{
2674   union tgsi_exec_channel r[4];
2675   union tgsi_exec_channel d[2];
2676
2677   fetch_source(mach, &r[0], &inst->Src[1], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2678   fetch_source(mach, &r[1], &inst->Src[1], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2679   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_XZ) {
2680      fetch_source(mach, &r[2], &inst->Src[2], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2681      micro_mul(&r[2], &r[2], &r[0]);
2682      fetch_source(mach, &r[3], &inst->Src[2], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2683      micro_mul(&r[3], &r[3], &r[1]);
2684      micro_add(&r[2], &r[2], &r[3]);
2685      fetch_source(mach, &r[3], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2686      micro_add(&d[0], &r[2], &r[3]);
2687   }
2688   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_YW) {
2689      fetch_source(mach, &r[2], &inst->Src[2], CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2690      micro_mul(&r[2], &r[2], &r[0]);
2691      fetch_source(mach, &r[3], &inst->Src[2], CHAN_W, TGSI_EXEC_DATA_FLOAT);
2692      micro_mul(&r[3], &r[3], &r[1]);
2693      micro_add(&r[2], &r[2], &r[3]);
2694      fetch_source(mach, &r[3], &inst->Src[0], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2695      micro_add(&d[1], &r[2], &r[3]);
2696   }
2697   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
2698      store_dest(mach, &d[0], &inst->Dst[0], inst, CHAN_X, TGSI_EXEC_DATA_FLOAT);
2699   }
2700   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
2701      store_dest(mach, &d[1], &inst->Dst[0], inst, CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2702   }
2703   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
2704      store_dest(mach, &d[0], &inst->Dst[0], inst, CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2705   }
2706   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
2707      store_dest(mach, &d[1], &inst->Dst[0], inst, CHAN_W, TGSI_EXEC_DATA_FLOAT);
2708   }
2709}
2710
2711static void
2712exec_rfl(struct tgsi_exec_machine *mach,
2713         const struct tgsi_full_instruction *inst)
2714{
2715   union tgsi_exec_channel r[9];
2716
2717   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_XYZ) {
2718      /* r0 = dp3(src0, src0) */
2719      fetch_source(mach, &r[2], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2720      micro_mul(&r[0], &r[2], &r[2]);
2721      fetch_source(mach, &r[4], &inst->Src[0], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2722      micro_mul(&r[8], &r[4], &r[4]);
2723      micro_add(&r[0], &r[0], &r[8]);
2724      fetch_source(mach, &r[6], &inst->Src[0], CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2725      micro_mul(&r[8], &r[6], &r[6]);
2726      micro_add(&r[0], &r[0], &r[8]);
2727
2728      /* r1 = dp3(src0, src1) */
2729      fetch_source(mach, &r[3], &inst->Src[1], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2730      micro_mul(&r[1], &r[2], &r[3]);
2731      fetch_source(mach, &r[5], &inst->Src[1], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2732      micro_mul(&r[8], &r[4], &r[5]);
2733      micro_add(&r[1], &r[1], &r[8]);
2734      fetch_source(mach, &r[7], &inst->Src[1], CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2735      micro_mul(&r[8], &r[6], &r[7]);
2736      micro_add(&r[1], &r[1], &r[8]);
2737
2738      /* r1 = 2 * r1 / r0 */
2739      micro_add(&r[1], &r[1], &r[1]);
2740      micro_div(&r[1], &r[1], &r[0]);
2741
2742      if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
2743         micro_mul(&r[2], &r[2], &r[1]);
2744         micro_sub(&r[2], &r[2], &r[3]);
2745         store_dest(mach, &r[2], &inst->Dst[0], inst, CHAN_X, TGSI_EXEC_DATA_FLOAT);
2746      }
2747      if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
2748         micro_mul(&r[4], &r[4], &r[1]);
2749         micro_sub(&r[4], &r[4], &r[5]);
2750         store_dest(mach, &r[4], &inst->Dst[0], inst, CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2751      }
2752      if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
2753         micro_mul(&r[6], &r[6], &r[1]);
2754         micro_sub(&r[6], &r[6], &r[7]);
2755         store_dest(mach, &r[6], &inst->Dst[0], inst, CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2756      }
2757   }
2758   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
2759      store_dest(mach, &OneVec, &inst->Dst[0], inst, CHAN_W, TGSI_EXEC_DATA_FLOAT);
2760   }
2761}
2762
2763static void
2764exec_xpd(struct tgsi_exec_machine *mach,
2765         const struct tgsi_full_instruction *inst)
2766{
2767   union tgsi_exec_channel r[6];
2768   union tgsi_exec_channel d[3];
2769
2770   fetch_source(mach, &r[0], &inst->Src[0], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2771   fetch_source(mach, &r[1], &inst->Src[1], CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2772
2773   micro_mul(&r[2], &r[0], &r[1]);
2774
2775   fetch_source(mach, &r[3], &inst->Src[0], CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2776   fetch_source(mach, &r[4], &inst->Src[1], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2777
2778   micro_mul(&r[5], &r[3], &r[4] );
2779   micro_sub(&d[CHAN_X], &r[2], &r[5]);
2780
2781   fetch_source(mach, &r[2], &inst->Src[1], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2782
2783   micro_mul(&r[3], &r[3], &r[2]);
2784
2785   fetch_source(mach, &r[5], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2786
2787   micro_mul(&r[1], &r[1], &r[5]);
2788   micro_sub(&d[CHAN_Y], &r[3], &r[1]);
2789
2790   micro_mul(&r[5], &r[5], &r[4]);
2791   micro_mul(&r[0], &r[0], &r[2]);
2792   micro_sub(&d[CHAN_Z], &r[5], &r[0]);
2793
2794   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
2795      store_dest(mach, &d[CHAN_X], &inst->Dst[0], inst, CHAN_X, TGSI_EXEC_DATA_FLOAT);
2796   }
2797   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
2798      store_dest(mach, &d[CHAN_Y], &inst->Dst[0], inst, CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2799   }
2800   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
2801      store_dest(mach, &d[CHAN_Z], &inst->Dst[0], inst, CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2802   }
2803   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
2804      store_dest(mach, &OneVec, &inst->Dst[0], inst, CHAN_W, TGSI_EXEC_DATA_FLOAT);
2805   }
2806}
2807
2808static void
2809exec_dst(struct tgsi_exec_machine *mach,
2810         const struct tgsi_full_instruction *inst)
2811{
2812   union tgsi_exec_channel r[2];
2813   union tgsi_exec_channel d[4];
2814
2815   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
2816      fetch_source(mach, &r[0], &inst->Src[0], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2817      fetch_source(mach, &r[1], &inst->Src[1], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2818      micro_mul(&d[CHAN_Y], &r[0], &r[1]);
2819   }
2820   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
2821      fetch_source(mach, &d[CHAN_Z], &inst->Src[0], CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2822   }
2823   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
2824      fetch_source(mach, &d[CHAN_W], &inst->Src[1], CHAN_W, TGSI_EXEC_DATA_FLOAT);
2825   }
2826
2827   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
2828      store_dest(mach, &OneVec, &inst->Dst[0], inst, CHAN_X, TGSI_EXEC_DATA_FLOAT);
2829   }
2830   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
2831      store_dest(mach, &d[CHAN_Y], &inst->Dst[0], inst, CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2832   }
2833   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
2834      store_dest(mach, &d[CHAN_Z], &inst->Dst[0], inst, CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2835   }
2836   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
2837      store_dest(mach, &d[CHAN_W], &inst->Dst[0], inst, CHAN_W, TGSI_EXEC_DATA_FLOAT);
2838   }
2839}
2840
2841static void
2842exec_log(struct tgsi_exec_machine *mach,
2843         const struct tgsi_full_instruction *inst)
2844{
2845   union tgsi_exec_channel r[3];
2846
2847   fetch_source(mach, &r[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2848   micro_abs(&r[2], &r[0]);  /* r2 = abs(r0) */
2849   micro_lg2(&r[1], &r[2]);  /* r1 = lg2(r2) */
2850   micro_flr(&r[0], &r[1]);  /* r0 = floor(r1) */
2851   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
2852      store_dest(mach, &r[0], &inst->Dst[0], inst, CHAN_X, TGSI_EXEC_DATA_FLOAT);
2853   }
2854   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
2855      micro_exp2(&r[0], &r[0]);       /* r0 = 2 ^ r0 */
2856      micro_div(&r[0], &r[2], &r[0]); /* r0 = r2 / r0 */
2857      store_dest(mach, &r[0], &inst->Dst[0], inst, CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2858   }
2859   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
2860      store_dest(mach, &r[1], &inst->Dst[0], inst, CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2861   }
2862   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
2863      store_dest(mach, &OneVec, &inst->Dst[0], inst, CHAN_W, TGSI_EXEC_DATA_FLOAT);
2864   }
2865}
2866
2867static void
2868exec_exp(struct tgsi_exec_machine *mach,
2869         const struct tgsi_full_instruction *inst)
2870{
2871   union tgsi_exec_channel r[3];
2872
2873   fetch_source(mach, &r[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2874   micro_flr(&r[1], &r[0]);  /* r1 = floor(r0) */
2875   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
2876      micro_exp2(&r[2], &r[1]);       /* r2 = 2 ^ r1 */
2877      store_dest(mach, &r[2], &inst->Dst[0], inst, CHAN_X, TGSI_EXEC_DATA_FLOAT);
2878   }
2879   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
2880      micro_sub(&r[2], &r[0], &r[1]); /* r2 = r0 - r1 */
2881      store_dest(mach, &r[2], &inst->Dst[0], inst, CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2882   }
2883   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
2884      micro_exp2(&r[2], &r[0]);       /* r2 = 2 ^ r0 */
2885      store_dest(mach, &r[2], &inst->Dst[0], inst, CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2886   }
2887   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
2888      store_dest(mach, &OneVec, &inst->Dst[0], inst, CHAN_W, TGSI_EXEC_DATA_FLOAT);
2889   }
2890}
2891
2892static void
2893exec_lit(struct tgsi_exec_machine *mach,
2894         const struct tgsi_full_instruction *inst)
2895{
2896   union tgsi_exec_channel r[3];
2897   union tgsi_exec_channel d[3];
2898
2899   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
2900      store_dest(mach, &OneVec, &inst->Dst[0], inst, CHAN_X, TGSI_EXEC_DATA_FLOAT);
2901   }
2902   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_YZ) {
2903      fetch_source(mach, &r[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2904      if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
2905         micro_max(&d[CHAN_Y], &r[0], &ZeroVec);
2906         store_dest(mach, &d[CHAN_Y], &inst->Dst[0], inst, CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2907      }
2908
2909      if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
2910         fetch_source(mach, &r[1], &inst->Src[0], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2911         micro_max(&r[1], &r[1], &ZeroVec);
2912
2913         fetch_source(mach, &r[2], &inst->Src[0], CHAN_W, TGSI_EXEC_DATA_FLOAT);
2914         micro_min(&r[2], &r[2], &P128Vec);
2915         micro_max(&r[2], &r[2], &M128Vec);
2916         micro_pow(&r[1], &r[1], &r[2]);
2917         micro_lt(&d[CHAN_Z], &ZeroVec, &r[0], &r[1], &ZeroVec);
2918         store_dest(mach, &d[CHAN_Z], &inst->Dst[0], inst, CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2919      }
2920   }
2921   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
2922      store_dest(mach, &OneVec, &inst->Dst[0], inst, CHAN_W, TGSI_EXEC_DATA_FLOAT);
2923   }
2924}
2925
2926static void
2927exec_break(struct tgsi_exec_machine *mach)
2928{
2929   if (mach->BreakType == TGSI_EXEC_BREAK_INSIDE_LOOP) {
2930      /* turn off loop channels for each enabled exec channel */
2931      mach->LoopMask &= ~mach->ExecMask;
2932      /* Todo: if mach->LoopMask == 0, jump to end of loop */
2933      UPDATE_EXEC_MASK(mach);
2934   } else {
2935      assert(mach->BreakType == TGSI_EXEC_BREAK_INSIDE_SWITCH);
2936
2937      mach->Switch.mask = 0x0;
2938
2939      UPDATE_EXEC_MASK(mach);
2940   }
2941}
2942
2943static void
2944exec_switch(struct tgsi_exec_machine *mach,
2945            const struct tgsi_full_instruction *inst)
2946{
2947   assert(mach->SwitchStackTop < TGSI_EXEC_MAX_SWITCH_NESTING);
2948   assert(mach->BreakStackTop < TGSI_EXEC_MAX_BREAK_STACK);
2949
2950   mach->SwitchStack[mach->SwitchStackTop++] = mach->Switch;
2951   fetch_source(mach, &mach->Switch.selector, &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_UINT);
2952   mach->Switch.mask = 0x0;
2953   mach->Switch.defaultMask = 0x0;
2954
2955   mach->BreakStack[mach->BreakStackTop++] = mach->BreakType;
2956   mach->BreakType = TGSI_EXEC_BREAK_INSIDE_SWITCH;
2957
2958   UPDATE_EXEC_MASK(mach);
2959}
2960
2961static void
2962exec_case(struct tgsi_exec_machine *mach,
2963          const struct tgsi_full_instruction *inst)
2964{
2965   uint prevMask = mach->SwitchStack[mach->SwitchStackTop - 1].mask;
2966   union tgsi_exec_channel src;
2967   uint mask = 0;
2968
2969   fetch_source(mach, &src, &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_UINT);
2970
2971   if (mach->Switch.selector.u[0] == src.u[0]) {
2972      mask |= 0x1;
2973   }
2974   if (mach->Switch.selector.u[1] == src.u[1]) {
2975      mask |= 0x2;
2976   }
2977   if (mach->Switch.selector.u[2] == src.u[2]) {
2978      mask |= 0x4;
2979   }
2980   if (mach->Switch.selector.u[3] == src.u[3]) {
2981      mask |= 0x8;
2982   }
2983
2984   mach->Switch.defaultMask |= mask;
2985
2986   mach->Switch.mask |= mask & prevMask;
2987
2988   UPDATE_EXEC_MASK(mach);
2989}
2990
2991static void
2992exec_default(struct tgsi_exec_machine *mach)
2993{
2994   uint prevMask = mach->SwitchStack[mach->SwitchStackTop - 1].mask;
2995
2996   mach->Switch.mask |= ~mach->Switch.defaultMask & prevMask;
2997
2998   UPDATE_EXEC_MASK(mach);
2999}
3000
3001static void
3002exec_endswitch(struct tgsi_exec_machine *mach)
3003{
3004   mach->Switch = mach->SwitchStack[--mach->SwitchStackTop];
3005   mach->BreakType = mach->BreakStack[--mach->BreakStackTop];
3006
3007   UPDATE_EXEC_MASK(mach);
3008}
3009
3010static void
3011micro_i2f(union tgsi_exec_channel *dst,
3012          const union tgsi_exec_channel *src)
3013{
3014   dst->f[0] = (float)src->i[0];
3015   dst->f[1] = (float)src->i[1];
3016   dst->f[2] = (float)src->i[2];
3017   dst->f[3] = (float)src->i[3];
3018}
3019
3020static void
3021micro_not(union tgsi_exec_channel *dst,
3022          const union tgsi_exec_channel *src)
3023{
3024   dst->u[0] = ~src->u[0];
3025   dst->u[1] = ~src->u[1];
3026   dst->u[2] = ~src->u[2];
3027   dst->u[3] = ~src->u[3];
3028}
3029
3030static void
3031micro_shl(union tgsi_exec_channel *dst,
3032          const union tgsi_exec_channel *src0,
3033          const union tgsi_exec_channel *src1)
3034{
3035   dst->u[0] = src0->u[0] << src1->u[0];
3036   dst->u[1] = src0->u[1] << src1->u[1];
3037   dst->u[2] = src0->u[2] << src1->u[2];
3038   dst->u[3] = src0->u[3] << src1->u[3];
3039}
3040
3041static void
3042micro_and(union tgsi_exec_channel *dst,
3043          const union tgsi_exec_channel *src0,
3044          const union tgsi_exec_channel *src1)
3045{
3046   dst->u[0] = src0->u[0] & src1->u[0];
3047   dst->u[1] = src0->u[1] & src1->u[1];
3048   dst->u[2] = src0->u[2] & src1->u[2];
3049   dst->u[3] = src0->u[3] & src1->u[3];
3050}
3051
3052static void
3053micro_or(union tgsi_exec_channel *dst,
3054         const union tgsi_exec_channel *src0,
3055         const union tgsi_exec_channel *src1)
3056{
3057   dst->u[0] = src0->u[0] | src1->u[0];
3058   dst->u[1] = src0->u[1] | src1->u[1];
3059   dst->u[2] = src0->u[2] | src1->u[2];
3060   dst->u[3] = src0->u[3] | src1->u[3];
3061}
3062
3063static void
3064micro_xor(union tgsi_exec_channel *dst,
3065          const union tgsi_exec_channel *src0,
3066          const union tgsi_exec_channel *src1)
3067{
3068   dst->u[0] = src0->u[0] ^ src1->u[0];
3069   dst->u[1] = src0->u[1] ^ src1->u[1];
3070   dst->u[2] = src0->u[2] ^ src1->u[2];
3071   dst->u[3] = src0->u[3] ^ src1->u[3];
3072}
3073
3074static void
3075micro_mod(union tgsi_exec_channel *dst,
3076          const union tgsi_exec_channel *src0,
3077          const union tgsi_exec_channel *src1)
3078{
3079   dst->i[0] = src0->i[0] % src1->i[0];
3080   dst->i[1] = src0->i[1] % src1->i[1];
3081   dst->i[2] = src0->i[2] % src1->i[2];
3082   dst->i[3] = src0->i[3] % src1->i[3];
3083}
3084
3085static void
3086micro_f2i(union tgsi_exec_channel *dst,
3087          const union tgsi_exec_channel *src)
3088{
3089   dst->i[0] = (int)src->f[0];
3090   dst->i[1] = (int)src->f[1];
3091   dst->i[2] = (int)src->f[2];
3092   dst->i[3] = (int)src->f[3];
3093}
3094
3095static void
3096micro_idiv(union tgsi_exec_channel *dst,
3097           const union tgsi_exec_channel *src0,
3098           const union tgsi_exec_channel *src1)
3099{
3100   dst->i[0] = src0->i[0] / src1->i[0];
3101   dst->i[1] = src0->i[1] / src1->i[1];
3102   dst->i[2] = src0->i[2] / src1->i[2];
3103   dst->i[3] = src0->i[3] / src1->i[3];
3104}
3105
3106static void
3107micro_imax(union tgsi_exec_channel *dst,
3108           const union tgsi_exec_channel *src0,
3109           const union tgsi_exec_channel *src1)
3110{
3111   dst->i[0] = src0->i[0] > src1->i[0] ? src0->i[0] : src1->i[0];
3112   dst->i[1] = src0->i[1] > src1->i[1] ? src0->i[1] : src1->i[1];
3113   dst->i[2] = src0->i[2] > src1->i[2] ? src0->i[2] : src1->i[2];
3114   dst->i[3] = src0->i[3] > src1->i[3] ? src0->i[3] : src1->i[3];
3115}
3116
3117static void
3118micro_imin(union tgsi_exec_channel *dst,
3119           const union tgsi_exec_channel *src0,
3120           const union tgsi_exec_channel *src1)
3121{
3122   dst->i[0] = src0->i[0] < src1->i[0] ? src0->i[0] : src1->i[0];
3123   dst->i[1] = src0->i[1] < src1->i[1] ? src0->i[1] : src1->i[1];
3124   dst->i[2] = src0->i[2] < src1->i[2] ? src0->i[2] : src1->i[2];
3125   dst->i[3] = src0->i[3] < src1->i[3] ? src0->i[3] : src1->i[3];
3126}
3127
3128static void
3129micro_isge(union tgsi_exec_channel *dst,
3130           const union tgsi_exec_channel *src0,
3131           const union tgsi_exec_channel *src1)
3132{
3133   dst->i[0] = src0->i[0] >= src1->i[0] ? -1 : 0;
3134   dst->i[1] = src0->i[1] >= src1->i[1] ? -1 : 0;
3135   dst->i[2] = src0->i[2] >= src1->i[2] ? -1 : 0;
3136   dst->i[3] = src0->i[3] >= src1->i[3] ? -1 : 0;
3137}
3138
3139static void
3140micro_ishr(union tgsi_exec_channel *dst,
3141           const union tgsi_exec_channel *src0,
3142           const union tgsi_exec_channel *src1)
3143{
3144   dst->i[0] = src0->i[0] >> src1->i[0];
3145   dst->i[1] = src0->i[1] >> src1->i[1];
3146   dst->i[2] = src0->i[2] >> src1->i[2];
3147   dst->i[3] = src0->i[3] >> src1->i[3];
3148}
3149
3150static void
3151micro_islt(union tgsi_exec_channel *dst,
3152           const union tgsi_exec_channel *src0,
3153           const union tgsi_exec_channel *src1)
3154{
3155   dst->i[0] = src0->i[0] < src1->i[0] ? -1 : 0;
3156   dst->i[1] = src0->i[1] < src1->i[1] ? -1 : 0;
3157   dst->i[2] = src0->i[2] < src1->i[2] ? -1 : 0;
3158   dst->i[3] = src0->i[3] < src1->i[3] ? -1 : 0;
3159}
3160
3161static void
3162micro_f2u(union tgsi_exec_channel *dst,
3163          const union tgsi_exec_channel *src)
3164{
3165   dst->u[0] = (uint)src->f[0];
3166   dst->u[1] = (uint)src->f[1];
3167   dst->u[2] = (uint)src->f[2];
3168   dst->u[3] = (uint)src->f[3];
3169}
3170
3171static void
3172micro_u2f(union tgsi_exec_channel *dst,
3173          const union tgsi_exec_channel *src)
3174{
3175   dst->f[0] = (float)src->u[0];
3176   dst->f[1] = (float)src->u[1];
3177   dst->f[2] = (float)src->u[2];
3178   dst->f[3] = (float)src->u[3];
3179}
3180
3181static void
3182micro_uadd(union tgsi_exec_channel *dst,
3183           const union tgsi_exec_channel *src0,
3184           const union tgsi_exec_channel *src1)
3185{
3186   dst->u[0] = src0->u[0] + src1->u[0];
3187   dst->u[1] = src0->u[1] + src1->u[1];
3188   dst->u[2] = src0->u[2] + src1->u[2];
3189   dst->u[3] = src0->u[3] + src1->u[3];
3190}
3191
3192static void
3193micro_udiv(union tgsi_exec_channel *dst,
3194           const union tgsi_exec_channel *src0,
3195           const union tgsi_exec_channel *src1)
3196{
3197   dst->u[0] = src0->u[0] / src1->u[0];
3198   dst->u[1] = src0->u[1] / src1->u[1];
3199   dst->u[2] = src0->u[2] / src1->u[2];
3200   dst->u[3] = src0->u[3] / src1->u[3];
3201}
3202
3203static void
3204micro_umad(union tgsi_exec_channel *dst,
3205           const union tgsi_exec_channel *src0,
3206           const union tgsi_exec_channel *src1,
3207           const union tgsi_exec_channel *src2)
3208{
3209   dst->u[0] = src0->u[0] * src1->u[0] + src2->u[0];
3210   dst->u[1] = src0->u[1] * src1->u[1] + src2->u[1];
3211   dst->u[2] = src0->u[2] * src1->u[2] + src2->u[2];
3212   dst->u[3] = src0->u[3] * src1->u[3] + src2->u[3];
3213}
3214
3215static void
3216micro_umax(union tgsi_exec_channel *dst,
3217           const union tgsi_exec_channel *src0,
3218           const union tgsi_exec_channel *src1)
3219{
3220   dst->u[0] = src0->u[0] > src1->u[0] ? src0->u[0] : src1->u[0];
3221   dst->u[1] = src0->u[1] > src1->u[1] ? src0->u[1] : src1->u[1];
3222   dst->u[2] = src0->u[2] > src1->u[2] ? src0->u[2] : src1->u[2];
3223   dst->u[3] = src0->u[3] > src1->u[3] ? src0->u[3] : src1->u[3];
3224}
3225
3226static void
3227micro_umin(union tgsi_exec_channel *dst,
3228           const union tgsi_exec_channel *src0,
3229           const union tgsi_exec_channel *src1)
3230{
3231   dst->u[0] = src0->u[0] < src1->u[0] ? src0->u[0] : src1->u[0];
3232   dst->u[1] = src0->u[1] < src1->u[1] ? src0->u[1] : src1->u[1];
3233   dst->u[2] = src0->u[2] < src1->u[2] ? src0->u[2] : src1->u[2];
3234   dst->u[3] = src0->u[3] < src1->u[3] ? src0->u[3] : src1->u[3];
3235}
3236
3237static void
3238micro_umod(union tgsi_exec_channel *dst,
3239           const union tgsi_exec_channel *src0,
3240           const union tgsi_exec_channel *src1)
3241{
3242   dst->u[0] = src0->u[0] % src1->u[0];
3243   dst->u[1] = src0->u[1] % src1->u[1];
3244   dst->u[2] = src0->u[2] % src1->u[2];
3245   dst->u[3] = src0->u[3] % src1->u[3];
3246}
3247
3248static void
3249micro_umul(union tgsi_exec_channel *dst,
3250           const union tgsi_exec_channel *src0,
3251           const union tgsi_exec_channel *src1)
3252{
3253   dst->u[0] = src0->u[0] * src1->u[0];
3254   dst->u[1] = src0->u[1] * src1->u[1];
3255   dst->u[2] = src0->u[2] * src1->u[2];
3256   dst->u[3] = src0->u[3] * src1->u[3];
3257}
3258
3259static void
3260micro_useq(union tgsi_exec_channel *dst,
3261           const union tgsi_exec_channel *src0,
3262           const union tgsi_exec_channel *src1)
3263{
3264   dst->u[0] = src0->u[0] == src1->u[0] ? ~0 : 0;
3265   dst->u[1] = src0->u[1] == src1->u[1] ? ~0 : 0;
3266   dst->u[2] = src0->u[2] == src1->u[2] ? ~0 : 0;
3267   dst->u[3] = src0->u[3] == src1->u[3] ? ~0 : 0;
3268}
3269
3270static void
3271micro_usge(union tgsi_exec_channel *dst,
3272           const union tgsi_exec_channel *src0,
3273           const union tgsi_exec_channel *src1)
3274{
3275   dst->u[0] = src0->u[0] >= src1->u[0] ? ~0 : 0;
3276   dst->u[1] = src0->u[1] >= src1->u[1] ? ~0 : 0;
3277   dst->u[2] = src0->u[2] >= src1->u[2] ? ~0 : 0;
3278   dst->u[3] = src0->u[3] >= src1->u[3] ? ~0 : 0;
3279}
3280
3281static void
3282micro_ushr(union tgsi_exec_channel *dst,
3283           const union tgsi_exec_channel *src0,
3284           const union tgsi_exec_channel *src1)
3285{
3286   dst->u[0] = src0->u[0] >> src1->u[0];
3287   dst->u[1] = src0->u[1] >> src1->u[1];
3288   dst->u[2] = src0->u[2] >> src1->u[2];
3289   dst->u[3] = src0->u[3] >> src1->u[3];
3290}
3291
3292static void
3293micro_uslt(union tgsi_exec_channel *dst,
3294           const union tgsi_exec_channel *src0,
3295           const union tgsi_exec_channel *src1)
3296{
3297   dst->u[0] = src0->u[0] < src1->u[0] ? ~0 : 0;
3298   dst->u[1] = src0->u[1] < src1->u[1] ? ~0 : 0;
3299   dst->u[2] = src0->u[2] < src1->u[2] ? ~0 : 0;
3300   dst->u[3] = src0->u[3] < src1->u[3] ? ~0 : 0;
3301}
3302
3303static void
3304micro_usne(union tgsi_exec_channel *dst,
3305           const union tgsi_exec_channel *src0,
3306           const union tgsi_exec_channel *src1)
3307{
3308   dst->u[0] = src0->u[0] != src1->u[0] ? ~0 : 0;
3309   dst->u[1] = src0->u[1] != src1->u[1] ? ~0 : 0;
3310   dst->u[2] = src0->u[2] != src1->u[2] ? ~0 : 0;
3311   dst->u[3] = src0->u[3] != src1->u[3] ? ~0 : 0;
3312}
3313
3314static void
3315exec_instruction(
3316   struct tgsi_exec_machine *mach,
3317   const struct tgsi_full_instruction *inst,
3318   int *pc )
3319{
3320   union tgsi_exec_channel r[10];
3321
3322   (*pc)++;
3323
3324   switch (inst->Instruction.Opcode) {
3325   case TGSI_OPCODE_ARL:
3326      exec_vector_unary(mach, inst, micro_arl, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_FLOAT);
3327      break;
3328
3329   case TGSI_OPCODE_MOV:
3330      exec_vector_unary(mach, inst, micro_mov, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_FLOAT);
3331      break;
3332
3333   case TGSI_OPCODE_LIT:
3334      exec_lit(mach, inst);
3335      break;
3336
3337   case TGSI_OPCODE_RCP:
3338      exec_scalar_unary(mach, inst, micro_rcp, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3339      break;
3340
3341   case TGSI_OPCODE_RSQ:
3342      exec_scalar_unary(mach, inst, micro_rsq, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3343      break;
3344
3345   case TGSI_OPCODE_EXP:
3346      exec_exp(mach, inst);
3347      break;
3348
3349   case TGSI_OPCODE_LOG:
3350      exec_log(mach, inst);
3351      break;
3352
3353   case TGSI_OPCODE_MUL:
3354      exec_vector_binary(mach, inst, micro_mul, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3355      break;
3356
3357   case TGSI_OPCODE_ADD:
3358      exec_vector_binary(mach, inst, micro_add, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3359      break;
3360
3361   case TGSI_OPCODE_DP3:
3362      exec_dp3(mach, inst);
3363      break;
3364
3365   case TGSI_OPCODE_DP4:
3366      exec_dp4(mach, inst);
3367      break;
3368
3369   case TGSI_OPCODE_DST:
3370      exec_dst(mach, inst);
3371      break;
3372
3373   case TGSI_OPCODE_MIN:
3374      exec_vector_binary(mach, inst, micro_min, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3375      break;
3376
3377   case TGSI_OPCODE_MAX:
3378      exec_vector_binary(mach, inst, micro_max, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3379      break;
3380
3381   case TGSI_OPCODE_SLT:
3382      exec_vector_binary(mach, inst, micro_slt, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3383      break;
3384
3385   case TGSI_OPCODE_SGE:
3386      exec_vector_binary(mach, inst, micro_sge, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3387      break;
3388
3389   case TGSI_OPCODE_MAD:
3390      exec_vector_trinary(mach, inst, micro_mad, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3391      break;
3392
3393   case TGSI_OPCODE_SUB:
3394      exec_vector_binary(mach, inst, micro_sub, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3395      break;
3396
3397   case TGSI_OPCODE_LRP:
3398      exec_vector_trinary(mach, inst, micro_lrp, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3399      break;
3400
3401   case TGSI_OPCODE_CND:
3402      exec_vector_trinary(mach, inst, micro_cnd, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3403      break;
3404
3405   case TGSI_OPCODE_DP2A:
3406      exec_dp2a(mach, inst);
3407      break;
3408
3409   case TGSI_OPCODE_FRC:
3410      exec_vector_unary(mach, inst, micro_frc, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3411      break;
3412
3413   case TGSI_OPCODE_CLAMP:
3414      exec_vector_trinary(mach, inst, micro_clamp, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3415      break;
3416
3417   case TGSI_OPCODE_FLR:
3418      exec_vector_unary(mach, inst, micro_flr, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3419      break;
3420
3421   case TGSI_OPCODE_ROUND:
3422      exec_vector_unary(mach, inst, micro_rnd, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3423      break;
3424
3425   case TGSI_OPCODE_EX2:
3426      exec_scalar_unary(mach, inst, micro_exp2, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3427      break;
3428
3429   case TGSI_OPCODE_LG2:
3430      exec_scalar_unary(mach, inst, micro_lg2, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3431      break;
3432
3433   case TGSI_OPCODE_POW:
3434      exec_scalar_binary(mach, inst, micro_pow, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3435      break;
3436
3437   case TGSI_OPCODE_XPD:
3438      exec_xpd(mach, inst);
3439      break;
3440
3441   case TGSI_OPCODE_ABS:
3442      exec_vector_unary(mach, inst, micro_abs, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3443      break;
3444
3445   case TGSI_OPCODE_RCC:
3446      exec_scalar_unary(mach, inst, micro_rcc, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3447      break;
3448
3449   case TGSI_OPCODE_DPH:
3450      exec_dph(mach, inst);
3451      break;
3452
3453   case TGSI_OPCODE_COS:
3454      exec_scalar_unary(mach, inst, micro_cos, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3455      break;
3456
3457   case TGSI_OPCODE_DDX:
3458      exec_vector_unary(mach, inst, micro_ddx, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3459      break;
3460
3461   case TGSI_OPCODE_DDY:
3462      exec_vector_unary(mach, inst, micro_ddy, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3463      break;
3464
3465   case TGSI_OPCODE_KILP:
3466      exec_kilp (mach, inst);
3467      break;
3468
3469   case TGSI_OPCODE_KIL:
3470      exec_kil (mach, inst);
3471      break;
3472
3473   case TGSI_OPCODE_PK2H:
3474      assert (0);
3475      break;
3476
3477   case TGSI_OPCODE_PK2US:
3478      assert (0);
3479      break;
3480
3481   case TGSI_OPCODE_PK4B:
3482      assert (0);
3483      break;
3484
3485   case TGSI_OPCODE_PK4UB:
3486      assert (0);
3487      break;
3488
3489   case TGSI_OPCODE_RFL:
3490      exec_rfl(mach, inst);
3491      break;
3492
3493   case TGSI_OPCODE_SEQ:
3494      exec_vector_binary(mach, inst, micro_seq, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3495      break;
3496
3497   case TGSI_OPCODE_SFL:
3498      exec_vector(mach, inst, micro_sfl, TGSI_EXEC_DATA_FLOAT);
3499      break;
3500
3501   case TGSI_OPCODE_SGT:
3502      exec_vector_binary(mach, inst, micro_sgt, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3503      break;
3504
3505   case TGSI_OPCODE_SIN:
3506      exec_scalar_unary(mach, inst, micro_sin, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3507      break;
3508
3509   case TGSI_OPCODE_SLE:
3510      exec_vector_binary(mach, inst, micro_sle, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3511      break;
3512
3513   case TGSI_OPCODE_SNE:
3514      exec_vector_binary(mach, inst, micro_sne, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3515      break;
3516
3517   case TGSI_OPCODE_STR:
3518      exec_vector(mach, inst, micro_str, TGSI_EXEC_DATA_FLOAT);
3519      break;
3520
3521   case TGSI_OPCODE_TEX:
3522      /* simple texture lookup */
3523      /* src[0] = texcoord */
3524      /* src[1] = sampler unit */
3525      exec_tex(mach, inst, TEX_MODIFIER_NONE);
3526      break;
3527
3528   case TGSI_OPCODE_TXB:
3529      /* Texture lookup with lod bias */
3530      /* src[0] = texcoord (src[0].w = LOD bias) */
3531      /* src[1] = sampler unit */
3532      exec_tex(mach, inst, TEX_MODIFIER_LOD_BIAS);
3533      break;
3534
3535   case TGSI_OPCODE_TXD:
      /* Texture lookup with explicit partial derivatives */
3537      /* src[0] = texcoord */
3538      /* src[1] = d[strq]/dx */
3539      /* src[2] = d[strq]/dy */
3540      /* src[3] = sampler unit */
3541      exec_txd(mach, inst);
3542      break;
3543
3544   case TGSI_OPCODE_TXL:
      /* Texture lookup with explicit LOD */
3546      /* src[0] = texcoord (src[0].w = LOD) */
3547      /* src[1] = sampler unit */
3548      exec_tex(mach, inst, TEX_MODIFIER_EXPLICIT_LOD);
3549      break;
3550
3551   case TGSI_OPCODE_TXP:
3552      /* Texture lookup with projection */
3553      /* src[0] = texcoord (src[0].w = projection) */
3554      /* src[1] = sampler unit */
3555      exec_tex(mach, inst, TEX_MODIFIER_PROJECTED);
3556      break;
3557
3558   case TGSI_OPCODE_UP2H:
3559      assert (0);
3560      break;
3561
3562   case TGSI_OPCODE_UP2US:
3563      assert (0);
3564      break;
3565
3566   case TGSI_OPCODE_UP4B:
3567      assert (0);
3568      break;
3569
3570   case TGSI_OPCODE_UP4UB:
3571      assert (0);
3572      break;
3573
3574   case TGSI_OPCODE_X2D:
3575      exec_x2d(mach, inst);
3576      break;
3577
3578   case TGSI_OPCODE_ARA:
3579      assert (0);
3580      break;
3581
3582   case TGSI_OPCODE_ARR:
3583      exec_vector_unary(mach, inst, micro_arr, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_FLOAT);
3584      break;
3585
3586   case TGSI_OPCODE_BRA:
3587      assert (0);
3588      break;
3589
3590   case TGSI_OPCODE_CAL:
3591      /* skip the call if no execution channels are enabled */
3592      if (mach->ExecMask) {
3593         /* do the call */
3594
3595         /* First, record the depths of the execution stacks.
3596          * This is important for deeply nested/looped return statements.
3597          * We have to unwind the stacks by the correct amount.  For a
3598          * real code generator, we could determine the number of entries
3599          * to pop off each stack with simple static analysis and avoid
3600          * implementing this data structure at run time.
3601          */
3602         mach->CallStack[mach->CallStackTop].CondStackTop = mach->CondStackTop;
3603         mach->CallStack[mach->CallStackTop].LoopStackTop = mach->LoopStackTop;
3604         mach->CallStack[mach->CallStackTop].ContStackTop = mach->ContStackTop;
3605         mach->CallStack[mach->CallStackTop].SwitchStackTop = mach->SwitchStackTop;
3606         mach->CallStack[mach->CallStackTop].BreakStackTop = mach->BreakStackTop;
3607         /* note that PC was already incremented above */
3608         mach->CallStack[mach->CallStackTop].ReturnAddr = *pc;
3609
3610         mach->CallStackTop++;
3611
         /* Second, push the Cond, Loop, Cont, Switch, Break and Func stacks */
3613         assert(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING);
3614         assert(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
3615         assert(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
3616         assert(mach->SwitchStackTop < TGSI_EXEC_MAX_SWITCH_NESTING);
3617         assert(mach->BreakStackTop < TGSI_EXEC_MAX_BREAK_STACK);
3618         assert(mach->FuncStackTop < TGSI_EXEC_MAX_CALL_NESTING);
3619
3620         mach->CondStack[mach->CondStackTop++] = mach->CondMask;
3621         mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask;
3622         mach->ContStack[mach->ContStackTop++] = mach->ContMask;
3623         mach->SwitchStack[mach->SwitchStackTop++] = mach->Switch;
3624         mach->BreakStack[mach->BreakStackTop++] = mach->BreakType;
3625         mach->FuncStack[mach->FuncStackTop++] = mach->FuncMask;
3626
3627         /* Finally, jump to the subroutine */
3628         *pc = inst->Label.Label;
3629      }
3630      break;
3631
3632   case TGSI_OPCODE_RET:
3633      mach->FuncMask &= ~mach->ExecMask;
3634      UPDATE_EXEC_MASK(mach);
3635
3636      if (mach->FuncMask == 0x0) {
         /* really return now (otherwise, keep executing) */
3638
3639         if (mach->CallStackTop == 0) {
3640            /* returning from main() */
3641            mach->CondStackTop = 0;
3642            mach->LoopStackTop = 0;
3643            *pc = -1;
3644            return;
3645         }
3646
3647         assert(mach->CallStackTop > 0);
3648         mach->CallStackTop--;
3649
3650         mach->CondStackTop = mach->CallStack[mach->CallStackTop].CondStackTop;
3651         mach->CondMask = mach->CondStack[mach->CondStackTop];
3652
3653         mach->LoopStackTop = mach->CallStack[mach->CallStackTop].LoopStackTop;
3654         mach->LoopMask = mach->LoopStack[mach->LoopStackTop];
3655
3656         mach->ContStackTop = mach->CallStack[mach->CallStackTop].ContStackTop;
3657         mach->ContMask = mach->ContStack[mach->ContStackTop];
3658
3659         mach->SwitchStackTop = mach->CallStack[mach->CallStackTop].SwitchStackTop;
3660         mach->Switch = mach->SwitchStack[mach->SwitchStackTop];
3661
3662         mach->BreakStackTop = mach->CallStack[mach->CallStackTop].BreakStackTop;
3663         mach->BreakType = mach->BreakStack[mach->BreakStackTop];
3664
3665         assert(mach->FuncStackTop > 0);
3666         mach->FuncMask = mach->FuncStack[--mach->FuncStackTop];
3667
3668         *pc = mach->CallStack[mach->CallStackTop].ReturnAddr;
3669
3670         UPDATE_EXEC_MASK(mach);
3671      }
3672      break;
3673
3674   case TGSI_OPCODE_SSG:
3675      exec_vector_unary(mach, inst, micro_sgn, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3676      break;
3677
3678   case TGSI_OPCODE_CMP:
3679      exec_vector_trinary(mach, inst, micro_cmp, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3680      break;
3681
3682   case TGSI_OPCODE_SCS:
3683      exec_scs(mach, inst);
3684      break;
3685
3686   case TGSI_OPCODE_NRM:
3687      exec_nrm3(mach, inst);
3688      break;
3689
3690   case TGSI_OPCODE_NRM4:
3691      exec_nrm4(mach, inst);
3692      break;
3693
3694   case TGSI_OPCODE_DIV:
3695      exec_vector_binary(mach, inst, micro_div, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3696      break;
3697
3698   case TGSI_OPCODE_DP2:
3699      exec_dp2(mach, inst);
3700      break;
3701
3702   case TGSI_OPCODE_IF:
3703      /* push CondMask */
3704      assert(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING);
3705      mach->CondStack[mach->CondStackTop++] = mach->CondMask;
3706      FETCH( &r[0], 0, CHAN_X );
3707      /* update CondMask */
3708      if( ! r[0].u[0] ) {
3709         mach->CondMask &= ~0x1;
3710      }
3711      if( ! r[0].u[1] ) {
3712         mach->CondMask &= ~0x2;
3713      }
3714      if( ! r[0].u[2] ) {
3715         mach->CondMask &= ~0x4;
3716      }
3717      if( ! r[0].u[3] ) {
3718         mach->CondMask &= ~0x8;
3719      }
3720      UPDATE_EXEC_MASK(mach);
3721      /* Todo: If CondMask==0, jump to ELSE */
3722      break;
3723
3724   case TGSI_OPCODE_ELSE:
3725      /* invert CondMask wrt previous mask */
3726      {
3727         uint prevMask;
3728         assert(mach->CondStackTop > 0);
3729         prevMask = mach->CondStack[mach->CondStackTop - 1];
3730         mach->CondMask = ~mach->CondMask & prevMask;
3731         UPDATE_EXEC_MASK(mach);
3732         /* Todo: If CondMask==0, jump to ENDIF */
3733      }
3734      break;
3735
3736   case TGSI_OPCODE_ENDIF:
3737      /* pop CondMask */
3738      assert(mach->CondStackTop > 0);
3739      mach->CondMask = mach->CondStack[--mach->CondStackTop];
3740      UPDATE_EXEC_MASK(mach);
3741      break;
3742
3743   case TGSI_OPCODE_END:
3744      /* make sure we end primitives which haven't
3745       * been explicitly emitted */
3746      conditional_emit_primitive(mach);
3747      /* halt execution */
3748      *pc = -1;
3749      break;
3750
3751   case TGSI_OPCODE_PUSHA:
3752      assert (0);
3753      break;
3754
3755   case TGSI_OPCODE_POPA:
3756      assert (0);
3757      break;
3758
3759   case TGSI_OPCODE_CEIL:
3760      exec_vector_unary(mach, inst, micro_ceil, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3761      break;
3762
3763   case TGSI_OPCODE_I2F:
3764      exec_vector_unary(mach, inst, micro_i2f, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_INT);
3765      break;
3766
3767   case TGSI_OPCODE_NOT:
3768      exec_vector_unary(mach, inst, micro_not, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3769      break;
3770
3771   case TGSI_OPCODE_TRUNC:
3772      exec_vector_unary(mach, inst, micro_trunc, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3773      break;
3774
3775   case TGSI_OPCODE_SHL:
3776      exec_vector_binary(mach, inst, micro_shl, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3777      break;
3778
3779   case TGSI_OPCODE_AND:
3780      exec_vector_binary(mach, inst, micro_and, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3781      break;
3782
3783   case TGSI_OPCODE_OR:
3784      exec_vector_binary(mach, inst, micro_or, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3785      break;
3786
3787   case TGSI_OPCODE_MOD:
3788      exec_vector_binary(mach, inst, micro_mod, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
3789      break;
3790
3791   case TGSI_OPCODE_XOR:
3792      exec_vector_binary(mach, inst, micro_xor, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3793      break;
3794
3795   case TGSI_OPCODE_SAD:
3796      assert (0);
3797      break;
3798
3799   case TGSI_OPCODE_TXF:
3800      exec_txf(mach, inst);
3801      break;
3802
3803   case TGSI_OPCODE_TXQ:
3804      exec_txq(mach, inst);
3805      break;
3806
3807   case TGSI_OPCODE_EMIT:
3808      emit_vertex(mach);
3809      break;
3810
3811   case TGSI_OPCODE_ENDPRIM:
3812      emit_primitive(mach);
3813      break;
3814
3815   case TGSI_OPCODE_BGNLOOP:
3816      /* push LoopMask and ContMasks */
3817      assert(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
3818      assert(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
3819      assert(mach->LoopLabelStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
3820      assert(mach->BreakStackTop < TGSI_EXEC_MAX_BREAK_STACK);
3821
3822      mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask;
3823      mach->ContStack[mach->ContStackTop++] = mach->ContMask;
3824      mach->LoopLabelStack[mach->LoopLabelStackTop++] = *pc - 1;
3825      mach->BreakStack[mach->BreakStackTop++] = mach->BreakType;
3826      mach->BreakType = TGSI_EXEC_BREAK_INSIDE_LOOP;
3827      break;
3828
3829   case TGSI_OPCODE_ENDLOOP:
3830      /* Restore ContMask, but don't pop */
3831      assert(mach->ContStackTop > 0);
3832      mach->ContMask = mach->ContStack[mach->ContStackTop - 1];
3833      UPDATE_EXEC_MASK(mach);
3834      if (mach->ExecMask) {
3835         /* repeat loop: jump to instruction just past BGNLOOP */
3836         assert(mach->LoopLabelStackTop > 0);
3837         *pc = mach->LoopLabelStack[mach->LoopLabelStackTop - 1] + 1;
3838      }
3839      else {
3840         /* exit loop: pop LoopMask */
3841         assert(mach->LoopStackTop > 0);
3842         mach->LoopMask = mach->LoopStack[--mach->LoopStackTop];
3843         /* pop ContMask */
3844         assert(mach->ContStackTop > 0);
3845         mach->ContMask = mach->ContStack[--mach->ContStackTop];
3846         assert(mach->LoopLabelStackTop > 0);
3847         --mach->LoopLabelStackTop;
3848
3849         mach->BreakType = mach->BreakStack[--mach->BreakStackTop];
3850      }
3851      UPDATE_EXEC_MASK(mach);
3852      break;
3853
3854   case TGSI_OPCODE_BRK:
3855      exec_break(mach);
3856      break;
3857
3858   case TGSI_OPCODE_CONT:
3859      /* turn off cont channels for each enabled exec channel */
3860      mach->ContMask &= ~mach->ExecMask;
3861      /* Todo: if mach->LoopMask == 0, jump to end of loop */
3862      UPDATE_EXEC_MASK(mach);
3863      break;
3864
3865   case TGSI_OPCODE_BGNSUB:
3866      /* no-op */
3867      break;
3868
3869   case TGSI_OPCODE_ENDSUB:
3870      /*
3871       * XXX: This really should be a no-op. We should never reach this opcode.
3872       */
3873
3874      assert(mach->CallStackTop > 0);
3875      mach->CallStackTop--;
3876
3877      mach->CondStackTop = mach->CallStack[mach->CallStackTop].CondStackTop;
3878      mach->CondMask = mach->CondStack[mach->CondStackTop];
3879
3880      mach->LoopStackTop = mach->CallStack[mach->CallStackTop].LoopStackTop;
3881      mach->LoopMask = mach->LoopStack[mach->LoopStackTop];
3882
3883      mach->ContStackTop = mach->CallStack[mach->CallStackTop].ContStackTop;
3884      mach->ContMask = mach->ContStack[mach->ContStackTop];
3885
3886      mach->SwitchStackTop = mach->CallStack[mach->CallStackTop].SwitchStackTop;
3887      mach->Switch = mach->SwitchStack[mach->SwitchStackTop];
3888
3889      mach->BreakStackTop = mach->CallStack[mach->CallStackTop].BreakStackTop;
3890      mach->BreakType = mach->BreakStack[mach->BreakStackTop];
3891
3892      assert(mach->FuncStackTop > 0);
3893      mach->FuncMask = mach->FuncStack[--mach->FuncStackTop];
3894
3895      *pc = mach->CallStack[mach->CallStackTop].ReturnAddr;
3896
3897      UPDATE_EXEC_MASK(mach);
3898      break;
3899
3900   case TGSI_OPCODE_NOP:
3901      break;
3902
3903   case TGSI_OPCODE_BREAKC:
3904      FETCH(&r[0], 0, CHAN_X);
3905      /* update CondMask */
3906      if (r[0].u[0] && (mach->ExecMask & 0x1)) {
3907         mach->LoopMask &= ~0x1;
3908      }
3909      if (r[0].u[1] && (mach->ExecMask & 0x2)) {
3910         mach->LoopMask &= ~0x2;
3911      }
3912      if (r[0].u[2] && (mach->ExecMask & 0x4)) {
3913         mach->LoopMask &= ~0x4;
3914      }
3915      if (r[0].u[3] && (mach->ExecMask & 0x8)) {
3916         mach->LoopMask &= ~0x8;
3917      }
3918      /* Todo: if mach->LoopMask == 0, jump to end of loop */
3919      UPDATE_EXEC_MASK(mach);
3920      break;
3921
3922   case TGSI_OPCODE_F2I:
3923      exec_vector_unary(mach, inst, micro_f2i, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_FLOAT);
3924      break;
3925
3926   case TGSI_OPCODE_IDIV:
3927      exec_vector_binary(mach, inst, micro_idiv, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
3928      break;
3929
3930   case TGSI_OPCODE_IMAX:
3931      exec_vector_binary(mach, inst, micro_imax, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
3932      break;
3933
3934   case TGSI_OPCODE_IMIN:
3935      exec_vector_binary(mach, inst, micro_imin, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
3936      break;
3937
3938   case TGSI_OPCODE_INEG:
3939      exec_vector_unary(mach, inst, micro_ineg, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
3940      break;
3941
3942   case TGSI_OPCODE_ISGE:
3943      exec_vector_binary(mach, inst, micro_isge, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
3944      break;
3945
3946   case TGSI_OPCODE_ISHR:
3947      exec_vector_binary(mach, inst, micro_ishr, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
3948      break;
3949
3950   case TGSI_OPCODE_ISLT:
3951      exec_vector_binary(mach, inst, micro_islt, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
3952      break;
3953
3954   case TGSI_OPCODE_F2U:
3955      exec_vector_unary(mach, inst, micro_f2u, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_FLOAT);
3956      break;
3957
3958   case TGSI_OPCODE_U2F:
3959      exec_vector_unary(mach, inst, micro_u2f, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_UINT);
3960      break;
3961
3962   case TGSI_OPCODE_UADD:
3963      exec_vector_binary(mach, inst, micro_uadd, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3964      break;
3965
3966   case TGSI_OPCODE_UDIV:
3967      exec_vector_binary(mach, inst, micro_udiv, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3968      break;
3969
3970   case TGSI_OPCODE_UMAD:
3971      exec_vector_trinary(mach, inst, micro_umad, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3972      break;
3973
3974   case TGSI_OPCODE_UMAX:
3975      exec_vector_binary(mach, inst, micro_umax, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3976      break;
3977
3978   case TGSI_OPCODE_UMIN:
3979      exec_vector_binary(mach, inst, micro_umin, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3980      break;
3981
3982   case TGSI_OPCODE_UMOD:
3983      exec_vector_binary(mach, inst, micro_umod, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3984      break;
3985
3986   case TGSI_OPCODE_UMUL:
3987      exec_vector_binary(mach, inst, micro_umul, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3988      break;
3989
3990   case TGSI_OPCODE_USEQ:
3991      exec_vector_binary(mach, inst, micro_useq, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3992      break;
3993
3994   case TGSI_OPCODE_USGE:
3995      exec_vector_binary(mach, inst, micro_usge, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3996      break;
3997
3998   case TGSI_OPCODE_USHR:
3999      exec_vector_binary(mach, inst, micro_ushr, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
4000      break;
4001
4002   case TGSI_OPCODE_USLT:
4003      exec_vector_binary(mach, inst, micro_uslt, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
4004      break;
4005
4006   case TGSI_OPCODE_USNE:
4007      exec_vector_binary(mach, inst, micro_usne, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
4008      break;
4009
4010   case TGSI_OPCODE_SWITCH:
4011      exec_switch(mach, inst);
4012      break;
4013
4014   case TGSI_OPCODE_CASE:
4015      exec_case(mach, inst);
4016      break;
4017
4018   case TGSI_OPCODE_DEFAULT:
4019      exec_default(mach);
4020      break;
4021
4022   case TGSI_OPCODE_ENDSWITCH:
4023      exec_endswitch(mach);
4024      break;
4025
4026   case TGSI_OPCODE_LOAD:
4027      assert(0);
4028      break;
4029
4030   case TGSI_OPCODE_LOAD_MS:
4031      assert(0);
4032      break;
4033
4034   case TGSI_OPCODE_SAMPLE:
4035      exec_sample(mach, inst, TEX_MODIFIER_NONE);
4036      break;
4037
4038   case TGSI_OPCODE_SAMPLE_B:
4039      exec_sample(mach, inst, TEX_MODIFIER_LOD_BIAS);
4040      break;
4041
4042   case TGSI_OPCODE_SAMPLE_C:
4043      exec_sample(mach, inst, TEX_MODIFIER_NONE);
4044      break;
4045
4046   case TGSI_OPCODE_SAMPLE_C_LZ:
4047      exec_sample(mach, inst, TEX_MODIFIER_LOD_BIAS);
4048      break;
4049
4050   case TGSI_OPCODE_SAMPLE_D:
4051      exec_sample_d(mach, inst);
4052      break;
4053
4054   case TGSI_OPCODE_SAMPLE_L:
4055      exec_sample(mach, inst, TEX_MODIFIER_EXPLICIT_LOD);
4056      break;
4057
4058   case TGSI_OPCODE_GATHER4:
4059      assert(0);
4060      break;
4061
4062   case TGSI_OPCODE_RESINFO:
4063      assert(0);
4064      break;
4065
4066   case TGSI_OPCODE_SAMPLE_POS:
4067      assert(0);
4068      break;
4069
4070   case TGSI_OPCODE_SAMPLE_INFO:
4071      assert(0);
4072      break;
4073
4074   default:
4075      assert( 0 );
4076   }
4077}
4078
4079
4080#define DEBUG_EXECUTION 0
4081
4082
4083/**
4084 * Run TGSI interpreter.
4085 * \return bitmask of "alive" quad components
4086 */
4087uint
4088tgsi_exec_machine_run( struct tgsi_exec_machine *mach )
4089{
4090   uint i;
4091   int pc = 0;
4092
4093   mach->CondMask = 0xf;
4094   mach->LoopMask = 0xf;
4095   mach->ContMask = 0xf;
4096   mach->FuncMask = 0xf;
4097   mach->ExecMask = 0xf;
4098
4099   mach->Switch.mask = 0xf;
4100
4101   assert(mach->CondStackTop == 0);
4102   assert(mach->LoopStackTop == 0);
4103   assert(mach->ContStackTop == 0);
4104   assert(mach->SwitchStackTop == 0);
4105   assert(mach->BreakStackTop == 0);
4106   assert(mach->CallStackTop == 0);
4107
4108   mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] = 0;
4109   mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] = 0;
4110
4111   if( mach->Processor == TGSI_PROCESSOR_GEOMETRY ) {
4112      mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0] = 0;
4113      mach->Primitives[0] = 0;
4114   }
4115
4116   /* execute declarations (interpolants) */
4117   for (i = 0; i < mach->NumDeclarations; i++) {
4118      exec_declaration( mach, mach->Declarations+i );
4119   }
4120
4121   {
4122#if DEBUG_EXECUTION
4123      struct tgsi_exec_vector temps[TGSI_EXEC_NUM_TEMPS + TGSI_EXEC_NUM_TEMP_EXTRAS];
4124      struct tgsi_exec_vector outputs[PIPE_MAX_ATTRIBS];
4125      uint inst = 1;
4126
4127      memcpy(temps, mach->Temps, sizeof(temps));
4128      memcpy(outputs, mach->Outputs, sizeof(outputs));
4129#endif
4130
4131      /* execute instructions, until pc is set to -1 */
4132      while (pc != -1) {
4133
4134#if DEBUG_EXECUTION
4135         uint i;
4136
4137         tgsi_dump_instruction(&mach->Instructions[pc], inst++);
4138#endif
4139
4140         assert(pc < (int) mach->NumInstructions);
4141         exec_instruction(mach, mach->Instructions + pc, &pc);
4142
4143#if DEBUG_EXECUTION
4144         for (i = 0; i < TGSI_EXEC_NUM_TEMPS + TGSI_EXEC_NUM_TEMP_EXTRAS; i++) {
4145            if (memcmp(&temps[i], &mach->Temps[i], sizeof(temps[i]))) {
4146               uint j;
4147
4148               memcpy(&temps[i], &mach->Temps[i], sizeof(temps[i]));
4149               debug_printf("TEMP[%2u] = ", i);
4150               for (j = 0; j < 4; j++) {
4151                  if (j > 0) {
4152                     debug_printf("           ");
4153                  }
4154                  debug_printf("(%6f %u, %6f %u, %6f %u, %6f %u)\n",
4155                               temps[i].xyzw[0].f[j], temps[i].xyzw[0].u[j],
4156                               temps[i].xyzw[1].f[j], temps[i].xyzw[1].u[j],
4157                               temps[i].xyzw[2].f[j], temps[i].xyzw[2].u[j],
4158                               temps[i].xyzw[3].f[j], temps[i].xyzw[3].u[j]);
4159               }
4160            }
4161         }
4162         for (i = 0; i < PIPE_MAX_ATTRIBS; i++) {
4163            if (memcmp(&outputs[i], &mach->Outputs[i], sizeof(outputs[i]))) {
4164               uint j;
4165
4166               memcpy(&outputs[i], &mach->Outputs[i], sizeof(outputs[i]));
4167               debug_printf("OUT[%2u] =  ", i);
4168               for (j = 0; j < 4; j++) {
4169                  if (j > 0) {
4170                     debug_printf("           ");
4171                  }
4172                  debug_printf("(%6f %u, %6f %u, %6f %u, %6f %u)\n",
4173                               outputs[i].xyzw[0].f[j], outputs[i].xyzw[0].u[j],
4174                               outputs[i].xyzw[1].f[j], outputs[i].xyzw[1].u[j],
4175                               outputs[i].xyzw[2].f[j], outputs[i].xyzw[2].u[j],
4176                               outputs[i].xyzw[3].f[j], outputs[i].xyzw[3].u[j]);
4177               }
4178            }
4179         }
4180#endif
4181      }
4182   }
4183
4184#if 0
4185   /* we scale from floats in [0,1] to Zbuffer ints in sp_quad_depth_test.c */
4186   if (mach->Processor == TGSI_PROCESSOR_FRAGMENT) {
4187      /*
4188       * Scale back depth component.
4189       */
4190      for (i = 0; i < 4; i++)
4191         mach->Outputs[0].xyzw[2].f[i] *= ctx->DrawBuffer->_DepthMaxF;
4192   }
4193#endif
4194
4195   /* Strictly speaking, these assertions aren't really needed but they
4196    * can potentially catch some bugs in the control flow code.
4197    */
4198   assert(mach->CondStackTop == 0);
4199   assert(mach->LoopStackTop == 0);
4200   assert(mach->ContStackTop == 0);
4201   assert(mach->SwitchStackTop == 0);
4202   assert(mach->BreakStackTop == 0);
4203   assert(mach->CallStackTop == 0);
4204
4205   return ~mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
4206}
4207