tgsi_exec.c revision 2083a276eb270b748d1c2668eb9faa5aadc8e700
1/**************************************************************************
2 *
3 * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas.
4 * All Rights Reserved.
5 * Copyright 2009-2010 VMware, Inc.  All rights Reserved.
6 *
7 * Permission is hereby granted, free of charge, to any person obtaining a
8 * copy of this software and associated documentation files (the
9 * "Software"), to deal in the Software without restriction, including
10 * without limitation the rights to use, copy, modify, merge, publish,
11 * distribute, sub license, and/or sell copies of the Software, and to
12 * permit persons to whom the Software is furnished to do so, subject to
13 * the following conditions:
14 *
15 * The above copyright notice and this permission notice (including the
16 * next paragraph) shall be included in all copies or substantial portions
17 * of the Software.
18 *
19 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
20 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
21 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
22 * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
23 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
24 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
25 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 *
27 **************************************************************************/
28
29/**
30 * TGSI interpreter/executor.
31 *
32 * Flow control information:
33 *
34 * Since we operate on 'quads' (4 pixels or 4 vertices in parallel)
35 * flow control statements (IF/ELSE/ENDIF, LOOP/ENDLOOP) require special
36 * care since a condition may be true for some quad components but false
37 * for other components.
38 *
39 * We basically execute all statements (even if they're in the part of
40 * an IF/ELSE clause that's "not taken") and use a special mask to
41 * control writing to destination registers.  This is the ExecMask.
42 * See store_dest().
43 *
 * The ExecMask is computed from five other masks (CondMask, LoopMask,
 * ContMask, Switch.mask and FuncMask; see UPDATE_EXEC_MASK) which are
 * controlled by the flow control instructions (e.g. IF/ELSE/ENDIF,
 * LOOP/ENDLOOP and CONT).
47 *
48 *
49 * Authors:
50 *   Michal Krol
51 *   Brian Paul
52 */
53
54#include "pipe/p_compiler.h"
55#include "pipe/p_state.h"
56#include "pipe/p_shader_tokens.h"
57#include "tgsi/tgsi_dump.h"
58#include "tgsi/tgsi_parse.h"
59#include "tgsi/tgsi_util.h"
60#include "tgsi_exec.h"
61#include "util/u_memory.h"
62#include "util/u_math.h"
63
64
/*
 * Non-zero selects the util_fast_exp2()/util_fast_log2()/util_fast_pow()
 * code paths in micro_exp2(), micro_lg2() and micro_pow(); zero selects
 * the powf()/logf() paths instead.
 */
65#define FAST_MATH 1
66
/*
 * Lane indices into the four elements of a tgsi_exec_channel, used by
 * micro_ddx() and micro_ddy() to pick the elements they subtract.
 * NOTE(review): the names suggest the lanes map to the pixels of a 2x2
 * quad -- confirm against the code that fills the channels.
 */
67#define TILE_TOP_LEFT     0
68#define TILE_TOP_RIGHT    1
69#define TILE_BOTTOM_LEFT  2
70#define TILE_BOTTOM_RIGHT 3
71
72static void
73micro_abs(union tgsi_exec_channel *dst,
74          const union tgsi_exec_channel *src)
75{
76   dst->f[0] = fabsf(src->f[0]);
77   dst->f[1] = fabsf(src->f[1]);
78   dst->f[2] = fabsf(src->f[2]);
79   dst->f[3] = fabsf(src->f[3]);
80}
81
82static void
83micro_arl(union tgsi_exec_channel *dst,
84          const union tgsi_exec_channel *src)
85{
86   dst->i[0] = (int)floorf(src->f[0]);
87   dst->i[1] = (int)floorf(src->f[1]);
88   dst->i[2] = (int)floorf(src->f[2]);
89   dst->i[3] = (int)floorf(src->f[3]);
90}
91
92static void
93micro_arr(union tgsi_exec_channel *dst,
94          const union tgsi_exec_channel *src)
95{
96   dst->i[0] = (int)floorf(src->f[0] + 0.5f);
97   dst->i[1] = (int)floorf(src->f[1] + 0.5f);
98   dst->i[2] = (int)floorf(src->f[2] + 0.5f);
99   dst->i[3] = (int)floorf(src->f[3] + 0.5f);
100}
101
102static void
103micro_ceil(union tgsi_exec_channel *dst,
104           const union tgsi_exec_channel *src)
105{
106   dst->f[0] = ceilf(src->f[0]);
107   dst->f[1] = ceilf(src->f[1]);
108   dst->f[2] = ceilf(src->f[2]);
109   dst->f[3] = ceilf(src->f[3]);
110}
111
112static void
113micro_clamp(union tgsi_exec_channel *dst,
114            const union tgsi_exec_channel *src0,
115            const union tgsi_exec_channel *src1,
116            const union tgsi_exec_channel *src2)
117{
118   dst->f[0] = src0->f[0] < src1->f[0] ? src1->f[0] : src0->f[0] > src2->f[0] ? src2->f[0] : src0->f[0];
119   dst->f[1] = src0->f[1] < src1->f[1] ? src1->f[1] : src0->f[1] > src2->f[1] ? src2->f[1] : src0->f[1];
120   dst->f[2] = src0->f[2] < src1->f[2] ? src1->f[2] : src0->f[2] > src2->f[2] ? src2->f[2] : src0->f[2];
121   dst->f[3] = src0->f[3] < src1->f[3] ? src1->f[3] : src0->f[3] > src2->f[3] ? src2->f[3] : src0->f[3];
122}
123
124static void
125micro_cmp(union tgsi_exec_channel *dst,
126          const union tgsi_exec_channel *src0,
127          const union tgsi_exec_channel *src1,
128          const union tgsi_exec_channel *src2)
129{
130   dst->f[0] = src0->f[0] < 0.0f ? src1->f[0] : src2->f[0];
131   dst->f[1] = src0->f[1] < 0.0f ? src1->f[1] : src2->f[1];
132   dst->f[2] = src0->f[2] < 0.0f ? src1->f[2] : src2->f[2];
133   dst->f[3] = src0->f[3] < 0.0f ? src1->f[3] : src2->f[3];
134}
135
136static void
137micro_cnd(union tgsi_exec_channel *dst,
138          const union tgsi_exec_channel *src0,
139          const union tgsi_exec_channel *src1,
140          const union tgsi_exec_channel *src2)
141{
142   dst->f[0] = src2->f[0] > 0.5f ? src0->f[0] : src1->f[0];
143   dst->f[1] = src2->f[1] > 0.5f ? src0->f[1] : src1->f[1];
144   dst->f[2] = src2->f[2] > 0.5f ? src0->f[2] : src1->f[2];
145   dst->f[3] = src2->f[3] > 0.5f ? src0->f[3] : src1->f[3];
146}
147
148static void
149micro_cos(union tgsi_exec_channel *dst,
150          const union tgsi_exec_channel *src)
151{
152   dst->f[0] = cosf(src->f[0]);
153   dst->f[1] = cosf(src->f[1]);
154   dst->f[2] = cosf(src->f[2]);
155   dst->f[3] = cosf(src->f[3]);
156}
157
158static void
159micro_ddx(union tgsi_exec_channel *dst,
160          const union tgsi_exec_channel *src)
161{
162   dst->f[0] =
163   dst->f[1] =
164   dst->f[2] =
165   dst->f[3] = src->f[TILE_BOTTOM_RIGHT] - src->f[TILE_BOTTOM_LEFT];
166}
167
168static void
169micro_ddy(union tgsi_exec_channel *dst,
170          const union tgsi_exec_channel *src)
171{
172   dst->f[0] =
173   dst->f[1] =
174   dst->f[2] =
175   dst->f[3] = src->f[TILE_BOTTOM_LEFT] - src->f[TILE_TOP_LEFT];
176}
177
178static void
179micro_exp2(union tgsi_exec_channel *dst,
180           const union tgsi_exec_channel *src)
181{
182#if FAST_MATH
183   dst->f[0] = util_fast_exp2(src->f[0]);
184   dst->f[1] = util_fast_exp2(src->f[1]);
185   dst->f[2] = util_fast_exp2(src->f[2]);
186   dst->f[3] = util_fast_exp2(src->f[3]);
187#else
188#if DEBUG
189   /* Inf is okay for this instruction, so clamp it to silence assertions. */
190   uint i;
191   union tgsi_exec_channel clamped;
192
193   for (i = 0; i < 4; i++) {
194      if (src->f[i] > 127.99999f) {
195         clamped.f[i] = 127.99999f;
196      } else if (src->f[i] < -126.99999f) {
197         clamped.f[i] = -126.99999f;
198      } else {
199         clamped.f[i] = src->f[i];
200      }
201   }
202   src = &clamped;
203#endif /* DEBUG */
204
205   dst->f[0] = powf(2.0f, src->f[0]);
206   dst->f[1] = powf(2.0f, src->f[1]);
207   dst->f[2] = powf(2.0f, src->f[2]);
208   dst->f[3] = powf(2.0f, src->f[3]);
209#endif /* FAST_MATH */
210}
211
212static void
213micro_flr(union tgsi_exec_channel *dst,
214          const union tgsi_exec_channel *src)
215{
216   dst->f[0] = floorf(src->f[0]);
217   dst->f[1] = floorf(src->f[1]);
218   dst->f[2] = floorf(src->f[2]);
219   dst->f[3] = floorf(src->f[3]);
220}
221
222static void
223micro_frc(union tgsi_exec_channel *dst,
224          const union tgsi_exec_channel *src)
225{
226   dst->f[0] = src->f[0] - floorf(src->f[0]);
227   dst->f[1] = src->f[1] - floorf(src->f[1]);
228   dst->f[2] = src->f[2] - floorf(src->f[2]);
229   dst->f[3] = src->f[3] - floorf(src->f[3]);
230}
231
232static void
233micro_iabs(union tgsi_exec_channel *dst,
234           const union tgsi_exec_channel *src)
235{
236   dst->i[0] = src->i[0] >= 0 ? src->i[0] : -src->i[0];
237   dst->i[1] = src->i[1] >= 0 ? src->i[1] : -src->i[1];
238   dst->i[2] = src->i[2] >= 0 ? src->i[2] : -src->i[2];
239   dst->i[3] = src->i[3] >= 0 ? src->i[3] : -src->i[3];
240}
241
242static void
243micro_ineg(union tgsi_exec_channel *dst,
244           const union tgsi_exec_channel *src)
245{
246   dst->i[0] = -src->i[0];
247   dst->i[1] = -src->i[1];
248   dst->i[2] = -src->i[2];
249   dst->i[3] = -src->i[3];
250}
251
252static void
253micro_lg2(union tgsi_exec_channel *dst,
254          const union tgsi_exec_channel *src)
255{
256#if FAST_MATH
257   dst->f[0] = util_fast_log2(src->f[0]);
258   dst->f[1] = util_fast_log2(src->f[1]);
259   dst->f[2] = util_fast_log2(src->f[2]);
260   dst->f[3] = util_fast_log2(src->f[3]);
261#else
262   dst->f[0] = logf(src->f[0]) * 1.442695f;
263   dst->f[1] = logf(src->f[1]) * 1.442695f;
264   dst->f[2] = logf(src->f[2]) * 1.442695f;
265   dst->f[3] = logf(src->f[3]) * 1.442695f;
266#endif
267}
268
269static void
270micro_lrp(union tgsi_exec_channel *dst,
271          const union tgsi_exec_channel *src0,
272          const union tgsi_exec_channel *src1,
273          const union tgsi_exec_channel *src2)
274{
275   dst->f[0] = src0->f[0] * (src1->f[0] - src2->f[0]) + src2->f[0];
276   dst->f[1] = src0->f[1] * (src1->f[1] - src2->f[1]) + src2->f[1];
277   dst->f[2] = src0->f[2] * (src1->f[2] - src2->f[2]) + src2->f[2];
278   dst->f[3] = src0->f[3] * (src1->f[3] - src2->f[3]) + src2->f[3];
279}
280
281static void
282micro_mad(union tgsi_exec_channel *dst,
283          const union tgsi_exec_channel *src0,
284          const union tgsi_exec_channel *src1,
285          const union tgsi_exec_channel *src2)
286{
287   dst->f[0] = src0->f[0] * src1->f[0] + src2->f[0];
288   dst->f[1] = src0->f[1] * src1->f[1] + src2->f[1];
289   dst->f[2] = src0->f[2] * src1->f[2] + src2->f[2];
290   dst->f[3] = src0->f[3] * src1->f[3] + src2->f[3];
291}
292
293static void
294micro_mov(union tgsi_exec_channel *dst,
295          const union tgsi_exec_channel *src)
296{
297   dst->u[0] = src->u[0];
298   dst->u[1] = src->u[1];
299   dst->u[2] = src->u[2];
300   dst->u[3] = src->u[3];
301}
302
303static void
304micro_rcp(union tgsi_exec_channel *dst,
305          const union tgsi_exec_channel *src)
306{
307#if 0 /* for debugging */
308   assert(src->f[0] != 0.0f);
309   assert(src->f[1] != 0.0f);
310   assert(src->f[2] != 0.0f);
311   assert(src->f[3] != 0.0f);
312#endif
313   dst->f[0] = 1.0f / src->f[0];
314   dst->f[1] = 1.0f / src->f[1];
315   dst->f[2] = 1.0f / src->f[2];
316   dst->f[3] = 1.0f / src->f[3];
317}
318
319static void
320micro_rnd(union tgsi_exec_channel *dst,
321          const union tgsi_exec_channel *src)
322{
323   dst->f[0] = floorf(src->f[0] + 0.5f);
324   dst->f[1] = floorf(src->f[1] + 0.5f);
325   dst->f[2] = floorf(src->f[2] + 0.5f);
326   dst->f[3] = floorf(src->f[3] + 0.5f);
327}
328
329static void
330micro_rsq(union tgsi_exec_channel *dst,
331          const union tgsi_exec_channel *src)
332{
333#if 0 /* for debugging */
334   assert(src->f[0] != 0.0f);
335   assert(src->f[1] != 0.0f);
336   assert(src->f[2] != 0.0f);
337   assert(src->f[3] != 0.0f);
338#endif
339   dst->f[0] = 1.0f / sqrtf(fabsf(src->f[0]));
340   dst->f[1] = 1.0f / sqrtf(fabsf(src->f[1]));
341   dst->f[2] = 1.0f / sqrtf(fabsf(src->f[2]));
342   dst->f[3] = 1.0f / sqrtf(fabsf(src->f[3]));
343}
344
345static void
346micro_seq(union tgsi_exec_channel *dst,
347          const union tgsi_exec_channel *src0,
348          const union tgsi_exec_channel *src1)
349{
350   dst->f[0] = src0->f[0] == src1->f[0] ? 1.0f : 0.0f;
351   dst->f[1] = src0->f[1] == src1->f[1] ? 1.0f : 0.0f;
352   dst->f[2] = src0->f[2] == src1->f[2] ? 1.0f : 0.0f;
353   dst->f[3] = src0->f[3] == src1->f[3] ? 1.0f : 0.0f;
354}
355
356static void
357micro_sge(union tgsi_exec_channel *dst,
358          const union tgsi_exec_channel *src0,
359          const union tgsi_exec_channel *src1)
360{
361   dst->f[0] = src0->f[0] >= src1->f[0] ? 1.0f : 0.0f;
362   dst->f[1] = src0->f[1] >= src1->f[1] ? 1.0f : 0.0f;
363   dst->f[2] = src0->f[2] >= src1->f[2] ? 1.0f : 0.0f;
364   dst->f[3] = src0->f[3] >= src1->f[3] ? 1.0f : 0.0f;
365}
366
367static void
368micro_sgn(union tgsi_exec_channel *dst,
369          const union tgsi_exec_channel *src)
370{
371   dst->f[0] = src->f[0] < 0.0f ? -1.0f : src->f[0] > 0.0f ? 1.0f : 0.0f;
372   dst->f[1] = src->f[1] < 0.0f ? -1.0f : src->f[1] > 0.0f ? 1.0f : 0.0f;
373   dst->f[2] = src->f[2] < 0.0f ? -1.0f : src->f[2] > 0.0f ? 1.0f : 0.0f;
374   dst->f[3] = src->f[3] < 0.0f ? -1.0f : src->f[3] > 0.0f ? 1.0f : 0.0f;
375}
376
377static void
378micro_sgt(union tgsi_exec_channel *dst,
379          const union tgsi_exec_channel *src0,
380          const union tgsi_exec_channel *src1)
381{
382   dst->f[0] = src0->f[0] > src1->f[0] ? 1.0f : 0.0f;
383   dst->f[1] = src0->f[1] > src1->f[1] ? 1.0f : 0.0f;
384   dst->f[2] = src0->f[2] > src1->f[2] ? 1.0f : 0.0f;
385   dst->f[3] = src0->f[3] > src1->f[3] ? 1.0f : 0.0f;
386}
387
388static void
389micro_sin(union tgsi_exec_channel *dst,
390          const union tgsi_exec_channel *src)
391{
392   dst->f[0] = sinf(src->f[0]);
393   dst->f[1] = sinf(src->f[1]);
394   dst->f[2] = sinf(src->f[2]);
395   dst->f[3] = sinf(src->f[3]);
396}
397
398static void
399micro_sle(union tgsi_exec_channel *dst,
400          const union tgsi_exec_channel *src0,
401          const union tgsi_exec_channel *src1)
402{
403   dst->f[0] = src0->f[0] <= src1->f[0] ? 1.0f : 0.0f;
404   dst->f[1] = src0->f[1] <= src1->f[1] ? 1.0f : 0.0f;
405   dst->f[2] = src0->f[2] <= src1->f[2] ? 1.0f : 0.0f;
406   dst->f[3] = src0->f[3] <= src1->f[3] ? 1.0f : 0.0f;
407}
408
409static void
410micro_slt(union tgsi_exec_channel *dst,
411          const union tgsi_exec_channel *src0,
412          const union tgsi_exec_channel *src1)
413{
414   dst->f[0] = src0->f[0] < src1->f[0] ? 1.0f : 0.0f;
415   dst->f[1] = src0->f[1] < src1->f[1] ? 1.0f : 0.0f;
416   dst->f[2] = src0->f[2] < src1->f[2] ? 1.0f : 0.0f;
417   dst->f[3] = src0->f[3] < src1->f[3] ? 1.0f : 0.0f;
418}
419
420static void
421micro_sne(union tgsi_exec_channel *dst,
422          const union tgsi_exec_channel *src0,
423          const union tgsi_exec_channel *src1)
424{
425   dst->f[0] = src0->f[0] != src1->f[0] ? 1.0f : 0.0f;
426   dst->f[1] = src0->f[1] != src1->f[1] ? 1.0f : 0.0f;
427   dst->f[2] = src0->f[2] != src1->f[2] ? 1.0f : 0.0f;
428   dst->f[3] = src0->f[3] != src1->f[3] ? 1.0f : 0.0f;
429}
430
431static void
432micro_sfl(union tgsi_exec_channel *dst)
433{
434   dst->f[0] = 0.0f;
435   dst->f[1] = 0.0f;
436   dst->f[2] = 0.0f;
437   dst->f[3] = 0.0f;
438}
439
440static void
441micro_str(union tgsi_exec_channel *dst)
442{
443   dst->f[0] = 1.0f;
444   dst->f[1] = 1.0f;
445   dst->f[2] = 1.0f;
446   dst->f[3] = 1.0f;
447}
448
449static void
450micro_trunc(union tgsi_exec_channel *dst,
451            const union tgsi_exec_channel *src)
452{
453   dst->f[0] = (float)(int)src->f[0];
454   dst->f[1] = (float)(int)src->f[1];
455   dst->f[2] = (float)(int)src->f[2];
456   dst->f[3] = (float)(int)src->f[3];
457}
458
459
/*
 * Numeric indices of the X, Y, Z and W components.
 * NOTE(review): presumably used to index tgsi_exec_vector::xyzw[] --
 * confirm against the users further down the file.
 */
460#define CHAN_X  0
461#define CHAN_Y  1
462#define CHAN_Z  2
463#define CHAN_W  3
464
/*
 * Tags naming the three views of a channel's data: float, signed integer
 * and unsigned integer.
 * NOTE(review): presumably selects which member of tgsi_exec_channel
 * (f, i or u) an operand is interpreted as -- confirm against the users.
 */
465enum tgsi_exec_datatype {
466   TGSI_EXEC_DATA_FLOAT,
467   TGSI_EXEC_DATA_INT,
468   TGSI_EXEC_DATA_UINT
469};
470
471/*
472 * Shorthand locations of various utility registers (_I = Index, _C = Channel)
473 */
/* Each name is a plain alias for the TGSI_EXEC_TEMP_* constant with the same suffix. */
474#define TEMP_KILMASK_I     TGSI_EXEC_TEMP_KILMASK_I
475#define TEMP_KILMASK_C     TGSI_EXEC_TEMP_KILMASK_C
476#define TEMP_OUTPUT_I      TGSI_EXEC_TEMP_OUTPUT_I
477#define TEMP_OUTPUT_C      TGSI_EXEC_TEMP_OUTPUT_C
478#define TEMP_PRIMITIVE_I   TGSI_EXEC_TEMP_PRIMITIVE_I
479#define TEMP_PRIMITIVE_C   TGSI_EXEC_TEMP_PRIMITIVE_C
480
481
/**
 * Recompute the execution mask of MACH as the bitwise AND of its
 * CondMask, LoopMask, ContMask, Switch.mask and FuncMask.
 *
 * MACH is a pointer expression; it is expanded several times, so it must
 * not have side effects.  The argument and the whole expansion are
 * parenthesized so the macro can be used with any pointer expression.
 */
#define UPDATE_EXEC_MASK(MACH) \
      ((MACH)->ExecMask = (MACH)->CondMask & (MACH)->LoopMask & \
                          (MACH)->ContMask & (MACH)->Switch.mask & \
                          (MACH)->FuncMask)
485
486
487static const union tgsi_exec_channel ZeroVec =
488   { { 0.0, 0.0, 0.0, 0.0 } };
489
490static const union tgsi_exec_channel OneVec = {
491   {1.0f, 1.0f, 1.0f, 1.0f}
492};
493
494static const union tgsi_exec_channel P128Vec = {
495   {128.0f, 128.0f, 128.0f, 128.0f}
496};
497
498static const union tgsi_exec_channel M128Vec = {
499   {-128.0f, -128.0f, -128.0f, -128.0f}
500};
501
502
503/**
504 * Assert that none of the float values in 'chan' are infinite or NaN.
505 * NaN and Inf may occur normally during program execution and should
506 * not lead to crashes, etc.  But when debugging, it's helpful to catch
507 * them.
508 */
509static INLINE void
510check_inf_or_nan(const union tgsi_exec_channel *chan)
511{
512   assert(!util_is_inf_or_nan((chan)->f[0]));
513   assert(!util_is_inf_or_nan((chan)->f[1]));
514   assert(!util_is_inf_or_nan((chan)->f[2]));
515   assert(!util_is_inf_or_nan((chan)->f[3]));
516}
517
518
#ifdef DEBUG
/**
 * Debug helper: print the four float lanes of 'chan', prefixed by 'msg'.
 */
static void
print_chan(const char *msg, const union tgsi_exec_channel *chan)
{
   const float *lanes = chan->f;

   debug_printf("%s = {%f, %f, %f, %f}\n",
                msg, lanes[0], lanes[1], lanes[2], lanes[3]);
}
#endif
527
528
#ifdef DEBUG
/**
 * Debug helper: print temporary register 'index' of 'mach', one line per
 * X/Y/Z/W component with its four float lanes.
 */
static void
print_temp(const struct tgsi_exec_machine *mach, uint index)
{
   static const char componentNames[] = "XYZW";
   const struct tgsi_exec_vector *vec = &mach->Temps[index];
   int comp;

   debug_printf("Temp[%u] =\n", index);
   for (comp = 0; comp < 4; comp++) {
      const float *lanes = vec->xyzw[comp].f;

      debug_printf("  %c: { %f, %f, %f, %f }\n",
                   componentNames[comp],
                   lanes[0], lanes[1], lanes[2], lanes[3]);
   }
}
#endif
546
547
548void
549tgsi_exec_set_constant_buffers(struct tgsi_exec_machine *mach,
550                               unsigned num_bufs,
551                               const void **bufs,
552                               const unsigned *buf_sizes)
553{
554   unsigned i;
555
556   for (i = 0; i < num_bufs; i++) {
557      mach->Consts[i] = bufs[i];
558      mach->ConstsSize[i] = buf_sizes[i];
559   }
560}
561
562
563/**
564 * Check if there's a potential src/dst register data dependency when
565 * using SOA execution.
566 * Example:
567 *   MOV T, T.yxwz;
568 * This would expand into:
569 *   MOV t0, t1;
570 *   MOV t1, t0;
571 *   MOV t2, t3;
572 *   MOV t3, t2;
573 * The second instruction will have the wrong value for t0 if executed as-is.
574 */
575boolean
576tgsi_check_soa_dependencies(const struct tgsi_full_instruction *inst)
577{
578   uint i, chan;
579
580   uint writemask = inst->Dst[0].Register.WriteMask;
581   if (writemask == TGSI_WRITEMASK_X ||
582       writemask == TGSI_WRITEMASK_Y ||
583       writemask == TGSI_WRITEMASK_Z ||
584       writemask == TGSI_WRITEMASK_W ||
585       writemask == TGSI_WRITEMASK_NONE) {
586      /* no chance of data dependency */
587      return FALSE;
588   }
589
590   /* loop over src regs */
591   for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
592      if ((inst->Src[i].Register.File ==
593           inst->Dst[0].Register.File) &&
594          ((inst->Src[i].Register.Index ==
595            inst->Dst[0].Register.Index) ||
596           inst->Src[i].Register.Indirect ||
597           inst->Dst[0].Register.Indirect)) {
598         /* loop over dest channels */
599         uint channelsWritten = 0x0;
600         for (chan = 0; chan < NUM_CHANNELS; chan++) {
601            if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
602               /* check if we're reading a channel that's been written */
603               uint swizzle = tgsi_util_get_full_src_register_swizzle(&inst->Src[i], chan);
604               if (channelsWritten & (1 << swizzle)) {
605                  return TRUE;
606               }
607
608               channelsWritten |= (1 << chan);
609            }
610         }
611      }
612   }
613   return FALSE;
614}
615
616
617/**
618 * Initialize machine state by expanding tokens to full instructions,
619 * allocating temporary storage, setting up constants, etc.
620 * After this, we can call tgsi_exec_machine_run() many times.
621 */
622void
623tgsi_exec_machine_bind_shader(
624   struct tgsi_exec_machine *mach,
625   const struct tgsi_token *tokens,
626   uint numSamplers,
627   struct tgsi_sampler **samplers)
628{
629   uint k;
630   struct tgsi_parse_context parse;
631   struct tgsi_full_instruction *instructions;
632   struct tgsi_full_declaration *declarations;
633   uint maxInstructions = 10, numInstructions = 0;
634   uint maxDeclarations = 10, numDeclarations = 0;
635
636#if 0
637   tgsi_dump(tokens, 0);
638#endif
639
640   util_init_math();
641
642   if (numSamplers) {
643      assert(samplers);
644   }
645
646   mach->Tokens = tokens;
647   mach->Samplers = samplers;
648
649   if (!tokens) {
650      /* unbind and free all */
651      if (mach->Declarations) {
652         FREE( mach->Declarations );
653      }
654      mach->Declarations = NULL;
655      mach->NumDeclarations = 0;
656
657      if (mach->Instructions) {
658         FREE( mach->Instructions );
659      }
660      mach->Instructions = NULL;
661      mach->NumInstructions = 0;
662
663      return;
664   }
665
666   k = tgsi_parse_init (&parse, mach->Tokens);
667   if (k != TGSI_PARSE_OK) {
668      debug_printf( "Problem parsing!\n" );
669      return;
670   }
671
672   mach->Processor = parse.FullHeader.Processor.Processor;
673   mach->ImmLimit = 0;
674
675   if (mach->Processor == TGSI_PROCESSOR_GEOMETRY &&
676       !mach->UsedGeometryShader) {
677      struct tgsi_exec_vector *inputs =
678         align_malloc(sizeof(struct tgsi_exec_vector) *
679                      TGSI_MAX_PRIM_VERTICES * PIPE_MAX_ATTRIBS,
680                      16);
681      struct tgsi_exec_vector *outputs =
682         align_malloc(sizeof(struct tgsi_exec_vector) *
683                      TGSI_MAX_TOTAL_VERTICES, 16);
684
685      if (!inputs)
686         return;
687      if (!outputs) {
688         align_free(inputs);
689         return;
690      }
691
692      align_free(mach->Inputs);
693      align_free(mach->Outputs);
694
695      mach->Inputs = inputs;
696      mach->Outputs = outputs;
697      mach->UsedGeometryShader = TRUE;
698   }
699
700   declarations = (struct tgsi_full_declaration *)
701      MALLOC( maxDeclarations * sizeof(struct tgsi_full_declaration) );
702
703   if (!declarations) {
704      return;
705   }
706
707   instructions = (struct tgsi_full_instruction *)
708      MALLOC( maxInstructions * sizeof(struct tgsi_full_instruction) );
709
710   if (!instructions) {
711      FREE( declarations );
712      return;
713   }
714
715   while( !tgsi_parse_end_of_tokens( &parse ) ) {
716      uint i;
717
718      tgsi_parse_token( &parse );
719      switch( parse.FullToken.Token.Type ) {
720      case TGSI_TOKEN_TYPE_DECLARATION:
721         /* save expanded declaration */
722         if (numDeclarations == maxDeclarations) {
723            declarations = REALLOC(declarations,
724                                   maxDeclarations
725                                   * sizeof(struct tgsi_full_declaration),
726                                   (maxDeclarations + 10)
727                                   * sizeof(struct tgsi_full_declaration));
728            maxDeclarations += 10;
729         }
730         if (parse.FullToken.FullDeclaration.Declaration.File == TGSI_FILE_OUTPUT) {
731            unsigned reg;
732            for (reg = parse.FullToken.FullDeclaration.Range.First;
733                 reg <= parse.FullToken.FullDeclaration.Range.Last;
734                 ++reg) {
735               ++mach->NumOutputs;
736            }
737         }
738         if (parse.FullToken.FullDeclaration.Declaration.File ==
739             TGSI_FILE_IMMEDIATE_ARRAY) {
740            unsigned reg;
741            struct tgsi_full_declaration *decl =
742               &parse.FullToken.FullDeclaration;
743            debug_assert(decl->Range.Last < TGSI_EXEC_NUM_IMMEDIATES);
744            for (reg = decl->Range.First; reg <= decl->Range.Last; ++reg) {
745               for( i = 0; i < 4; i++ ) {
746                  int idx = reg * 4 + i;
747                  mach->ImmArray[reg][i] = decl->ImmediateData.u[idx].Float;
748               }
749            }
750         }
751         memcpy(declarations + numDeclarations,
752                &parse.FullToken.FullDeclaration,
753                sizeof(declarations[0]));
754         numDeclarations++;
755         break;
756
757      case TGSI_TOKEN_TYPE_IMMEDIATE:
758         {
759            uint size = parse.FullToken.FullImmediate.Immediate.NrTokens - 1;
760            assert( size <= 4 );
761            assert( mach->ImmLimit + 1 <= TGSI_EXEC_NUM_IMMEDIATES );
762
763            for( i = 0; i < size; i++ ) {
764               mach->Imms[mach->ImmLimit][i] =
765		  parse.FullToken.FullImmediate.u[i].Float;
766            }
767            mach->ImmLimit += 1;
768         }
769         break;
770
771      case TGSI_TOKEN_TYPE_INSTRUCTION:
772
773         /* save expanded instruction */
774         if (numInstructions == maxInstructions) {
775            instructions = REALLOC(instructions,
776                                   maxInstructions
777                                   * sizeof(struct tgsi_full_instruction),
778                                   (maxInstructions + 10)
779                                   * sizeof(struct tgsi_full_instruction));
780            maxInstructions += 10;
781         }
782
783         memcpy(instructions + numInstructions,
784                &parse.FullToken.FullInstruction,
785                sizeof(instructions[0]));
786
787         numInstructions++;
788         break;
789
790      case TGSI_TOKEN_TYPE_PROPERTY:
791         break;
792
793      default:
794         assert( 0 );
795      }
796   }
797   tgsi_parse_free (&parse);
798
799   if (mach->Declarations) {
800      FREE( mach->Declarations );
801   }
802   mach->Declarations = declarations;
803   mach->NumDeclarations = numDeclarations;
804
805   if (mach->Instructions) {
806      FREE( mach->Instructions );
807   }
808   mach->Instructions = instructions;
809   mach->NumInstructions = numInstructions;
810}
811
812
813struct tgsi_exec_machine *
814tgsi_exec_machine_create( void )
815{
816   struct tgsi_exec_machine *mach;
817   uint i;
818
819   mach = align_malloc( sizeof *mach, 16 );
820   if (!mach)
821      goto fail;
822
823   memset(mach, 0, sizeof(*mach));
824
825   mach->Addrs = &mach->Temps[TGSI_EXEC_TEMP_ADDR];
826   mach->MaxGeometryShaderOutputs = TGSI_MAX_TOTAL_VERTICES;
827   mach->Predicates = &mach->Temps[TGSI_EXEC_TEMP_P0];
828
829   mach->Inputs = align_malloc(sizeof(struct tgsi_exec_vector) * PIPE_MAX_ATTRIBS, 16);
830   mach->Outputs = align_malloc(sizeof(struct tgsi_exec_vector) * PIPE_MAX_ATTRIBS, 16);
831   if (!mach->Inputs || !mach->Outputs)
832      goto fail;
833
834   /* Setup constants needed by the SSE2 executor. */
835   for( i = 0; i < 4; i++ ) {
836      mach->Temps[TGSI_EXEC_TEMP_00000000_I].xyzw[TGSI_EXEC_TEMP_00000000_C].u[i] = 0x00000000;
837      mach->Temps[TGSI_EXEC_TEMP_7FFFFFFF_I].xyzw[TGSI_EXEC_TEMP_7FFFFFFF_C].u[i] = 0x7FFFFFFF;
838      mach->Temps[TGSI_EXEC_TEMP_80000000_I].xyzw[TGSI_EXEC_TEMP_80000000_C].u[i] = 0x80000000;
839      mach->Temps[TGSI_EXEC_TEMP_FFFFFFFF_I].xyzw[TGSI_EXEC_TEMP_FFFFFFFF_C].u[i] = 0xFFFFFFFF;    /* not used */
840      mach->Temps[TGSI_EXEC_TEMP_ONE_I].xyzw[TGSI_EXEC_TEMP_ONE_C].f[i] = 1.0f;
841      mach->Temps[TGSI_EXEC_TEMP_TWO_I].xyzw[TGSI_EXEC_TEMP_TWO_C].f[i] = 2.0f;    /* not used */
842      mach->Temps[TGSI_EXEC_TEMP_128_I].xyzw[TGSI_EXEC_TEMP_128_C].f[i] = 128.0f;
843      mach->Temps[TGSI_EXEC_TEMP_MINUS_128_I].xyzw[TGSI_EXEC_TEMP_MINUS_128_C].f[i] = -128.0f;
844      mach->Temps[TGSI_EXEC_TEMP_THREE_I].xyzw[TGSI_EXEC_TEMP_THREE_C].f[i] = 3.0f;
845      mach->Temps[TGSI_EXEC_TEMP_HALF_I].xyzw[TGSI_EXEC_TEMP_HALF_C].f[i] = 0.5f;
846   }
847
848#ifdef DEBUG
849   /* silence warnings */
850   (void) print_chan;
851   (void) print_temp;
852#endif
853
854   return mach;
855
856fail:
857   if (mach) {
858      align_free(mach->Inputs);
859      align_free(mach->Outputs);
860      align_free(mach);
861   }
862   return NULL;
863}
864
865
866void
867tgsi_exec_machine_destroy(struct tgsi_exec_machine *mach)
868{
869   if (mach) {
870      if (mach->Instructions)
871         FREE(mach->Instructions);
872      if (mach->Declarations)
873         FREE(mach->Declarations);
874
875      align_free(mach->Inputs);
876      align_free(mach->Outputs);
877
878      align_free(mach);
879   }
880}
881
882static void
883micro_add(union tgsi_exec_channel *dst,
884          const union tgsi_exec_channel *src0,
885          const union tgsi_exec_channel *src1)
886{
887   dst->f[0] = src0->f[0] + src1->f[0];
888   dst->f[1] = src0->f[1] + src1->f[1];
889   dst->f[2] = src0->f[2] + src1->f[2];
890   dst->f[3] = src0->f[3] + src1->f[3];
891}
892
893static void
894micro_div(
895   union tgsi_exec_channel *dst,
896   const union tgsi_exec_channel *src0,
897   const union tgsi_exec_channel *src1 )
898{
899   if (src1->f[0] != 0) {
900      dst->f[0] = src0->f[0] / src1->f[0];
901   }
902   if (src1->f[1] != 0) {
903      dst->f[1] = src0->f[1] / src1->f[1];
904   }
905   if (src1->f[2] != 0) {
906      dst->f[2] = src0->f[2] / src1->f[2];
907   }
908   if (src1->f[3] != 0) {
909      dst->f[3] = src0->f[3] / src1->f[3];
910   }
911}
912
913static void
914micro_rcc(union tgsi_exec_channel *dst,
915          const union tgsi_exec_channel *src)
916{
917   uint i;
918
919   for (i = 0; i < 4; i++) {
920      float recip = 1.0f / src->f[i];
921
922      if (recip > 0.0f) {
923         if (recip > 1.884467e+019f) {
924            dst->f[i] = 1.884467e+019f;
925         }
926         else if (recip < 5.42101e-020f) {
927            dst->f[i] = 5.42101e-020f;
928         }
929         else {
930            dst->f[i] = recip;
931         }
932      }
933      else {
934         if (recip < -1.884467e+019f) {
935            dst->f[i] = -1.884467e+019f;
936         }
937         else if (recip > -5.42101e-020f) {
938            dst->f[i] = -5.42101e-020f;
939         }
940         else {
941            dst->f[i] = recip;
942         }
943      }
944   }
945}
946
947static void
948micro_lt(
949   union tgsi_exec_channel *dst,
950   const union tgsi_exec_channel *src0,
951   const union tgsi_exec_channel *src1,
952   const union tgsi_exec_channel *src2,
953   const union tgsi_exec_channel *src3 )
954{
955   dst->f[0] = src0->f[0] < src1->f[0] ? src2->f[0] : src3->f[0];
956   dst->f[1] = src0->f[1] < src1->f[1] ? src2->f[1] : src3->f[1];
957   dst->f[2] = src0->f[2] < src1->f[2] ? src2->f[2] : src3->f[2];
958   dst->f[3] = src0->f[3] < src1->f[3] ? src2->f[3] : src3->f[3];
959}
960
961static void
962micro_max(union tgsi_exec_channel *dst,
963          const union tgsi_exec_channel *src0,
964          const union tgsi_exec_channel *src1)
965{
966   dst->f[0] = src0->f[0] > src1->f[0] ? src0->f[0] : src1->f[0];
967   dst->f[1] = src0->f[1] > src1->f[1] ? src0->f[1] : src1->f[1];
968   dst->f[2] = src0->f[2] > src1->f[2] ? src0->f[2] : src1->f[2];
969   dst->f[3] = src0->f[3] > src1->f[3] ? src0->f[3] : src1->f[3];
970}
971
972static void
973micro_min(union tgsi_exec_channel *dst,
974          const union tgsi_exec_channel *src0,
975          const union tgsi_exec_channel *src1)
976{
977   dst->f[0] = src0->f[0] < src1->f[0] ? src0->f[0] : src1->f[0];
978   dst->f[1] = src0->f[1] < src1->f[1] ? src0->f[1] : src1->f[1];
979   dst->f[2] = src0->f[2] < src1->f[2] ? src0->f[2] : src1->f[2];
980   dst->f[3] = src0->f[3] < src1->f[3] ? src0->f[3] : src1->f[3];
981}
982
983static void
984micro_mul(union tgsi_exec_channel *dst,
985          const union tgsi_exec_channel *src0,
986          const union tgsi_exec_channel *src1)
987{
988   dst->f[0] = src0->f[0] * src1->f[0];
989   dst->f[1] = src0->f[1] * src1->f[1];
990   dst->f[2] = src0->f[2] * src1->f[2];
991   dst->f[3] = src0->f[3] * src1->f[3];
992}
993
994static void
995micro_neg(
996   union tgsi_exec_channel *dst,
997   const union tgsi_exec_channel *src )
998{
999   dst->f[0] = -src->f[0];
1000   dst->f[1] = -src->f[1];
1001   dst->f[2] = -src->f[2];
1002   dst->f[3] = -src->f[3];
1003}
1004
1005static void
1006micro_pow(
1007   union tgsi_exec_channel *dst,
1008   const union tgsi_exec_channel *src0,
1009   const union tgsi_exec_channel *src1 )
1010{
1011#if FAST_MATH
1012   dst->f[0] = util_fast_pow( src0->f[0], src1->f[0] );
1013   dst->f[1] = util_fast_pow( src0->f[1], src1->f[1] );
1014   dst->f[2] = util_fast_pow( src0->f[2], src1->f[2] );
1015   dst->f[3] = util_fast_pow( src0->f[3], src1->f[3] );
1016#else
1017   dst->f[0] = powf( src0->f[0], src1->f[0] );
1018   dst->f[1] = powf( src0->f[1], src1->f[1] );
1019   dst->f[2] = powf( src0->f[2], src1->f[2] );
1020   dst->f[3] = powf( src0->f[3], src1->f[3] );
1021#endif
1022}
1023
1024static void
1025micro_sub(union tgsi_exec_channel *dst,
1026          const union tgsi_exec_channel *src0,
1027          const union tgsi_exec_channel *src1)
1028{
1029   dst->f[0] = src0->f[0] - src1->f[0];
1030   dst->f[1] = src0->f[1] - src1->f[1];
1031   dst->f[2] = src0->f[2] - src1->f[2];
1032   dst->f[3] = src0->f[3] - src1->f[3];
1033}
1034
1035static void
1036fetch_src_file_channel(const struct tgsi_exec_machine *mach,
1037                       const uint file,
1038                       const uint swizzle,
1039                       const union tgsi_exec_channel *index,
1040                       const union tgsi_exec_channel *index2D,
1041                       union tgsi_exec_channel *chan)
1042{
1043   uint i;
1044
1045   assert(swizzle < 4);
1046
1047   switch (file) {
1048   case TGSI_FILE_CONSTANT:
1049      for (i = 0; i < QUAD_SIZE; i++) {
1050         assert(index2D->i[i] >= 0 && index2D->i[i] < PIPE_MAX_CONSTANT_BUFFERS);
1051         assert(mach->Consts[index2D->i[i]]);
1052
1053         if (index->i[i] < 0) {
1054            chan->u[i] = 0;
1055         } else {
1056            /* NOTE: copying the const value as a uint instead of float */
1057            const uint constbuf = index2D->i[i];
1058            const uint *buf = (const uint *)mach->Consts[constbuf];
1059            const int pos = index->i[i] * 4 + swizzle;
1060            /* const buffer bounds check */
1061            if (pos < 0 || pos >= mach->ConstsSize[constbuf]) {
1062               if (0) {
1063                  /* Debug: print warning */
1064                  static int count = 0;
1065                  if (count++ < 100)
1066                     debug_printf("TGSI Exec: const buffer index %d"
1067                                  " out of bounds\n", pos);
1068               }
1069               chan->u[i] = 0;
1070            }
1071            else
1072               chan->u[i] = buf[pos];
1073         }
1074      }
1075      break;
1076
1077   case TGSI_FILE_INPUT:
1078      for (i = 0; i < QUAD_SIZE; i++) {
1079         /*
1080         if (TGSI_PROCESSOR_GEOMETRY == mach->Processor) {
1081            debug_printf("Fetching Input[%d] (2d=%d, 1d=%d)\n",
1082                         index2D->i[i] * TGSI_EXEC_MAX_INPUT_ATTRIBS + index->i[i],
1083                         index2D->i[i], index->i[i]);
1084                         }*/
1085         int pos = index2D->i[i] * TGSI_EXEC_MAX_INPUT_ATTRIBS + index->i[i];
1086         assert(pos >= 0);
1087         assert(pos < TGSI_MAX_PRIM_VERTICES * PIPE_MAX_ATTRIBS);
1088         chan->u[i] = mach->Inputs[pos].xyzw[swizzle].u[i];
1089      }
1090      break;
1091
1092   case TGSI_FILE_SYSTEM_VALUE:
1093      /* XXX no swizzling at this point.  Will be needed if we put
1094       * gl_FragCoord, for example, in a sys value register.
1095       */
1096      for (i = 0; i < QUAD_SIZE; i++) {
1097         chan->f[i] = mach->SystemValue[index->i[i]][0];
1098      }
1099      break;
1100
1101   case TGSI_FILE_TEMPORARY:
1102      for (i = 0; i < QUAD_SIZE; i++) {
1103         assert(index->i[i] < TGSI_EXEC_NUM_TEMPS);
1104         assert(index2D->i[i] == 0);
1105
1106         chan->u[i] = mach->Temps[index->i[i]].xyzw[swizzle].u[i];
1107      }
1108      break;
1109
1110   case TGSI_FILE_TEMPORARY_ARRAY:
1111      for (i = 0; i < QUAD_SIZE; i++) {
1112         assert(index->i[i] < TGSI_EXEC_NUM_TEMPS);
1113         assert(index2D->i[i] < TGSI_EXEC_NUM_TEMP_ARRAYS);
1114
1115         chan->u[i] =
1116            mach->TempArray[index2D->i[i]][index->i[i]].xyzw[swizzle].u[i];
1117      }
1118      break;
1119
1120   case TGSI_FILE_IMMEDIATE:
1121      for (i = 0; i < QUAD_SIZE; i++) {
1122         assert(index->i[i] >= 0 && index->i[i] < (int)mach->ImmLimit);
1123         assert(index2D->i[i] == 0);
1124
1125         chan->f[i] = mach->Imms[index->i[i]][swizzle];
1126      }
1127      break;
1128
1129   case TGSI_FILE_IMMEDIATE_ARRAY:
1130      for (i = 0; i < QUAD_SIZE; i++) {
1131         assert(index2D->i[i] == 0);
1132
1133         chan->f[i] = mach->ImmArray[index->i[i]][swizzle];
1134      }
1135      break;
1136
1137   case TGSI_FILE_ADDRESS:
1138      for (i = 0; i < QUAD_SIZE; i++) {
1139         assert(index->i[i] >= 0);
1140         assert(index2D->i[i] == 0);
1141
1142         chan->u[i] = mach->Addrs[index->i[i]].xyzw[swizzle].u[i];
1143      }
1144      break;
1145
1146   case TGSI_FILE_PREDICATE:
1147      for (i = 0; i < QUAD_SIZE; i++) {
1148         assert(index->i[i] >= 0 && index->i[i] < TGSI_EXEC_NUM_PREDS);
1149         assert(index2D->i[i] == 0);
1150
1151         chan->u[i] = mach->Predicates[0].xyzw[swizzle].u[i];
1152      }
1153      break;
1154
1155   case TGSI_FILE_OUTPUT:
1156      /* vertex/fragment output vars can be read too */
1157      for (i = 0; i < QUAD_SIZE; i++) {
1158         assert(index->i[i] >= 0);
1159         assert(index2D->i[i] == 0);
1160
1161         chan->u[i] = mach->Outputs[index->i[i]].xyzw[swizzle].u[i];
1162      }
1163      break;
1164
1165   default:
1166      assert(0);
1167      for (i = 0; i < QUAD_SIZE; i++) {
1168         chan->u[i] = 0;
1169      }
1170   }
1171}
1172
1173static void
1174fetch_source(const struct tgsi_exec_machine *mach,
1175             union tgsi_exec_channel *chan,
1176             const struct tgsi_full_src_register *reg,
1177             const uint chan_index,
1178             enum tgsi_exec_datatype src_datatype)
1179{
1180   union tgsi_exec_channel index;
1181   union tgsi_exec_channel index2D;
1182   uint swizzle;
1183
1184   /* We start with a direct index into a register file.
1185    *
1186    *    file[1],
1187    *    where:
1188    *       file = Register.File
1189    *       [1] = Register.Index
1190    */
1191   index.i[0] =
1192   index.i[1] =
1193   index.i[2] =
1194   index.i[3] = reg->Register.Index;
1195
1196   /* There is an extra source register that indirectly subscripts
1197    * a register file. The direct index now becomes an offset
1198    * that is being added to the indirect register.
1199    *
1200    *    file[ind[2].x+1],
1201    *    where:
1202    *       ind = Indirect.File
1203    *       [2] = Indirect.Index
1204    *       .x = Indirect.SwizzleX
1205    */
1206   if (reg->Register.Indirect) {
1207      union tgsi_exec_channel index2;
1208      union tgsi_exec_channel indir_index;
1209      const uint execmask = mach->ExecMask;
1210      uint i;
1211
1212      /* which address register (always zero now) */
1213      index2.i[0] =
1214      index2.i[1] =
1215      index2.i[2] =
1216      index2.i[3] = reg->Indirect.Index;
1217      assert(reg->Indirect.File == TGSI_FILE_ADDRESS);
1218      /* get current value of address register[swizzle] */
1219      swizzle = tgsi_util_get_src_register_swizzle( &reg->Indirect, CHAN_X );
1220      fetch_src_file_channel(mach,
1221                             reg->Indirect.File,
1222                             swizzle,
1223                             &index2,
1224                             &ZeroVec,
1225                             &indir_index);
1226
1227      /* add value of address register to the offset */
1228      index.i[0] += indir_index.i[0];
1229      index.i[1] += indir_index.i[1];
1230      index.i[2] += indir_index.i[2];
1231      index.i[3] += indir_index.i[3];
1232
1233      /* for disabled execution channels, zero-out the index to
1234       * avoid using a potential garbage value.
1235       */
1236      for (i = 0; i < QUAD_SIZE; i++) {
1237         if ((execmask & (1 << i)) == 0)
1238            index.i[i] = 0;
1239      }
1240   }
1241
1242   /* There is an extra source register that is a second
1243    * subscript to a register file. Effectively it means that
1244    * the register file is actually a 2D array of registers.
1245    *
1246    *    file[3][1],
1247    *    where:
1248    *       [3] = Dimension.Index
1249    */
1250   if (reg->Register.Dimension) {
1251      index2D.i[0] =
1252      index2D.i[1] =
1253      index2D.i[2] =
1254      index2D.i[3] = reg->Dimension.Index;
1255
1256      /* Again, the second subscript index can be addressed indirectly
1257       * identically to the first one.
1258       * Nothing stops us from indirectly addressing the indirect register,
1259       * but there is no need for that, so we won't exercise it.
1260       *
1261       *    file[ind[4].y+3][1],
1262       *    where:
1263       *       ind = DimIndirect.File
1264       *       [4] = DimIndirect.Index
1265       *       .y = DimIndirect.SwizzleX
1266       */
1267      if (reg->Dimension.Indirect) {
1268         union tgsi_exec_channel index2;
1269         union tgsi_exec_channel indir_index;
1270         const uint execmask = mach->ExecMask;
1271         uint i;
1272
1273         index2.i[0] =
1274         index2.i[1] =
1275         index2.i[2] =
1276         index2.i[3] = reg->DimIndirect.Index;
1277
1278         swizzle = tgsi_util_get_src_register_swizzle( &reg->DimIndirect, CHAN_X );
1279         fetch_src_file_channel(mach,
1280                                reg->DimIndirect.File,
1281                                swizzle,
1282                                &index2,
1283                                &ZeroVec,
1284                                &indir_index);
1285
1286         index2D.i[0] += indir_index.i[0];
1287         index2D.i[1] += indir_index.i[1];
1288         index2D.i[2] += indir_index.i[2];
1289         index2D.i[3] += indir_index.i[3];
1290
1291         /* for disabled execution channels, zero-out the index to
1292          * avoid using a potential garbage value.
1293          */
1294         for (i = 0; i < QUAD_SIZE; i++) {
1295            if ((execmask & (1 << i)) == 0) {
1296               index2D.i[i] = 0;
1297            }
1298         }
1299      }
1300
1301      /* If by any chance there was a need for a 3D array of register
1302       * files, we would have to check whether Dimension is followed
1303       * by a dimension register and continue the saga.
1304       */
1305   } else {
1306      index2D.i[0] =
1307      index2D.i[1] =
1308      index2D.i[2] =
1309      index2D.i[3] = 0;
1310   }
1311
1312   swizzle = tgsi_util_get_full_src_register_swizzle( reg, chan_index );
1313   fetch_src_file_channel(mach,
1314                          reg->Register.File,
1315                          swizzle,
1316                          &index,
1317                          &index2D,
1318                          chan);
1319
1320   if (reg->Register.Absolute) {
1321      if (src_datatype == TGSI_EXEC_DATA_FLOAT) {
1322         micro_abs(chan, chan);
1323      } else {
1324         micro_iabs(chan, chan);
1325      }
1326   }
1327
1328   if (reg->Register.Negate) {
1329      if (src_datatype == TGSI_EXEC_DATA_FLOAT) {
1330         micro_neg(chan, chan);
1331      } else {
1332         micro_ineg(chan, chan);
1333      }
1334   }
1335}
1336
1337static void
1338store_dest(struct tgsi_exec_machine *mach,
1339           const union tgsi_exec_channel *chan,
1340           const struct tgsi_full_dst_register *reg,
1341           const struct tgsi_full_instruction *inst,
1342           uint chan_index,
1343           enum tgsi_exec_datatype dst_datatype)
1344{
1345   uint i;
1346   union tgsi_exec_channel null;
1347   union tgsi_exec_channel *dst;
1348   union tgsi_exec_channel index2D;
1349   uint execmask = mach->ExecMask;
1350   int offset = 0;  /* indirection offset */
1351   int index;
1352
1353   /* for debugging */
1354   if (0 && dst_datatype == TGSI_EXEC_DATA_FLOAT) {
1355      check_inf_or_nan(chan);
1356   }
1357
1358   /* There is an extra source register that indirectly subscripts
1359    * a register file. The direct index now becomes an offset
1360    * that is being added to the indirect register.
1361    *
1362    *    file[ind[2].x+1],
1363    *    where:
1364    *       ind = Indirect.File
1365    *       [2] = Indirect.Index
1366    *       .x = Indirect.SwizzleX
1367    */
1368   if (reg->Register.Indirect) {
1369      union tgsi_exec_channel index;
1370      union tgsi_exec_channel indir_index;
1371      uint swizzle;
1372
1373      /* which address register (always zero for now) */
1374      index.i[0] =
1375      index.i[1] =
1376      index.i[2] =
1377      index.i[3] = reg->Indirect.Index;
1378
1379      /* get current value of address register[swizzle] */
1380      swizzle = tgsi_util_get_src_register_swizzle( &reg->Indirect, CHAN_X );
1381
1382      /* fetch values from the address/indirection register */
1383      fetch_src_file_channel(mach,
1384                             reg->Indirect.File,
1385                             swizzle,
1386                             &index,
1387                             &ZeroVec,
1388                             &indir_index);
1389
1390      /* save indirection offset */
1391      offset = indir_index.i[0];
1392   }
1393
1394   /* There is an extra source register that is a second
1395    * subscript to a register file. Effectively it means that
1396    * the register file is actually a 2D array of registers.
1397    *
1398    *    file[3][1],
1399    *    where:
1400    *       [3] = Dimension.Index
1401    */
1402   if (reg->Register.Dimension) {
1403      index2D.i[0] =
1404      index2D.i[1] =
1405      index2D.i[2] =
1406      index2D.i[3] = reg->Dimension.Index;
1407
1408      /* Again, the second subscript index can be addressed indirectly
1409       * identically to the first one.
1410       * Nothing stops us from indirectly addressing the indirect register,
1411       * but there is no need for that, so we won't exercise it.
1412       *
1413       *    file[ind[4].y+3][1],
1414       *    where:
1415       *       ind = DimIndirect.File
1416       *       [4] = DimIndirect.Index
1417       *       .y = DimIndirect.SwizzleX
1418       */
1419      if (reg->Dimension.Indirect) {
1420         union tgsi_exec_channel index2;
1421         union tgsi_exec_channel indir_index;
1422         const uint execmask = mach->ExecMask;
1423         unsigned swizzle;
1424         uint i;
1425
1426         index2.i[0] =
1427         index2.i[1] =
1428         index2.i[2] =
1429         index2.i[3] = reg->DimIndirect.Index;
1430
1431         swizzle = tgsi_util_get_src_register_swizzle( &reg->DimIndirect, CHAN_X );
1432         fetch_src_file_channel(mach,
1433                                reg->DimIndirect.File,
1434                                swizzle,
1435                                &index2,
1436                                &ZeroVec,
1437                                &indir_index);
1438
1439         index2D.i[0] += indir_index.i[0];
1440         index2D.i[1] += indir_index.i[1];
1441         index2D.i[2] += indir_index.i[2];
1442         index2D.i[3] += indir_index.i[3];
1443
1444         /* for disabled execution channels, zero-out the index to
1445          * avoid using a potential garbage value.
1446          */
1447         for (i = 0; i < QUAD_SIZE; i++) {
1448            if ((execmask & (1 << i)) == 0) {
1449               index2D.i[i] = 0;
1450            }
1451         }
1452      }
1453
1454      /* If by any chance there was a need for a 3D array of register
1455       * files, we would have to check whether Dimension is followed
1456       * by a dimension register and continue the saga.
1457       */
1458   } else {
1459      index2D.i[0] =
1460      index2D.i[1] =
1461      index2D.i[2] =
1462      index2D.i[3] = 0;
1463   }
1464
1465   switch (reg->Register.File) {
1466   case TGSI_FILE_NULL:
1467      dst = &null;
1468      break;
1469
1470   case TGSI_FILE_OUTPUT:
1471      index = mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0]
1472         + reg->Register.Index;
1473      dst = &mach->Outputs[offset + index].xyzw[chan_index];
1474#if 0
1475      if (TGSI_PROCESSOR_GEOMETRY == mach->Processor) {
1476         fprintf(stderr, "STORING OUT[%d] mask(%d), = (", offset + index, execmask);
1477         for (i = 0; i < QUAD_SIZE; i++)
1478            if (execmask & (1 << i))
1479               fprintf(stderr, "%f, ", chan->f[i]);
1480         fprintf(stderr, ")\n");
1481      }
1482#endif
1483      break;
1484
1485   case TGSI_FILE_TEMPORARY:
1486      index = reg->Register.Index;
1487      assert( index < TGSI_EXEC_NUM_TEMPS );
1488      dst = &mach->Temps[offset + index].xyzw[chan_index];
1489      break;
1490
1491   case TGSI_FILE_TEMPORARY_ARRAY:
1492      index = reg->Register.Index;
1493      assert( index < TGSI_EXEC_NUM_TEMPS );
1494      assert( index2D.i[0] < TGSI_EXEC_NUM_TEMP_ARRAYS );
1495      /* XXX we use index2D.i[0] here but somehow we might
1496       * end up with someone trying to store indirectly in
1497       * different buffers */
1498      dst = &mach->TempArray[index2D.i[0]][offset + index].xyzw[chan_index];
1499      break;
1500
1501   case TGSI_FILE_ADDRESS:
1502      index = reg->Register.Index;
1503      dst = &mach->Addrs[index].xyzw[chan_index];
1504      break;
1505
1506   case TGSI_FILE_PREDICATE:
1507      index = reg->Register.Index;
1508      assert(index < TGSI_EXEC_NUM_PREDS);
1509      dst = &mach->Predicates[index].xyzw[chan_index];
1510      break;
1511
1512   default:
1513      assert( 0 );
1514      return;
1515   }
1516
1517   if (inst->Instruction.Predicate) {
1518      uint swizzle;
1519      union tgsi_exec_channel *pred;
1520
1521      switch (chan_index) {
1522      case CHAN_X:
1523         swizzle = inst->Predicate.SwizzleX;
1524         break;
1525      case CHAN_Y:
1526         swizzle = inst->Predicate.SwizzleY;
1527         break;
1528      case CHAN_Z:
1529         swizzle = inst->Predicate.SwizzleZ;
1530         break;
1531      case CHAN_W:
1532         swizzle = inst->Predicate.SwizzleW;
1533         break;
1534      default:
1535         assert(0);
1536         return;
1537      }
1538
1539      assert(inst->Predicate.Index == 0);
1540
1541      pred = &mach->Predicates[inst->Predicate.Index].xyzw[swizzle];
1542
1543      if (inst->Predicate.Negate) {
1544         for (i = 0; i < QUAD_SIZE; i++) {
1545            if (pred->u[i]) {
1546               execmask &= ~(1 << i);
1547            }
1548         }
1549      } else {
1550         for (i = 0; i < QUAD_SIZE; i++) {
1551            if (!pred->u[i]) {
1552               execmask &= ~(1 << i);
1553            }
1554         }
1555      }
1556   }
1557
1558   switch (inst->Instruction.Saturate) {
1559   case TGSI_SAT_NONE:
1560      for (i = 0; i < QUAD_SIZE; i++)
1561         if (execmask & (1 << i))
1562            dst->i[i] = chan->i[i];
1563      break;
1564
1565   case TGSI_SAT_ZERO_ONE:
1566      for (i = 0; i < QUAD_SIZE; i++)
1567         if (execmask & (1 << i)) {
1568            if (chan->f[i] < 0.0f)
1569               dst->f[i] = 0.0f;
1570            else if (chan->f[i] > 1.0f)
1571               dst->f[i] = 1.0f;
1572            else
1573               dst->i[i] = chan->i[i];
1574         }
1575      break;
1576
1577   case TGSI_SAT_MINUS_PLUS_ONE:
1578      for (i = 0; i < QUAD_SIZE; i++)
1579         if (execmask & (1 << i)) {
1580            if (chan->f[i] < -1.0f)
1581               dst->f[i] = -1.0f;
1582            else if (chan->f[i] > 1.0f)
1583               dst->f[i] = 1.0f;
1584            else
1585               dst->i[i] = chan->i[i];
1586         }
1587      break;
1588
1589   default:
1590      assert( 0 );
1591   }
1592}
1593
/*
 * Shorthands for fetch_source() on the current instruction.  Both expect
 * `mach` and `inst` to be in scope at the point of use.  DST receives
 * channel CHAN_INDEX of source operand SRC_INDEX.
 *
 * FETCH applies the operand's abs/negate modifiers as float operations,
 * IFETCH applies them as integer operations.
 */
#define FETCH(DST, SRC_INDEX, CHAN_INDEX) \
   fetch_source(mach, DST, &inst->Src[SRC_INDEX], CHAN_INDEX, TGSI_EXEC_DATA_FLOAT)

#define IFETCH(DST, SRC_INDEX, CHAN_INDEX) \
   fetch_source(mach, DST, &inst->Src[SRC_INDEX], CHAN_INDEX, TGSI_EXEC_DATA_INT)
1599
1600
1601/**
1602 * Execute ARB-style KIL which is predicated by a src register.
1603 * Kill fragment if any of the four values is less than zero.
1604 */
1605static void
1606exec_kil(struct tgsi_exec_machine *mach,
1607         const struct tgsi_full_instruction *inst)
1608{
1609   uint uniquemask;
1610   uint chan_index;
1611   uint kilmask = 0; /* bit 0 = pixel 0, bit 1 = pixel 1, etc */
1612   union tgsi_exec_channel r[1];
1613
1614   /* This mask stores component bits that were already tested. */
1615   uniquemask = 0;
1616
1617   for (chan_index = 0; chan_index < 4; chan_index++)
1618   {
1619      uint swizzle;
1620      uint i;
1621
1622      /* unswizzle channel */
1623      swizzle = tgsi_util_get_full_src_register_swizzle (
1624                        &inst->Src[0],
1625                        chan_index);
1626
1627      /* check if the component has not been already tested */
1628      if (uniquemask & (1 << swizzle))
1629         continue;
1630      uniquemask |= 1 << swizzle;
1631
1632      FETCH(&r[0], 0, chan_index);
1633      for (i = 0; i < 4; i++)
1634         if (r[0].f[i] < 0.0f)
1635            kilmask |= 1 << i;
1636   }
1637
1638   mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] |= kilmask;
1639}
1640
1641/**
1642 * Execute NVIDIA-style KIL which is predicated by a condition code.
1643 * Kill fragment if the condition code is TRUE.
1644 */
1645static void
1646exec_kilp(struct tgsi_exec_machine *mach,
1647          const struct tgsi_full_instruction *inst)
1648{
1649   uint kilmask; /* bit 0 = pixel 0, bit 1 = pixel 1, etc */
1650
1651   /* "unconditional" kil */
1652   kilmask = mach->ExecMask;
1653   mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] |= kilmask;
1654}
1655
1656static void
1657emit_vertex(struct tgsi_exec_machine *mach)
1658{
1659   /* FIXME: check for exec mask correctly
1660   unsigned i;
1661   for (i = 0; i < QUAD_SIZE; ++i) {
1662         if ((mach->ExecMask & (1 << i)))
1663   */
1664   if (mach->ExecMask) {
1665      mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] += mach->NumOutputs;
1666      mach->Primitives[mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]]++;
1667   }
1668}
1669
1670static void
1671emit_primitive(struct tgsi_exec_machine *mach)
1672{
1673   unsigned *prim_count = &mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0];
1674   /* FIXME: check for exec mask correctly
1675   unsigned i;
1676   for (i = 0; i < QUAD_SIZE; ++i) {
1677         if ((mach->ExecMask & (1 << i)))
1678   */
1679   if (mach->ExecMask) {
1680      ++(*prim_count);
1681      debug_assert((*prim_count * mach->NumOutputs) < mach->MaxGeometryShaderOutputs);
1682      mach->Primitives[*prim_count] = 0;
1683   }
1684}
1685
1686static void
1687conditional_emit_primitive(struct tgsi_exec_machine *mach)
1688{
1689   if (TGSI_PROCESSOR_GEOMETRY == mach->Processor) {
1690      int emitted_verts =
1691         mach->Primitives[mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]];
1692      if (emitted_verts) {
1693         emit_primitive(mach);
1694      }
1695   }
1696}
1697
1698
1699/*
1700 * Fetch four texture samples using STR texture coordinates.
1701 */
1702static void
1703fetch_texel( struct tgsi_sampler *sampler,
1704             const union tgsi_exec_channel *s,
1705             const union tgsi_exec_channel *t,
1706             const union tgsi_exec_channel *p,
1707             const union tgsi_exec_channel *c0,
1708             enum tgsi_sampler_control control,
1709             union tgsi_exec_channel *r,
1710             union tgsi_exec_channel *g,
1711             union tgsi_exec_channel *b,
1712             union tgsi_exec_channel *a )
1713{
1714   uint j;
1715   float rgba[NUM_CHANNELS][QUAD_SIZE];
1716
1717   sampler->get_samples(sampler, s->f, t->f, p->f, c0->f, control, rgba);
1718
1719   for (j = 0; j < 4; j++) {
1720      r->f[j] = rgba[0][j];
1721      g->f[j] = rgba[1][j];
1722      b->f[j] = rgba[2][j];
1723      a->f[j] = rgba[3][j];
1724   }
1725}
1726
1727
/* Values for the `modifier` argument of exec_tex(). */
enum {
   TEX_MODIFIER_NONE         = 0,  /* plain lookup */
   TEX_MODIFIER_PROJECTED    = 1,  /* coordinates are divided by W */
   TEX_MODIFIER_LOD_BIAS     = 2,  /* W is passed as lod bias */
   TEX_MODIFIER_EXPLICIT_LOD = 3   /* W is passed as explicit lod */
};
1732
1733
1734static void
1735exec_tex(struct tgsi_exec_machine *mach,
1736         const struct tgsi_full_instruction *inst,
1737         uint modifier)
1738{
1739   const uint unit = inst->Src[1].Register.Index;
1740   union tgsi_exec_channel r[4];
1741   const union tgsi_exec_channel *lod = &ZeroVec;
1742   enum tgsi_sampler_control control;
1743   uint chan;
1744
1745   if (modifier != TEX_MODIFIER_NONE) {
1746      FETCH(&r[3], 0, CHAN_W);
1747      if (modifier != TEX_MODIFIER_PROJECTED) {
1748         lod = &r[3];
1749      }
1750   }
1751
1752   if (modifier == TEX_MODIFIER_EXPLICIT_LOD) {
1753      control = tgsi_sampler_lod_explicit;
1754   } else {
1755      control = tgsi_sampler_lod_bias;
1756   }
1757
1758   switch (inst->Texture.Texture) {
1759   case TGSI_TEXTURE_1D:
1760   case TGSI_TEXTURE_SHADOW1D:
1761      FETCH(&r[0], 0, CHAN_X);
1762
1763      if (modifier == TEX_MODIFIER_PROJECTED) {
1764         micro_div(&r[0], &r[0], &r[3]);
1765      }
1766
1767      fetch_texel(mach->Samplers[unit],
1768                  &r[0], &ZeroVec, &ZeroVec, lod,  /* S, T, P, LOD */
1769                  control,
1770                  &r[0], &r[1], &r[2], &r[3]);     /* R, G, B, A */
1771      break;
1772
1773   case TGSI_TEXTURE_2D:
1774   case TGSI_TEXTURE_RECT:
1775   case TGSI_TEXTURE_SHADOW2D:
1776   case TGSI_TEXTURE_SHADOWRECT:
1777      FETCH(&r[0], 0, CHAN_X);
1778      FETCH(&r[1], 0, CHAN_Y);
1779      FETCH(&r[2], 0, CHAN_Z);
1780
1781      if (modifier == TEX_MODIFIER_PROJECTED) {
1782         micro_div(&r[0], &r[0], &r[3]);
1783         micro_div(&r[1], &r[1], &r[3]);
1784         micro_div(&r[2], &r[2], &r[3]);
1785      }
1786
1787      fetch_texel(mach->Samplers[unit],
1788                  &r[0], &r[1], &r[2], lod,     /* S, T, P, LOD */
1789                  control,
1790                  &r[0], &r[1], &r[2], &r[3]);  /* outputs */
1791      break;
1792
1793   case TGSI_TEXTURE_1D_ARRAY:
1794      FETCH(&r[0], 0, CHAN_X);
1795      FETCH(&r[1], 0, CHAN_Y);
1796
1797      if (modifier == TEX_MODIFIER_PROJECTED) {
1798         micro_div(&r[0], &r[0], &r[3]);
1799      }
1800
1801      fetch_texel(mach->Samplers[unit],
1802                  &r[0], &r[1], &r[2], lod,     /* S, T, P, LOD */
1803                  control,
1804                  &r[0], &r[1], &r[2], &r[3]);  /* outputs */
1805      break;
1806
1807   case TGSI_TEXTURE_2D_ARRAY:
1808      FETCH(&r[0], 0, CHAN_X);
1809      FETCH(&r[1], 0, CHAN_Y);
1810      FETCH(&r[2], 0, CHAN_Z);
1811
1812      if (modifier == TEX_MODIFIER_PROJECTED) {
1813         micro_div(&r[0], &r[0], &r[3]);
1814         micro_div(&r[1], &r[1], &r[3]);
1815      }
1816
1817      fetch_texel(mach->Samplers[unit],
1818                  &r[0], &r[1], &r[2], lod,     /* S, T, P, LOD */
1819                  control,
1820                  &r[0], &r[1], &r[2], &r[3]);  /* outputs */
1821      break;
1822
1823   case TGSI_TEXTURE_3D:
1824   case TGSI_TEXTURE_CUBE:
1825      FETCH(&r[0], 0, CHAN_X);
1826      FETCH(&r[1], 0, CHAN_Y);
1827      FETCH(&r[2], 0, CHAN_Z);
1828
1829      if (modifier == TEX_MODIFIER_PROJECTED) {
1830         micro_div(&r[0], &r[0], &r[3]);
1831         micro_div(&r[1], &r[1], &r[3]);
1832         micro_div(&r[2], &r[2], &r[3]);
1833      }
1834
1835      fetch_texel(mach->Samplers[unit],
1836                  &r[0], &r[1], &r[2], lod,
1837                  control,
1838                  &r[0], &r[1], &r[2], &r[3]);
1839      break;
1840
1841   default:
1842      assert(0);
1843   }
1844
1845#if 0
1846   debug_printf("fetch r: %g %g %g %g\n",
1847         r[0].f[0], r[0].f[1], r[0].f[2], r[0].f[3]);
1848   debug_printf("fetch g: %g %g %g %g\n",
1849         r[1].f[0], r[1].f[1], r[1].f[2], r[1].f[3]);
1850   debug_printf("fetch b: %g %g %g %g\n",
1851         r[2].f[0], r[2].f[1], r[2].f[2], r[2].f[3]);
1852   debug_printf("fetch a: %g %g %g %g\n",
1853         r[3].f[0], r[3].f[1], r[3].f[2], r[3].f[3]);
1854#endif
1855
1856   for (chan = 0; chan < NUM_CHANNELS; chan++) {
1857      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
1858         store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
1859      }
1860   }
1861}
1862
1863static void
1864exec_txd(struct tgsi_exec_machine *mach,
1865         const struct tgsi_full_instruction *inst)
1866{
1867   const uint unit = inst->Src[3].Register.Index;
1868   union tgsi_exec_channel r[4];
1869   uint chan;
1870
1871   /*
1872    * XXX: This is fake TXD -- the derivatives are not taken into account, yet.
1873    */
1874
1875   switch (inst->Texture.Texture) {
1876   case TGSI_TEXTURE_1D:
1877   case TGSI_TEXTURE_SHADOW1D:
1878
1879      FETCH(&r[0], 0, CHAN_X);
1880
1881      fetch_texel(mach->Samplers[unit],
1882                  &r[0], &ZeroVec, &ZeroVec, &ZeroVec,   /* S, T, P, BIAS */
1883                  tgsi_sampler_lod_bias,
1884                  &r[0], &r[1], &r[2], &r[3]);           /* R, G, B, A */
1885      break;
1886
1887   case TGSI_TEXTURE_2D:
1888   case TGSI_TEXTURE_RECT:
1889   case TGSI_TEXTURE_SHADOW2D:
1890   case TGSI_TEXTURE_SHADOWRECT:
1891
1892      FETCH(&r[0], 0, CHAN_X);
1893      FETCH(&r[1], 0, CHAN_Y);
1894      FETCH(&r[2], 0, CHAN_Z);
1895
1896      fetch_texel(mach->Samplers[unit],
1897                  &r[0], &r[1], &r[2], &ZeroVec,   /* inputs */
1898                  tgsi_sampler_lod_bias,
1899                  &r[0], &r[1], &r[2], &r[3]);     /* outputs */
1900      break;
1901
1902   case TGSI_TEXTURE_3D:
1903   case TGSI_TEXTURE_CUBE:
1904
1905      FETCH(&r[0], 0, CHAN_X);
1906      FETCH(&r[1], 0, CHAN_Y);
1907      FETCH(&r[2], 0, CHAN_Z);
1908
1909      fetch_texel(mach->Samplers[unit],
1910                  &r[0], &r[1], &r[2], &ZeroVec,
1911                  tgsi_sampler_lod_bias,
1912                  &r[0], &r[1], &r[2], &r[3]);
1913      break;
1914
1915   default:
1916      assert(0);
1917   }
1918
1919   for (chan = 0; chan < NUM_CHANNELS; chan++) {
1920      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
1921         store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
1922      }
1923   }
1924}
1925
1926
1927static void
1928exec_txf(struct tgsi_exec_machine *mach,
1929	 const struct tgsi_full_instruction *inst)
1930{
1931   struct tgsi_sampler *sampler;
1932   const uint unit = inst->Src[2].Register.Index;
1933   union tgsi_exec_channel r[4];
1934   union tgsi_exec_channel offset[3];
1935   uint chan;
1936   float rgba[NUM_CHANNELS][QUAD_SIZE];
1937   int j;
1938   int8_t offsets[3];
1939
1940   if (inst->Texture.NumOffsets == 1) {
1941      union tgsi_exec_channel index;
1942      index.i[0] = index.i[1] = index.i[2] = index.i[3] = inst->TexOffsets[0].Index;
1943      fetch_src_file_channel(mach, inst->TexOffsets[0].File,
1944                             inst->TexOffsets[0].SwizzleX, &index, &ZeroVec, &offset[0]);
1945      fetch_src_file_channel(mach, inst->TexOffsets[0].File,
1946                             inst->TexOffsets[0].SwizzleY, &index, &ZeroVec, &offset[1]);
1947      fetch_src_file_channel(mach, inst->TexOffsets[0].File,
1948                             inst->TexOffsets[0].SwizzleZ, &index, &ZeroVec, &offset[2]);
1949     offsets[0] = offset[0].i[0];
1950     offsets[1] = offset[1].i[0];
1951     offsets[2] = offset[2].i[0];
1952   } else
1953     offsets[0] = offsets[1] = offsets[2] = 0;
1954
1955   IFETCH(&r[3], 0, CHAN_W);
1956
1957   switch(inst->Texture.Texture) {
1958   case TGSI_TEXTURE_3D:
1959   case TGSI_TEXTURE_2D_ARRAY:
1960      IFETCH(&r[2], 0, CHAN_Z);
1961      /* fallthrough */
1962   case TGSI_TEXTURE_2D:
1963   case TGSI_TEXTURE_RECT:
1964   case TGSI_TEXTURE_SHADOW2D:
1965   case TGSI_TEXTURE_SHADOWRECT:
1966   case TGSI_TEXTURE_1D_ARRAY:
1967      IFETCH(&r[1], 0, CHAN_Y);
1968      /* fallthrough */
1969   case TGSI_TEXTURE_1D:
1970   case TGSI_TEXTURE_SHADOW1D:
1971      IFETCH(&r[0], 0, CHAN_X);
1972      break;
1973   default:
1974      assert(0);
1975      break;
1976   }
1977
1978   sampler = mach->Samplers[unit];
1979   sampler->get_texel(sampler, r[0].i, r[1].i, r[2].i, r[3].i,
1980		      offsets, rgba);
1981
1982   for (j = 0; j < QUAD_SIZE; j++) {
1983      r[0].f[j] = rgba[0][j];
1984      r[1].f[j] = rgba[1][j];
1985      r[2].f[j] = rgba[2][j];
1986      r[3].f[j] = rgba[3][j];
1987   }
1988
1989   for (chan = 0; chan < NUM_CHANNELS; chan++) {
1990      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
1991         store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
1992      }
1993   }
1994}
1995
1996static void
1997exec_txq(struct tgsi_exec_machine *mach,
1998         const struct tgsi_full_instruction *inst)
1999{
2000   struct tgsi_sampler *sampler;
2001   const uint unit = inst->Src[1].Register.Index;
2002   int result[4];
2003   union tgsi_exec_channel r[4], src;
2004   uint chan;
2005   int i,j;
2006
2007   fetch_source(mach, &src, &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_INT);
2008   sampler = mach->Samplers[unit];
2009
2010   sampler->get_dims(sampler, src.i[0], result);
2011
2012   for (i = 0; i < QUAD_SIZE; i++) {
2013      for (j = 0; j < 4; j++) {
2014	 r[j].i[i] = result[j];
2015      }
2016   }
2017
2018   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2019      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2020	 store_dest(mach, &r[chan], &inst->Dst[0], inst, chan,
2021		    TGSI_EXEC_DATA_INT);
2022      }
2023   }
2024}
2025
2026static void
2027exec_sample(struct tgsi_exec_machine *mach,
2028            const struct tgsi_full_instruction *inst,
2029            uint modifier)
2030{
2031   const uint resource_unit = inst->Src[1].Register.Index;
2032   const uint sampler_unit = inst->Src[2].Register.Index;
2033   union tgsi_exec_channel r[4];
2034   const union tgsi_exec_channel *lod = &ZeroVec;
2035   enum tgsi_sampler_control control;
2036   uint chan;
2037
2038   if (modifier != TEX_MODIFIER_NONE) {
2039      if (modifier == TEX_MODIFIER_LOD_BIAS)
2040         FETCH(&r[3], 3, CHAN_X);
2041      else /*TEX_MODIFIER_LOD*/
2042         FETCH(&r[3], 0, CHAN_W);
2043
2044      if (modifier != TEX_MODIFIER_PROJECTED) {
2045         lod = &r[3];
2046      }
2047   }
2048
2049   if (modifier == TEX_MODIFIER_EXPLICIT_LOD) {
2050      control = tgsi_sampler_lod_explicit;
2051   } else {
2052      control = tgsi_sampler_lod_bias;
2053   }
2054
2055   switch (mach->Resources[resource_unit].Resource) {
2056   case TGSI_TEXTURE_1D:
2057   case TGSI_TEXTURE_SHADOW1D:
2058      FETCH(&r[0], 0, CHAN_X);
2059
2060      if (modifier == TEX_MODIFIER_PROJECTED) {
2061         micro_div(&r[0], &r[0], &r[3]);
2062      }
2063
2064      fetch_texel(mach->Samplers[sampler_unit],
2065                  &r[0], &ZeroVec, &ZeroVec, lod,  /* S, T, P, LOD */
2066                  control,
2067                  &r[0], &r[1], &r[2], &r[3]);     /* R, G, B, A */
2068      break;
2069
2070   case TGSI_TEXTURE_2D:
2071   case TGSI_TEXTURE_RECT:
2072   case TGSI_TEXTURE_SHADOW2D:
2073   case TGSI_TEXTURE_SHADOWRECT:
2074      FETCH(&r[0], 0, CHAN_X);
2075      FETCH(&r[1], 0, CHAN_Y);
2076      FETCH(&r[2], 0, CHAN_Z);
2077
2078      if (modifier == TEX_MODIFIER_PROJECTED) {
2079         micro_div(&r[0], &r[0], &r[3]);
2080         micro_div(&r[1], &r[1], &r[3]);
2081         micro_div(&r[2], &r[2], &r[3]);
2082      }
2083
2084      fetch_texel(mach->Samplers[sampler_unit],
2085                  &r[0], &r[1], &r[2], lod,     /* S, T, P, LOD */
2086                  control,
2087                  &r[0], &r[1], &r[2], &r[3]);  /* outputs */
2088      break;
2089
2090   case TGSI_TEXTURE_3D:
2091   case TGSI_TEXTURE_CUBE:
2092      FETCH(&r[0], 0, CHAN_X);
2093      FETCH(&r[1], 0, CHAN_Y);
2094      FETCH(&r[2], 0, CHAN_Z);
2095
2096      if (modifier == TEX_MODIFIER_PROJECTED) {
2097         micro_div(&r[0], &r[0], &r[3]);
2098         micro_div(&r[1], &r[1], &r[3]);
2099         micro_div(&r[2], &r[2], &r[3]);
2100      }
2101
2102      fetch_texel(mach->Samplers[sampler_unit],
2103                  &r[0], &r[1], &r[2], lod,
2104                  control,
2105                  &r[0], &r[1], &r[2], &r[3]);
2106      break;
2107
2108   default:
2109      assert(0);
2110   }
2111
2112   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2113      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2114         store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2115      }
2116   }
2117}
2118
2119static void
2120exec_sample_d(struct tgsi_exec_machine *mach,
2121              const struct tgsi_full_instruction *inst)
2122{
2123   const uint resource_unit = inst->Src[1].Register.Index;
2124   const uint sampler_unit = inst->Src[2].Register.Index;
2125   union tgsi_exec_channel r[4];
2126   uint chan;
2127   /*
2128    * XXX: This is fake SAMPLE_D -- the derivatives are not taken into account, yet.
2129    */
2130
2131   switch (mach->Resources[resource_unit].Resource) {
2132   case TGSI_TEXTURE_1D:
2133   case TGSI_TEXTURE_SHADOW1D:
2134
2135      FETCH(&r[0], 0, CHAN_X);
2136
2137      fetch_texel(mach->Samplers[sampler_unit],
2138                  &r[0], &ZeroVec, &ZeroVec, &ZeroVec,   /* S, T, P, BIAS */
2139                  tgsi_sampler_lod_bias,
2140                  &r[0], &r[1], &r[2], &r[3]);           /* R, G, B, A */
2141      break;
2142
2143   case TGSI_TEXTURE_2D:
2144   case TGSI_TEXTURE_RECT:
2145   case TGSI_TEXTURE_SHADOW2D:
2146   case TGSI_TEXTURE_SHADOWRECT:
2147
2148      FETCH(&r[0], 0, CHAN_X);
2149      FETCH(&r[1], 0, CHAN_Y);
2150      FETCH(&r[2], 0, CHAN_Z);
2151
2152      fetch_texel(mach->Samplers[sampler_unit],
2153                  &r[0], &r[1], &r[2], &ZeroVec,   /* inputs */
2154                  tgsi_sampler_lod_bias,
2155                  &r[0], &r[1], &r[2], &r[3]);     /* outputs */
2156      break;
2157
2158   case TGSI_TEXTURE_3D:
2159   case TGSI_TEXTURE_CUBE:
2160
2161      FETCH(&r[0], 0, CHAN_X);
2162      FETCH(&r[1], 0, CHAN_Y);
2163      FETCH(&r[2], 0, CHAN_Z);
2164
2165      fetch_texel(mach->Samplers[sampler_unit],
2166                  &r[0], &r[1], &r[2], &ZeroVec,
2167                  tgsi_sampler_lod_bias,
2168                  &r[0], &r[1], &r[2], &r[3]);
2169      break;
2170
2171   default:
2172      assert(0);
2173   }
2174
2175   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2176      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2177         store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2178      }
2179   }
2180}
2181
2182
2183/**
2184 * Evaluate a constant-valued coefficient at the position of the
2185 * current quad.
2186 */
2187static void
2188eval_constant_coef(
2189   struct tgsi_exec_machine *mach,
2190   unsigned attrib,
2191   unsigned chan )
2192{
2193   unsigned i;
2194
2195   for( i = 0; i < QUAD_SIZE; i++ ) {
2196      mach->Inputs[attrib].xyzw[chan].f[i] = mach->InterpCoefs[attrib].a0[chan];
2197   }
2198}
2199
2200/**
2201 * Evaluate a linear-valued coefficient at the position of the
2202 * current quad.
2203 */
2204static void
2205eval_linear_coef(
2206   struct tgsi_exec_machine *mach,
2207   unsigned attrib,
2208   unsigned chan )
2209{
2210   const float x = mach->QuadPos.xyzw[0].f[0];
2211   const float y = mach->QuadPos.xyzw[1].f[0];
2212   const float dadx = mach->InterpCoefs[attrib].dadx[chan];
2213   const float dady = mach->InterpCoefs[attrib].dady[chan];
2214   const float a0 = mach->InterpCoefs[attrib].a0[chan] + dadx * x + dady * y;
2215   mach->Inputs[attrib].xyzw[chan].f[0] = a0;
2216   mach->Inputs[attrib].xyzw[chan].f[1] = a0 + dadx;
2217   mach->Inputs[attrib].xyzw[chan].f[2] = a0 + dady;
2218   mach->Inputs[attrib].xyzw[chan].f[3] = a0 + dadx + dady;
2219}
2220
2221/**
2222 * Evaluate a perspective-valued coefficient at the position of the
2223 * current quad.
2224 */
2225static void
2226eval_perspective_coef(
2227   struct tgsi_exec_machine *mach,
2228   unsigned attrib,
2229   unsigned chan )
2230{
2231   const float x = mach->QuadPos.xyzw[0].f[0];
2232   const float y = mach->QuadPos.xyzw[1].f[0];
2233   const float dadx = mach->InterpCoefs[attrib].dadx[chan];
2234   const float dady = mach->InterpCoefs[attrib].dady[chan];
2235   const float a0 = mach->InterpCoefs[attrib].a0[chan] + dadx * x + dady * y;
2236   const float *w = mach->QuadPos.xyzw[3].f;
2237   /* divide by W here */
2238   mach->Inputs[attrib].xyzw[chan].f[0] = a0 / w[0];
2239   mach->Inputs[attrib].xyzw[chan].f[1] = (a0 + dadx) / w[1];
2240   mach->Inputs[attrib].xyzw[chan].f[2] = (a0 + dady) / w[2];
2241   mach->Inputs[attrib].xyzw[chan].f[3] = (a0 + dadx + dady) / w[3];
2242}
2243
2244
/**
 * Signature shared by the eval_*_coef() helpers: evaluate one channel
 * of one input attribute for the current quad.
 */
typedef void (*eval_coef_func)(struct tgsi_exec_machine *mach,
                               unsigned attrib,
                               unsigned chan);
2249
2250static void
2251exec_declaration(struct tgsi_exec_machine *mach,
2252                 const struct tgsi_full_declaration *decl)
2253{
2254   if (decl->Declaration.File == TGSI_FILE_RESOURCE) {
2255      mach->Resources[decl->Range.First] = decl->Resource;
2256      return;
2257   }
2258
2259   if (mach->Processor == TGSI_PROCESSOR_FRAGMENT) {
2260      if (decl->Declaration.File == TGSI_FILE_INPUT) {
2261         uint first, last, mask;
2262
2263         first = decl->Range.First;
2264         last = decl->Range.Last;
2265         mask = decl->Declaration.UsageMask;
2266
2267         /* XXX we could remove this special-case code since
2268          * mach->InterpCoefs[first].a0 should already have the
2269          * front/back-face value.  But we should first update the
2270          * ureg code to emit the right UsageMask value (WRITEMASK_X).
2271          * Then, we could remove the tgsi_exec_machine::Face field.
2272          */
2273         /* XXX make FACE a system value */
2274         if (decl->Semantic.Name == TGSI_SEMANTIC_FACE) {
2275            uint i;
2276
2277            assert(decl->Semantic.Index == 0);
2278            assert(first == last);
2279
2280            for (i = 0; i < QUAD_SIZE; i++) {
2281               mach->Inputs[first].xyzw[0].f[i] = mach->Face;
2282            }
2283         } else {
2284            eval_coef_func eval;
2285            uint i, j;
2286
2287            switch (decl->Declaration.Interpolate) {
2288            case TGSI_INTERPOLATE_CONSTANT:
2289               eval = eval_constant_coef;
2290               break;
2291
2292            case TGSI_INTERPOLATE_LINEAR:
2293               eval = eval_linear_coef;
2294               break;
2295
2296            case TGSI_INTERPOLATE_PERSPECTIVE:
2297               eval = eval_perspective_coef;
2298               break;
2299
2300            default:
2301               assert(0);
2302               return;
2303            }
2304
2305            for (j = 0; j < NUM_CHANNELS; j++) {
2306               if (mask & (1 << j)) {
2307                  for (i = first; i <= last; i++) {
2308                     eval(mach, i, j);
2309                  }
2310               }
2311            }
2312         }
2313      }
2314   }
2315
2316   if (decl->Declaration.File == TGSI_FILE_SYSTEM_VALUE) {
2317      mach->SysSemanticToIndex[decl->Declaration.Semantic] = decl->Range.First;
2318   }
2319}
2320
2321
/** Per-channel operation that takes no source operand. */
typedef void (*micro_op)(union tgsi_exec_channel *dst);
2323
2324static void
2325exec_vector(struct tgsi_exec_machine *mach,
2326            const struct tgsi_full_instruction *inst,
2327            micro_op op,
2328            enum tgsi_exec_datatype dst_datatype)
2329{
2330   unsigned int chan;
2331
2332   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2333      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2334         union tgsi_exec_channel dst;
2335
2336         op(&dst);
2337         store_dest(mach, &dst, &inst->Dst[0], inst, chan, dst_datatype);
2338      }
2339   }
2340}
2341
/** Per-channel operation with one source operand. */
typedef void (*micro_unary_op)(union tgsi_exec_channel *dst,
                               const union tgsi_exec_channel *src);
2344
2345static void
2346exec_scalar_unary(struct tgsi_exec_machine *mach,
2347                  const struct tgsi_full_instruction *inst,
2348                  micro_unary_op op,
2349                  enum tgsi_exec_datatype dst_datatype,
2350                  enum tgsi_exec_datatype src_datatype)
2351{
2352   unsigned int chan;
2353   union tgsi_exec_channel src;
2354   union tgsi_exec_channel dst;
2355
2356   fetch_source(mach, &src, &inst->Src[0], CHAN_X, src_datatype);
2357   op(&dst, &src);
2358   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2359      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2360         store_dest(mach, &dst, &inst->Dst[0], inst, chan, dst_datatype);
2361      }
2362   }
2363}
2364
2365static void
2366exec_vector_unary(struct tgsi_exec_machine *mach,
2367                  const struct tgsi_full_instruction *inst,
2368                  micro_unary_op op,
2369                  enum tgsi_exec_datatype dst_datatype,
2370                  enum tgsi_exec_datatype src_datatype)
2371{
2372   unsigned int chan;
2373   struct tgsi_exec_vector dst;
2374
2375   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2376      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2377         union tgsi_exec_channel src;
2378
2379         fetch_source(mach, &src, &inst->Src[0], chan, src_datatype);
2380         op(&dst.xyzw[chan], &src);
2381      }
2382   }
2383   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2384      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2385         store_dest(mach, &dst.xyzw[chan], &inst->Dst[0], inst, chan, dst_datatype);
2386      }
2387   }
2388}
2389
/** Per-channel operation with two source operands. */
typedef void (*micro_binary_op)(union tgsi_exec_channel *dst,
                                const union tgsi_exec_channel *src0,
                                const union tgsi_exec_channel *src1);
2393
2394static void
2395exec_scalar_binary(struct tgsi_exec_machine *mach,
2396                   const struct tgsi_full_instruction *inst,
2397                   micro_binary_op op,
2398                   enum tgsi_exec_datatype dst_datatype,
2399                   enum tgsi_exec_datatype src_datatype)
2400{
2401   unsigned int chan;
2402   union tgsi_exec_channel src[2];
2403   union tgsi_exec_channel dst;
2404
2405   fetch_source(mach, &src[0], &inst->Src[0], CHAN_X, src_datatype);
2406   fetch_source(mach, &src[1], &inst->Src[1], CHAN_Y, src_datatype);
2407   op(&dst, &src[0], &src[1]);
2408   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2409      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2410         store_dest(mach, &dst, &inst->Dst[0], inst, chan, dst_datatype);
2411      }
2412   }
2413}
2414
2415static void
2416exec_vector_binary(struct tgsi_exec_machine *mach,
2417                   const struct tgsi_full_instruction *inst,
2418                   micro_binary_op op,
2419                   enum tgsi_exec_datatype dst_datatype,
2420                   enum tgsi_exec_datatype src_datatype)
2421{
2422   unsigned int chan;
2423   struct tgsi_exec_vector dst;
2424
2425   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2426      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2427         union tgsi_exec_channel src[2];
2428
2429         fetch_source(mach, &src[0], &inst->Src[0], chan, src_datatype);
2430         fetch_source(mach, &src[1], &inst->Src[1], chan, src_datatype);
2431         op(&dst.xyzw[chan], &src[0], &src[1]);
2432      }
2433   }
2434   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2435      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2436         store_dest(mach, &dst.xyzw[chan], &inst->Dst[0], inst, chan, dst_datatype);
2437      }
2438   }
2439}
2440
/** Per-channel operation with three source operands. */
typedef void (*micro_trinary_op)(union tgsi_exec_channel *dst,
                                 const union tgsi_exec_channel *src0,
                                 const union tgsi_exec_channel *src1,
                                 const union tgsi_exec_channel *src2);
2445
2446static void
2447exec_vector_trinary(struct tgsi_exec_machine *mach,
2448                    const struct tgsi_full_instruction *inst,
2449                    micro_trinary_op op,
2450                    enum tgsi_exec_datatype dst_datatype,
2451                    enum tgsi_exec_datatype src_datatype)
2452{
2453   unsigned int chan;
2454   struct tgsi_exec_vector dst;
2455
2456   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2457      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2458         union tgsi_exec_channel src[3];
2459
2460         fetch_source(mach, &src[0], &inst->Src[0], chan, src_datatype);
2461         fetch_source(mach, &src[1], &inst->Src[1], chan, src_datatype);
2462         fetch_source(mach, &src[2], &inst->Src[2], chan, src_datatype);
2463         op(&dst.xyzw[chan], &src[0], &src[1], &src[2]);
2464      }
2465   }
2466   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2467      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2468         store_dest(mach, &dst.xyzw[chan], &inst->Dst[0], inst, chan, dst_datatype);
2469      }
2470   }
2471}
2472
2473static void
2474exec_dp3(struct tgsi_exec_machine *mach,
2475         const struct tgsi_full_instruction *inst)
2476{
2477   unsigned int chan;
2478   union tgsi_exec_channel arg[3];
2479
2480   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2481   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2482   micro_mul(&arg[2], &arg[0], &arg[1]);
2483
2484   for (chan = CHAN_Y; chan <= CHAN_Z; chan++) {
2485      fetch_source(mach, &arg[0], &inst->Src[0], chan, TGSI_EXEC_DATA_FLOAT);
2486      fetch_source(mach, &arg[1], &inst->Src[1], chan, TGSI_EXEC_DATA_FLOAT);
2487      micro_mad(&arg[2], &arg[0], &arg[1], &arg[2]);
2488   }
2489
2490   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2491      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2492         store_dest(mach, &arg[2], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2493      }
2494   }
2495}
2496
2497static void
2498exec_dp4(struct tgsi_exec_machine *mach,
2499         const struct tgsi_full_instruction *inst)
2500{
2501   unsigned int chan;
2502   union tgsi_exec_channel arg[3];
2503
2504   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2505   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2506   micro_mul(&arg[2], &arg[0], &arg[1]);
2507
2508   for (chan = CHAN_Y; chan <= CHAN_W; chan++) {
2509      fetch_source(mach, &arg[0], &inst->Src[0], chan, TGSI_EXEC_DATA_FLOAT);
2510      fetch_source(mach, &arg[1], &inst->Src[1], chan, TGSI_EXEC_DATA_FLOAT);
2511      micro_mad(&arg[2], &arg[0], &arg[1], &arg[2]);
2512   }
2513
2514   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2515      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2516         store_dest(mach, &arg[2], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2517      }
2518   }
2519}
2520
2521static void
2522exec_dp2a(struct tgsi_exec_machine *mach,
2523          const struct tgsi_full_instruction *inst)
2524{
2525   unsigned int chan;
2526   union tgsi_exec_channel arg[3];
2527
2528   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2529   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2530   micro_mul(&arg[2], &arg[0], &arg[1]);
2531
2532   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2533   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2534   micro_mad(&arg[0], &arg[0], &arg[1], &arg[2]);
2535
2536   fetch_source(mach, &arg[1], &inst->Src[2], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2537   micro_add(&arg[0], &arg[0], &arg[1]);
2538
2539   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2540      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2541         store_dest(mach, &arg[0], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2542      }
2543   }
2544}
2545
2546static void
2547exec_dph(struct tgsi_exec_machine *mach,
2548         const struct tgsi_full_instruction *inst)
2549{
2550   unsigned int chan;
2551   union tgsi_exec_channel arg[3];
2552
2553   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2554   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2555   micro_mul(&arg[2], &arg[0], &arg[1]);
2556
2557   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2558   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2559   micro_mad(&arg[2], &arg[0], &arg[1], &arg[2]);
2560
2561   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2562   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2563   micro_mad(&arg[0], &arg[0], &arg[1], &arg[2]);
2564
2565   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_W, TGSI_EXEC_DATA_FLOAT);
2566   micro_add(&arg[0], &arg[0], &arg[1]);
2567
2568   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2569      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2570         store_dest(mach, &arg[0], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2571      }
2572   }
2573}
2574
2575static void
2576exec_dp2(struct tgsi_exec_machine *mach,
2577         const struct tgsi_full_instruction *inst)
2578{
2579   unsigned int chan;
2580   union tgsi_exec_channel arg[3];
2581
2582   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2583   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2584   micro_mul(&arg[2], &arg[0], &arg[1]);
2585
2586   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2587   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2588   micro_mad(&arg[2], &arg[0], &arg[1], &arg[2]);
2589
2590   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2591      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2592         store_dest(mach, &arg[2], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2593      }
2594   }
2595}
2596
2597static void
2598exec_nrm4(struct tgsi_exec_machine *mach,
2599          const struct tgsi_full_instruction *inst)
2600{
2601   unsigned int chan;
2602   union tgsi_exec_channel arg[4];
2603   union tgsi_exec_channel scale;
2604
2605   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2606   micro_mul(&scale, &arg[0], &arg[0]);
2607
2608   for (chan = CHAN_Y; chan <= CHAN_W; chan++) {
2609      union tgsi_exec_channel product;
2610
2611      fetch_source(mach, &arg[chan], &inst->Src[0], chan, TGSI_EXEC_DATA_FLOAT);
2612      micro_mul(&product, &arg[chan], &arg[chan]);
2613      micro_add(&scale, &scale, &product);
2614   }
2615
2616   micro_rsq(&scale, &scale);
2617
2618   for (chan = CHAN_X; chan <= CHAN_W; chan++) {
2619      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2620         micro_mul(&arg[chan], &arg[chan], &scale);
2621         store_dest(mach, &arg[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2622      }
2623   }
2624}
2625
2626static void
2627exec_nrm3(struct tgsi_exec_machine *mach,
2628          const struct tgsi_full_instruction *inst)
2629{
2630   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_XYZ) {
2631      unsigned int chan;
2632      union tgsi_exec_channel arg[3];
2633      union tgsi_exec_channel scale;
2634
2635      fetch_source(mach, &arg[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2636      micro_mul(&scale, &arg[0], &arg[0]);
2637
2638      for (chan = CHAN_Y; chan <= CHAN_Z; chan++) {
2639         union tgsi_exec_channel product;
2640
2641         fetch_source(mach, &arg[chan], &inst->Src[0], chan, TGSI_EXEC_DATA_FLOAT);
2642         micro_mul(&product, &arg[chan], &arg[chan]);
2643         micro_add(&scale, &scale, &product);
2644      }
2645
2646      micro_rsq(&scale, &scale);
2647
2648      for (chan = CHAN_X; chan <= CHAN_Z; chan++) {
2649         if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2650            micro_mul(&arg[chan], &arg[chan], &scale);
2651            store_dest(mach, &arg[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2652         }
2653      }
2654   }
2655
2656   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
2657      store_dest(mach, &OneVec, &inst->Dst[0], inst, CHAN_W, TGSI_EXEC_DATA_FLOAT);
2658   }
2659}
2660
2661static void
2662exec_scs(struct tgsi_exec_machine *mach,
2663         const struct tgsi_full_instruction *inst)
2664{
2665   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_XY) {
2666      union tgsi_exec_channel arg;
2667      union tgsi_exec_channel result;
2668
2669      fetch_source(mach, &arg, &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2670
2671      if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
2672         micro_cos(&result, &arg);
2673         store_dest(mach, &result, &inst->Dst[0], inst, CHAN_X, TGSI_EXEC_DATA_FLOAT);
2674      }
2675      if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
2676         micro_sin(&result, &arg);
2677         store_dest(mach, &result, &inst->Dst[0], inst, CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2678      }
2679   }
2680   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
2681      store_dest(mach, &ZeroVec, &inst->Dst[0], inst, CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2682   }
2683   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
2684      store_dest(mach, &OneVec, &inst->Dst[0], inst, CHAN_W, TGSI_EXEC_DATA_FLOAT);
2685   }
2686}
2687
2688static void
2689exec_x2d(struct tgsi_exec_machine *mach,
2690         const struct tgsi_full_instruction *inst)
2691{
2692   union tgsi_exec_channel r[4];
2693   union tgsi_exec_channel d[2];
2694
2695   fetch_source(mach, &r[0], &inst->Src[1], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2696   fetch_source(mach, &r[1], &inst->Src[1], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2697   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_XZ) {
2698      fetch_source(mach, &r[2], &inst->Src[2], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2699      micro_mul(&r[2], &r[2], &r[0]);
2700      fetch_source(mach, &r[3], &inst->Src[2], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2701      micro_mul(&r[3], &r[3], &r[1]);
2702      micro_add(&r[2], &r[2], &r[3]);
2703      fetch_source(mach, &r[3], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2704      micro_add(&d[0], &r[2], &r[3]);
2705   }
2706   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_YW) {
2707      fetch_source(mach, &r[2], &inst->Src[2], CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2708      micro_mul(&r[2], &r[2], &r[0]);
2709      fetch_source(mach, &r[3], &inst->Src[2], CHAN_W, TGSI_EXEC_DATA_FLOAT);
2710      micro_mul(&r[3], &r[3], &r[1]);
2711      micro_add(&r[2], &r[2], &r[3]);
2712      fetch_source(mach, &r[3], &inst->Src[0], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2713      micro_add(&d[1], &r[2], &r[3]);
2714   }
2715   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
2716      store_dest(mach, &d[0], &inst->Dst[0], inst, CHAN_X, TGSI_EXEC_DATA_FLOAT);
2717   }
2718   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
2719      store_dest(mach, &d[1], &inst->Dst[0], inst, CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2720   }
2721   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
2722      store_dest(mach, &d[0], &inst->Dst[0], inst, CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2723   }
2724   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
2725      store_dest(mach, &d[1], &inst->Dst[0], inst, CHAN_W, TGSI_EXEC_DATA_FLOAT);
2726   }
2727}
2728
2729static void
2730exec_rfl(struct tgsi_exec_machine *mach,
2731         const struct tgsi_full_instruction *inst)
2732{
2733   union tgsi_exec_channel r[9];
2734
2735   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_XYZ) {
2736      /* r0 = dp3(src0, src0) */
2737      fetch_source(mach, &r[2], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2738      micro_mul(&r[0], &r[2], &r[2]);
2739      fetch_source(mach, &r[4], &inst->Src[0], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2740      micro_mul(&r[8], &r[4], &r[4]);
2741      micro_add(&r[0], &r[0], &r[8]);
2742      fetch_source(mach, &r[6], &inst->Src[0], CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2743      micro_mul(&r[8], &r[6], &r[6]);
2744      micro_add(&r[0], &r[0], &r[8]);
2745
2746      /* r1 = dp3(src0, src1) */
2747      fetch_source(mach, &r[3], &inst->Src[1], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2748      micro_mul(&r[1], &r[2], &r[3]);
2749      fetch_source(mach, &r[5], &inst->Src[1], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2750      micro_mul(&r[8], &r[4], &r[5]);
2751      micro_add(&r[1], &r[1], &r[8]);
2752      fetch_source(mach, &r[7], &inst->Src[1], CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2753      micro_mul(&r[8], &r[6], &r[7]);
2754      micro_add(&r[1], &r[1], &r[8]);
2755
2756      /* r1 = 2 * r1 / r0 */
2757      micro_add(&r[1], &r[1], &r[1]);
2758      micro_div(&r[1], &r[1], &r[0]);
2759
2760      if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
2761         micro_mul(&r[2], &r[2], &r[1]);
2762         micro_sub(&r[2], &r[2], &r[3]);
2763         store_dest(mach, &r[2], &inst->Dst[0], inst, CHAN_X, TGSI_EXEC_DATA_FLOAT);
2764      }
2765      if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
2766         micro_mul(&r[4], &r[4], &r[1]);
2767         micro_sub(&r[4], &r[4], &r[5]);
2768         store_dest(mach, &r[4], &inst->Dst[0], inst, CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2769      }
2770      if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
2771         micro_mul(&r[6], &r[6], &r[1]);
2772         micro_sub(&r[6], &r[6], &r[7]);
2773         store_dest(mach, &r[6], &inst->Dst[0], inst, CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2774      }
2775   }
2776   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
2777      store_dest(mach, &OneVec, &inst->Dst[0], inst, CHAN_W, TGSI_EXEC_DATA_FLOAT);
2778   }
2779}
2780
2781static void
2782exec_xpd(struct tgsi_exec_machine *mach,
2783         const struct tgsi_full_instruction *inst)
2784{
2785   union tgsi_exec_channel r[6];
2786   union tgsi_exec_channel d[3];
2787
2788   fetch_source(mach, &r[0], &inst->Src[0], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2789   fetch_source(mach, &r[1], &inst->Src[1], CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2790
2791   micro_mul(&r[2], &r[0], &r[1]);
2792
2793   fetch_source(mach, &r[3], &inst->Src[0], CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2794   fetch_source(mach, &r[4], &inst->Src[1], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2795
2796   micro_mul(&r[5], &r[3], &r[4] );
2797   micro_sub(&d[CHAN_X], &r[2], &r[5]);
2798
2799   fetch_source(mach, &r[2], &inst->Src[1], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2800
2801   micro_mul(&r[3], &r[3], &r[2]);
2802
2803   fetch_source(mach, &r[5], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2804
2805   micro_mul(&r[1], &r[1], &r[5]);
2806   micro_sub(&d[CHAN_Y], &r[3], &r[1]);
2807
2808   micro_mul(&r[5], &r[5], &r[4]);
2809   micro_mul(&r[0], &r[0], &r[2]);
2810   micro_sub(&d[CHAN_Z], &r[5], &r[0]);
2811
2812   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
2813      store_dest(mach, &d[CHAN_X], &inst->Dst[0], inst, CHAN_X, TGSI_EXEC_DATA_FLOAT);
2814   }
2815   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
2816      store_dest(mach, &d[CHAN_Y], &inst->Dst[0], inst, CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2817   }
2818   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
2819      store_dest(mach, &d[CHAN_Z], &inst->Dst[0], inst, CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2820   }
2821   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
2822      store_dest(mach, &OneVec, &inst->Dst[0], inst, CHAN_W, TGSI_EXEC_DATA_FLOAT);
2823   }
2824}
2825
2826static void
2827exec_dst(struct tgsi_exec_machine *mach,
2828         const struct tgsi_full_instruction *inst)
2829{
2830   union tgsi_exec_channel r[2];
2831   union tgsi_exec_channel d[4];
2832
2833   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
2834      fetch_source(mach, &r[0], &inst->Src[0], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2835      fetch_source(mach, &r[1], &inst->Src[1], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2836      micro_mul(&d[CHAN_Y], &r[0], &r[1]);
2837   }
2838   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
2839      fetch_source(mach, &d[CHAN_Z], &inst->Src[0], CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2840   }
2841   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
2842      fetch_source(mach, &d[CHAN_W], &inst->Src[1], CHAN_W, TGSI_EXEC_DATA_FLOAT);
2843   }
2844
2845   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
2846      store_dest(mach, &OneVec, &inst->Dst[0], inst, CHAN_X, TGSI_EXEC_DATA_FLOAT);
2847   }
2848   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
2849      store_dest(mach, &d[CHAN_Y], &inst->Dst[0], inst, CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2850   }
2851   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
2852      store_dest(mach, &d[CHAN_Z], &inst->Dst[0], inst, CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2853   }
2854   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
2855      store_dest(mach, &d[CHAN_W], &inst->Dst[0], inst, CHAN_W, TGSI_EXEC_DATA_FLOAT);
2856   }
2857}
2858
2859static void
2860exec_log(struct tgsi_exec_machine *mach,
2861         const struct tgsi_full_instruction *inst)
2862{
2863   union tgsi_exec_channel r[3];
2864
2865   fetch_source(mach, &r[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2866   micro_abs(&r[2], &r[0]);  /* r2 = abs(r0) */
2867   micro_lg2(&r[1], &r[2]);  /* r1 = lg2(r2) */
2868   micro_flr(&r[0], &r[1]);  /* r0 = floor(r1) */
2869   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
2870      store_dest(mach, &r[0], &inst->Dst[0], inst, CHAN_X, TGSI_EXEC_DATA_FLOAT);
2871   }
2872   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
2873      micro_exp2(&r[0], &r[0]);       /* r0 = 2 ^ r0 */
2874      micro_div(&r[0], &r[2], &r[0]); /* r0 = r2 / r0 */
2875      store_dest(mach, &r[0], &inst->Dst[0], inst, CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2876   }
2877   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
2878      store_dest(mach, &r[1], &inst->Dst[0], inst, CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2879   }
2880   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
2881      store_dest(mach, &OneVec, &inst->Dst[0], inst, CHAN_W, TGSI_EXEC_DATA_FLOAT);
2882   }
2883}
2884
2885static void
2886exec_exp(struct tgsi_exec_machine *mach,
2887         const struct tgsi_full_instruction *inst)
2888{
2889   union tgsi_exec_channel r[3];
2890
2891   fetch_source(mach, &r[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2892   micro_flr(&r[1], &r[0]);  /* r1 = floor(r0) */
2893   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
2894      micro_exp2(&r[2], &r[1]);       /* r2 = 2 ^ r1 */
2895      store_dest(mach, &r[2], &inst->Dst[0], inst, CHAN_X, TGSI_EXEC_DATA_FLOAT);
2896   }
2897   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
2898      micro_sub(&r[2], &r[0], &r[1]); /* r2 = r0 - r1 */
2899      store_dest(mach, &r[2], &inst->Dst[0], inst, CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2900   }
2901   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
2902      micro_exp2(&r[2], &r[0]);       /* r2 = 2 ^ r0 */
2903      store_dest(mach, &r[2], &inst->Dst[0], inst, CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2904   }
2905   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
2906      store_dest(mach, &OneVec, &inst->Dst[0], inst, CHAN_W, TGSI_EXEC_DATA_FLOAT);
2907   }
2908}
2909
2910static void
2911exec_lit(struct tgsi_exec_machine *mach,
2912         const struct tgsi_full_instruction *inst)
2913{
2914   union tgsi_exec_channel r[3];
2915   union tgsi_exec_channel d[3];
2916
2917   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
2918      store_dest(mach, &OneVec, &inst->Dst[0], inst, CHAN_X, TGSI_EXEC_DATA_FLOAT);
2919   }
2920   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_YZ) {
2921      fetch_source(mach, &r[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2922      if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
2923         micro_max(&d[CHAN_Y], &r[0], &ZeroVec);
2924         store_dest(mach, &d[CHAN_Y], &inst->Dst[0], inst, CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2925      }
2926
2927      if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
2928         fetch_source(mach, &r[1], &inst->Src[0], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2929         micro_max(&r[1], &r[1], &ZeroVec);
2930
2931         fetch_source(mach, &r[2], &inst->Src[0], CHAN_W, TGSI_EXEC_DATA_FLOAT);
2932         micro_min(&r[2], &r[2], &P128Vec);
2933         micro_max(&r[2], &r[2], &M128Vec);
2934         micro_pow(&r[1], &r[1], &r[2]);
2935         micro_lt(&d[CHAN_Z], &ZeroVec, &r[0], &r[1], &ZeroVec);
2936         store_dest(mach, &d[CHAN_Z], &inst->Dst[0], inst, CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2937      }
2938   }
2939   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
2940      store_dest(mach, &OneVec, &inst->Dst[0], inst, CHAN_W, TGSI_EXEC_DATA_FLOAT);
2941   }
2942}
2943
2944static void
2945exec_break(struct tgsi_exec_machine *mach)
2946{
2947   if (mach->BreakType == TGSI_EXEC_BREAK_INSIDE_LOOP) {
2948      /* turn off loop channels for each enabled exec channel */
2949      mach->LoopMask &= ~mach->ExecMask;
2950      /* Todo: if mach->LoopMask == 0, jump to end of loop */
2951      UPDATE_EXEC_MASK(mach);
2952   } else {
2953      assert(mach->BreakType == TGSI_EXEC_BREAK_INSIDE_SWITCH);
2954
2955      mach->Switch.mask = 0x0;
2956
2957      UPDATE_EXEC_MASK(mach);
2958   }
2959}
2960
2961static void
2962exec_switch(struct tgsi_exec_machine *mach,
2963            const struct tgsi_full_instruction *inst)
2964{
2965   assert(mach->SwitchStackTop < TGSI_EXEC_MAX_SWITCH_NESTING);
2966   assert(mach->BreakStackTop < TGSI_EXEC_MAX_BREAK_STACK);
2967
2968   mach->SwitchStack[mach->SwitchStackTop++] = mach->Switch;
2969   fetch_source(mach, &mach->Switch.selector, &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_UINT);
2970   mach->Switch.mask = 0x0;
2971   mach->Switch.defaultMask = 0x0;
2972
2973   mach->BreakStack[mach->BreakStackTop++] = mach->BreakType;
2974   mach->BreakType = TGSI_EXEC_BREAK_INSIDE_SWITCH;
2975
2976   UPDATE_EXEC_MASK(mach);
2977}
2978
2979static void
2980exec_case(struct tgsi_exec_machine *mach,
2981          const struct tgsi_full_instruction *inst)
2982{
2983   uint prevMask = mach->SwitchStack[mach->SwitchStackTop - 1].mask;
2984   union tgsi_exec_channel src;
2985   uint mask = 0;
2986
2987   fetch_source(mach, &src, &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_UINT);
2988
2989   if (mach->Switch.selector.u[0] == src.u[0]) {
2990      mask |= 0x1;
2991   }
2992   if (mach->Switch.selector.u[1] == src.u[1]) {
2993      mask |= 0x2;
2994   }
2995   if (mach->Switch.selector.u[2] == src.u[2]) {
2996      mask |= 0x4;
2997   }
2998   if (mach->Switch.selector.u[3] == src.u[3]) {
2999      mask |= 0x8;
3000   }
3001
3002   mach->Switch.defaultMask |= mask;
3003
3004   mach->Switch.mask |= mask & prevMask;
3005
3006   UPDATE_EXEC_MASK(mach);
3007}
3008
3009static void
3010exec_default(struct tgsi_exec_machine *mach)
3011{
3012   uint prevMask = mach->SwitchStack[mach->SwitchStackTop - 1].mask;
3013
3014   mach->Switch.mask |= ~mach->Switch.defaultMask & prevMask;
3015
3016   UPDATE_EXEC_MASK(mach);
3017}
3018
3019static void
3020exec_endswitch(struct tgsi_exec_machine *mach)
3021{
3022   mach->Switch = mach->SwitchStack[--mach->SwitchStackTop];
3023   mach->BreakType = mach->BreakStack[--mach->BreakStackTop];
3024
3025   UPDATE_EXEC_MASK(mach);
3026}
3027
3028static void
3029micro_i2f(union tgsi_exec_channel *dst,
3030          const union tgsi_exec_channel *src)
3031{
3032   dst->f[0] = (float)src->i[0];
3033   dst->f[1] = (float)src->i[1];
3034   dst->f[2] = (float)src->i[2];
3035   dst->f[3] = (float)src->i[3];
3036}
3037
3038static void
3039micro_not(union tgsi_exec_channel *dst,
3040          const union tgsi_exec_channel *src)
3041{
3042   dst->u[0] = ~src->u[0];
3043   dst->u[1] = ~src->u[1];
3044   dst->u[2] = ~src->u[2];
3045   dst->u[3] = ~src->u[3];
3046}
3047
3048static void
3049micro_shl(union tgsi_exec_channel *dst,
3050          const union tgsi_exec_channel *src0,
3051          const union tgsi_exec_channel *src1)
3052{
3053   dst->u[0] = src0->u[0] << src1->u[0];
3054   dst->u[1] = src0->u[1] << src1->u[1];
3055   dst->u[2] = src0->u[2] << src1->u[2];
3056   dst->u[3] = src0->u[3] << src1->u[3];
3057}
3058
3059static void
3060micro_and(union tgsi_exec_channel *dst,
3061          const union tgsi_exec_channel *src0,
3062          const union tgsi_exec_channel *src1)
3063{
3064   dst->u[0] = src0->u[0] & src1->u[0];
3065   dst->u[1] = src0->u[1] & src1->u[1];
3066   dst->u[2] = src0->u[2] & src1->u[2];
3067   dst->u[3] = src0->u[3] & src1->u[3];
3068}
3069
3070static void
3071micro_or(union tgsi_exec_channel *dst,
3072         const union tgsi_exec_channel *src0,
3073         const union tgsi_exec_channel *src1)
3074{
3075   dst->u[0] = src0->u[0] | src1->u[0];
3076   dst->u[1] = src0->u[1] | src1->u[1];
3077   dst->u[2] = src0->u[2] | src1->u[2];
3078   dst->u[3] = src0->u[3] | src1->u[3];
3079}
3080
3081static void
3082micro_xor(union tgsi_exec_channel *dst,
3083          const union tgsi_exec_channel *src0,
3084          const union tgsi_exec_channel *src1)
3085{
3086   dst->u[0] = src0->u[0] ^ src1->u[0];
3087   dst->u[1] = src0->u[1] ^ src1->u[1];
3088   dst->u[2] = src0->u[2] ^ src1->u[2];
3089   dst->u[3] = src0->u[3] ^ src1->u[3];
3090}
3091
3092static void
3093micro_mod(union tgsi_exec_channel *dst,
3094          const union tgsi_exec_channel *src0,
3095          const union tgsi_exec_channel *src1)
3096{
3097   dst->i[0] = src0->i[0] % src1->i[0];
3098   dst->i[1] = src0->i[1] % src1->i[1];
3099   dst->i[2] = src0->i[2] % src1->i[2];
3100   dst->i[3] = src0->i[3] % src1->i[3];
3101}
3102
3103static void
3104micro_f2i(union tgsi_exec_channel *dst,
3105          const union tgsi_exec_channel *src)
3106{
3107   dst->i[0] = (int)src->f[0];
3108   dst->i[1] = (int)src->f[1];
3109   dst->i[2] = (int)src->f[2];
3110   dst->i[3] = (int)src->f[3];
3111}
3112
3113static void
3114micro_idiv(union tgsi_exec_channel *dst,
3115           const union tgsi_exec_channel *src0,
3116           const union tgsi_exec_channel *src1)
3117{
3118   dst->i[0] = src0->i[0] / src1->i[0];
3119   dst->i[1] = src0->i[1] / src1->i[1];
3120   dst->i[2] = src0->i[2] / src1->i[2];
3121   dst->i[3] = src0->i[3] / src1->i[3];
3122}
3123
3124static void
3125micro_imax(union tgsi_exec_channel *dst,
3126           const union tgsi_exec_channel *src0,
3127           const union tgsi_exec_channel *src1)
3128{
3129   dst->i[0] = src0->i[0] > src1->i[0] ? src0->i[0] : src1->i[0];
3130   dst->i[1] = src0->i[1] > src1->i[1] ? src0->i[1] : src1->i[1];
3131   dst->i[2] = src0->i[2] > src1->i[2] ? src0->i[2] : src1->i[2];
3132   dst->i[3] = src0->i[3] > src1->i[3] ? src0->i[3] : src1->i[3];
3133}
3134
3135static void
3136micro_imin(union tgsi_exec_channel *dst,
3137           const union tgsi_exec_channel *src0,
3138           const union tgsi_exec_channel *src1)
3139{
3140   dst->i[0] = src0->i[0] < src1->i[0] ? src0->i[0] : src1->i[0];
3141   dst->i[1] = src0->i[1] < src1->i[1] ? src0->i[1] : src1->i[1];
3142   dst->i[2] = src0->i[2] < src1->i[2] ? src0->i[2] : src1->i[2];
3143   dst->i[3] = src0->i[3] < src1->i[3] ? src0->i[3] : src1->i[3];
3144}
3145
3146static void
3147micro_isge(union tgsi_exec_channel *dst,
3148           const union tgsi_exec_channel *src0,
3149           const union tgsi_exec_channel *src1)
3150{
3151   dst->i[0] = src0->i[0] >= src1->i[0] ? -1 : 0;
3152   dst->i[1] = src0->i[1] >= src1->i[1] ? -1 : 0;
3153   dst->i[2] = src0->i[2] >= src1->i[2] ? -1 : 0;
3154   dst->i[3] = src0->i[3] >= src1->i[3] ? -1 : 0;
3155}
3156
3157static void
3158micro_ishr(union tgsi_exec_channel *dst,
3159           const union tgsi_exec_channel *src0,
3160           const union tgsi_exec_channel *src1)
3161{
3162   dst->i[0] = src0->i[0] >> src1->i[0];
3163   dst->i[1] = src0->i[1] >> src1->i[1];
3164   dst->i[2] = src0->i[2] >> src1->i[2];
3165   dst->i[3] = src0->i[3] >> src1->i[3];
3166}
3167
3168static void
3169micro_islt(union tgsi_exec_channel *dst,
3170           const union tgsi_exec_channel *src0,
3171           const union tgsi_exec_channel *src1)
3172{
3173   dst->i[0] = src0->i[0] < src1->i[0] ? -1 : 0;
3174   dst->i[1] = src0->i[1] < src1->i[1] ? -1 : 0;
3175   dst->i[2] = src0->i[2] < src1->i[2] ? -1 : 0;
3176   dst->i[3] = src0->i[3] < src1->i[3] ? -1 : 0;
3177}
3178
3179static void
3180micro_f2u(union tgsi_exec_channel *dst,
3181          const union tgsi_exec_channel *src)
3182{
3183   dst->u[0] = (uint)src->f[0];
3184   dst->u[1] = (uint)src->f[1];
3185   dst->u[2] = (uint)src->f[2];
3186   dst->u[3] = (uint)src->f[3];
3187}
3188
3189static void
3190micro_u2f(union tgsi_exec_channel *dst,
3191          const union tgsi_exec_channel *src)
3192{
3193   dst->f[0] = (float)src->u[0];
3194   dst->f[1] = (float)src->u[1];
3195   dst->f[2] = (float)src->u[2];
3196   dst->f[3] = (float)src->u[3];
3197}
3198
3199static void
3200micro_uadd(union tgsi_exec_channel *dst,
3201           const union tgsi_exec_channel *src0,
3202           const union tgsi_exec_channel *src1)
3203{
3204   dst->u[0] = src0->u[0] + src1->u[0];
3205   dst->u[1] = src0->u[1] + src1->u[1];
3206   dst->u[2] = src0->u[2] + src1->u[2];
3207   dst->u[3] = src0->u[3] + src1->u[3];
3208}
3209
3210static void
3211micro_udiv(union tgsi_exec_channel *dst,
3212           const union tgsi_exec_channel *src0,
3213           const union tgsi_exec_channel *src1)
3214{
3215   dst->u[0] = src0->u[0] / src1->u[0];
3216   dst->u[1] = src0->u[1] / src1->u[1];
3217   dst->u[2] = src0->u[2] / src1->u[2];
3218   dst->u[3] = src0->u[3] / src1->u[3];
3219}
3220
3221static void
3222micro_umad(union tgsi_exec_channel *dst,
3223           const union tgsi_exec_channel *src0,
3224           const union tgsi_exec_channel *src1,
3225           const union tgsi_exec_channel *src2)
3226{
3227   dst->u[0] = src0->u[0] * src1->u[0] + src2->u[0];
3228   dst->u[1] = src0->u[1] * src1->u[1] + src2->u[1];
3229   dst->u[2] = src0->u[2] * src1->u[2] + src2->u[2];
3230   dst->u[3] = src0->u[3] * src1->u[3] + src2->u[3];
3231}
3232
3233static void
3234micro_umax(union tgsi_exec_channel *dst,
3235           const union tgsi_exec_channel *src0,
3236           const union tgsi_exec_channel *src1)
3237{
3238   dst->u[0] = src0->u[0] > src1->u[0] ? src0->u[0] : src1->u[0];
3239   dst->u[1] = src0->u[1] > src1->u[1] ? src0->u[1] : src1->u[1];
3240   dst->u[2] = src0->u[2] > src1->u[2] ? src0->u[2] : src1->u[2];
3241   dst->u[3] = src0->u[3] > src1->u[3] ? src0->u[3] : src1->u[3];
3242}
3243
3244static void
3245micro_umin(union tgsi_exec_channel *dst,
3246           const union tgsi_exec_channel *src0,
3247           const union tgsi_exec_channel *src1)
3248{
3249   dst->u[0] = src0->u[0] < src1->u[0] ? src0->u[0] : src1->u[0];
3250   dst->u[1] = src0->u[1] < src1->u[1] ? src0->u[1] : src1->u[1];
3251   dst->u[2] = src0->u[2] < src1->u[2] ? src0->u[2] : src1->u[2];
3252   dst->u[3] = src0->u[3] < src1->u[3] ? src0->u[3] : src1->u[3];
3253}
3254
3255static void
3256micro_umod(union tgsi_exec_channel *dst,
3257           const union tgsi_exec_channel *src0,
3258           const union tgsi_exec_channel *src1)
3259{
3260   dst->u[0] = src0->u[0] % src1->u[0];
3261   dst->u[1] = src0->u[1] % src1->u[1];
3262   dst->u[2] = src0->u[2] % src1->u[2];
3263   dst->u[3] = src0->u[3] % src1->u[3];
3264}
3265
3266static void
3267micro_umul(union tgsi_exec_channel *dst,
3268           const union tgsi_exec_channel *src0,
3269           const union tgsi_exec_channel *src1)
3270{
3271   dst->u[0] = src0->u[0] * src1->u[0];
3272   dst->u[1] = src0->u[1] * src1->u[1];
3273   dst->u[2] = src0->u[2] * src1->u[2];
3274   dst->u[3] = src0->u[3] * src1->u[3];
3275}
3276
3277static void
3278micro_useq(union tgsi_exec_channel *dst,
3279           const union tgsi_exec_channel *src0,
3280           const union tgsi_exec_channel *src1)
3281{
3282   dst->u[0] = src0->u[0] == src1->u[0] ? ~0 : 0;
3283   dst->u[1] = src0->u[1] == src1->u[1] ? ~0 : 0;
3284   dst->u[2] = src0->u[2] == src1->u[2] ? ~0 : 0;
3285   dst->u[3] = src0->u[3] == src1->u[3] ? ~0 : 0;
3286}
3287
3288static void
3289micro_usge(union tgsi_exec_channel *dst,
3290           const union tgsi_exec_channel *src0,
3291           const union tgsi_exec_channel *src1)
3292{
3293   dst->u[0] = src0->u[0] >= src1->u[0] ? ~0 : 0;
3294   dst->u[1] = src0->u[1] >= src1->u[1] ? ~0 : 0;
3295   dst->u[2] = src0->u[2] >= src1->u[2] ? ~0 : 0;
3296   dst->u[3] = src0->u[3] >= src1->u[3] ? ~0 : 0;
3297}
3298
3299static void
3300micro_ushr(union tgsi_exec_channel *dst,
3301           const union tgsi_exec_channel *src0,
3302           const union tgsi_exec_channel *src1)
3303{
3304   dst->u[0] = src0->u[0] >> src1->u[0];
3305   dst->u[1] = src0->u[1] >> src1->u[1];
3306   dst->u[2] = src0->u[2] >> src1->u[2];
3307   dst->u[3] = src0->u[3] >> src1->u[3];
3308}
3309
3310static void
3311micro_uslt(union tgsi_exec_channel *dst,
3312           const union tgsi_exec_channel *src0,
3313           const union tgsi_exec_channel *src1)
3314{
3315   dst->u[0] = src0->u[0] < src1->u[0] ? ~0 : 0;
3316   dst->u[1] = src0->u[1] < src1->u[1] ? ~0 : 0;
3317   dst->u[2] = src0->u[2] < src1->u[2] ? ~0 : 0;
3318   dst->u[3] = src0->u[3] < src1->u[3] ? ~0 : 0;
3319}
3320
3321static void
3322micro_usne(union tgsi_exec_channel *dst,
3323           const union tgsi_exec_channel *src0,
3324           const union tgsi_exec_channel *src1)
3325{
3326   dst->u[0] = src0->u[0] != src1->u[0] ? ~0 : 0;
3327   dst->u[1] = src0->u[1] != src1->u[1] ? ~0 : 0;
3328   dst->u[2] = src0->u[2] != src1->u[2] ? ~0 : 0;
3329   dst->u[3] = src0->u[3] != src1->u[3] ? ~0 : 0;
3330}
3331
3332static void
3333exec_instruction(
3334   struct tgsi_exec_machine *mach,
3335   const struct tgsi_full_instruction *inst,
3336   int *pc )
3337{
3338   union tgsi_exec_channel r[10];
3339
3340   (*pc)++;
3341
3342   switch (inst->Instruction.Opcode) {
3343   case TGSI_OPCODE_ARL:
3344      exec_vector_unary(mach, inst, micro_arl, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_FLOAT);
3345      break;
3346
3347   case TGSI_OPCODE_MOV:
3348      exec_vector_unary(mach, inst, micro_mov, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_FLOAT);
3349      break;
3350
3351   case TGSI_OPCODE_LIT:
3352      exec_lit(mach, inst);
3353      break;
3354
3355   case TGSI_OPCODE_RCP:
3356      exec_scalar_unary(mach, inst, micro_rcp, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3357      break;
3358
3359   case TGSI_OPCODE_RSQ:
3360      exec_scalar_unary(mach, inst, micro_rsq, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3361      break;
3362
3363   case TGSI_OPCODE_EXP:
3364      exec_exp(mach, inst);
3365      break;
3366
3367   case TGSI_OPCODE_LOG:
3368      exec_log(mach, inst);
3369      break;
3370
3371   case TGSI_OPCODE_MUL:
3372      exec_vector_binary(mach, inst, micro_mul, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3373      break;
3374
3375   case TGSI_OPCODE_ADD:
3376      exec_vector_binary(mach, inst, micro_add, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3377      break;
3378
3379   case TGSI_OPCODE_DP3:
3380      exec_dp3(mach, inst);
3381      break;
3382
3383   case TGSI_OPCODE_DP4:
3384      exec_dp4(mach, inst);
3385      break;
3386
3387   case TGSI_OPCODE_DST:
3388      exec_dst(mach, inst);
3389      break;
3390
3391   case TGSI_OPCODE_MIN:
3392      exec_vector_binary(mach, inst, micro_min, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3393      break;
3394
3395   case TGSI_OPCODE_MAX:
3396      exec_vector_binary(mach, inst, micro_max, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3397      break;
3398
3399   case TGSI_OPCODE_SLT:
3400      exec_vector_binary(mach, inst, micro_slt, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3401      break;
3402
3403   case TGSI_OPCODE_SGE:
3404      exec_vector_binary(mach, inst, micro_sge, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3405      break;
3406
3407   case TGSI_OPCODE_MAD:
3408      exec_vector_trinary(mach, inst, micro_mad, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3409      break;
3410
3411   case TGSI_OPCODE_SUB:
3412      exec_vector_binary(mach, inst, micro_sub, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3413      break;
3414
3415   case TGSI_OPCODE_LRP:
3416      exec_vector_trinary(mach, inst, micro_lrp, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3417      break;
3418
3419   case TGSI_OPCODE_CND:
3420      exec_vector_trinary(mach, inst, micro_cnd, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3421      break;
3422
3423   case TGSI_OPCODE_DP2A:
3424      exec_dp2a(mach, inst);
3425      break;
3426
3427   case TGSI_OPCODE_FRC:
3428      exec_vector_unary(mach, inst, micro_frc, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3429      break;
3430
3431   case TGSI_OPCODE_CLAMP:
3432      exec_vector_trinary(mach, inst, micro_clamp, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3433      break;
3434
3435   case TGSI_OPCODE_FLR:
3436      exec_vector_unary(mach, inst, micro_flr, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3437      break;
3438
3439   case TGSI_OPCODE_ROUND:
3440      exec_vector_unary(mach, inst, micro_rnd, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3441      break;
3442
3443   case TGSI_OPCODE_EX2:
3444      exec_scalar_unary(mach, inst, micro_exp2, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3445      break;
3446
3447   case TGSI_OPCODE_LG2:
3448      exec_scalar_unary(mach, inst, micro_lg2, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3449      break;
3450
3451   case TGSI_OPCODE_POW:
3452      exec_scalar_binary(mach, inst, micro_pow, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3453      break;
3454
3455   case TGSI_OPCODE_XPD:
3456      exec_xpd(mach, inst);
3457      break;
3458
3459   case TGSI_OPCODE_ABS:
3460      exec_vector_unary(mach, inst, micro_abs, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3461      break;
3462
3463   case TGSI_OPCODE_RCC:
3464      exec_scalar_unary(mach, inst, micro_rcc, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3465      break;
3466
3467   case TGSI_OPCODE_DPH:
3468      exec_dph(mach, inst);
3469      break;
3470
3471   case TGSI_OPCODE_COS:
3472      exec_scalar_unary(mach, inst, micro_cos, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3473      break;
3474
3475   case TGSI_OPCODE_DDX:
3476      exec_vector_unary(mach, inst, micro_ddx, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3477      break;
3478
3479   case TGSI_OPCODE_DDY:
3480      exec_vector_unary(mach, inst, micro_ddy, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3481      break;
3482
3483   case TGSI_OPCODE_KILP:
3484      exec_kilp (mach, inst);
3485      break;
3486
3487   case TGSI_OPCODE_KIL:
3488      exec_kil (mach, inst);
3489      break;
3490
3491   case TGSI_OPCODE_PK2H:
3492      assert (0);
3493      break;
3494
3495   case TGSI_OPCODE_PK2US:
3496      assert (0);
3497      break;
3498
3499   case TGSI_OPCODE_PK4B:
3500      assert (0);
3501      break;
3502
3503   case TGSI_OPCODE_PK4UB:
3504      assert (0);
3505      break;
3506
3507   case TGSI_OPCODE_RFL:
3508      exec_rfl(mach, inst);
3509      break;
3510
3511   case TGSI_OPCODE_SEQ:
3512      exec_vector_binary(mach, inst, micro_seq, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3513      break;
3514
3515   case TGSI_OPCODE_SFL:
3516      exec_vector(mach, inst, micro_sfl, TGSI_EXEC_DATA_FLOAT);
3517      break;
3518
3519   case TGSI_OPCODE_SGT:
3520      exec_vector_binary(mach, inst, micro_sgt, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3521      break;
3522
3523   case TGSI_OPCODE_SIN:
3524      exec_scalar_unary(mach, inst, micro_sin, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3525      break;
3526
3527   case TGSI_OPCODE_SLE:
3528      exec_vector_binary(mach, inst, micro_sle, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3529      break;
3530
3531   case TGSI_OPCODE_SNE:
3532      exec_vector_binary(mach, inst, micro_sne, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3533      break;
3534
3535   case TGSI_OPCODE_STR:
3536      exec_vector(mach, inst, micro_str, TGSI_EXEC_DATA_FLOAT);
3537      break;
3538
3539   case TGSI_OPCODE_TEX:
3540      /* simple texture lookup */
3541      /* src[0] = texcoord */
3542      /* src[1] = sampler unit */
3543      exec_tex(mach, inst, TEX_MODIFIER_NONE);
3544      break;
3545
3546   case TGSI_OPCODE_TXB:
3547      /* Texture lookup with lod bias */
3548      /* src[0] = texcoord (src[0].w = LOD bias) */
3549      /* src[1] = sampler unit */
3550      exec_tex(mach, inst, TEX_MODIFIER_LOD_BIAS);
3551      break;
3552
3553   case TGSI_OPCODE_TXD:
3554      /* Texture lookup with explict partial derivatives */
3555      /* src[0] = texcoord */
3556      /* src[1] = d[strq]/dx */
3557      /* src[2] = d[strq]/dy */
3558      /* src[3] = sampler unit */
3559      exec_txd(mach, inst);
3560      break;
3561
3562   case TGSI_OPCODE_TXL:
3563      /* Texture lookup with explit LOD */
3564      /* src[0] = texcoord (src[0].w = LOD) */
3565      /* src[1] = sampler unit */
3566      exec_tex(mach, inst, TEX_MODIFIER_EXPLICIT_LOD);
3567      break;
3568
3569   case TGSI_OPCODE_TXP:
3570      /* Texture lookup with projection */
3571      /* src[0] = texcoord (src[0].w = projection) */
3572      /* src[1] = sampler unit */
3573      exec_tex(mach, inst, TEX_MODIFIER_PROJECTED);
3574      break;
3575
3576   case TGSI_OPCODE_UP2H:
3577      assert (0);
3578      break;
3579
3580   case TGSI_OPCODE_UP2US:
3581      assert (0);
3582      break;
3583
3584   case TGSI_OPCODE_UP4B:
3585      assert (0);
3586      break;
3587
3588   case TGSI_OPCODE_UP4UB:
3589      assert (0);
3590      break;
3591
3592   case TGSI_OPCODE_X2D:
3593      exec_x2d(mach, inst);
3594      break;
3595
3596   case TGSI_OPCODE_ARA:
3597      assert (0);
3598      break;
3599
3600   case TGSI_OPCODE_ARR:
3601      exec_vector_unary(mach, inst, micro_arr, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_FLOAT);
3602      break;
3603
3604   case TGSI_OPCODE_BRA:
3605      assert (0);
3606      break;
3607
3608   case TGSI_OPCODE_CAL:
3609      /* skip the call if no execution channels are enabled */
3610      if (mach->ExecMask) {
3611         /* do the call */
3612
3613         /* First, record the depths of the execution stacks.
3614          * This is important for deeply nested/looped return statements.
3615          * We have to unwind the stacks by the correct amount.  For a
3616          * real code generator, we could determine the number of entries
3617          * to pop off each stack with simple static analysis and avoid
3618          * implementing this data structure at run time.
3619          */
3620         mach->CallStack[mach->CallStackTop].CondStackTop = mach->CondStackTop;
3621         mach->CallStack[mach->CallStackTop].LoopStackTop = mach->LoopStackTop;
3622         mach->CallStack[mach->CallStackTop].ContStackTop = mach->ContStackTop;
3623         mach->CallStack[mach->CallStackTop].SwitchStackTop = mach->SwitchStackTop;
3624         mach->CallStack[mach->CallStackTop].BreakStackTop = mach->BreakStackTop;
3625         /* note that PC was already incremented above */
3626         mach->CallStack[mach->CallStackTop].ReturnAddr = *pc;
3627
3628         mach->CallStackTop++;
3629
3630         /* Second, push the Cond, Loop, Cont, Func stacks */
3631         assert(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING);
3632         assert(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
3633         assert(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
3634         assert(mach->SwitchStackTop < TGSI_EXEC_MAX_SWITCH_NESTING);
3635         assert(mach->BreakStackTop < TGSI_EXEC_MAX_BREAK_STACK);
3636         assert(mach->FuncStackTop < TGSI_EXEC_MAX_CALL_NESTING);
3637
3638         mach->CondStack[mach->CondStackTop++] = mach->CondMask;
3639         mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask;
3640         mach->ContStack[mach->ContStackTop++] = mach->ContMask;
3641         mach->SwitchStack[mach->SwitchStackTop++] = mach->Switch;
3642         mach->BreakStack[mach->BreakStackTop++] = mach->BreakType;
3643         mach->FuncStack[mach->FuncStackTop++] = mach->FuncMask;
3644
3645         /* Finally, jump to the subroutine */
3646         *pc = inst->Label.Label;
3647      }
3648      break;
3649
3650   case TGSI_OPCODE_RET:
3651      mach->FuncMask &= ~mach->ExecMask;
3652      UPDATE_EXEC_MASK(mach);
3653
3654      if (mach->FuncMask == 0x0) {
3655         /* really return now (otherwise, keep executing */
3656
3657         if (mach->CallStackTop == 0) {
3658            /* returning from main() */
3659            mach->CondStackTop = 0;
3660            mach->LoopStackTop = 0;
3661            *pc = -1;
3662            return;
3663         }
3664
3665         assert(mach->CallStackTop > 0);
3666         mach->CallStackTop--;
3667
3668         mach->CondStackTop = mach->CallStack[mach->CallStackTop].CondStackTop;
3669         mach->CondMask = mach->CondStack[mach->CondStackTop];
3670
3671         mach->LoopStackTop = mach->CallStack[mach->CallStackTop].LoopStackTop;
3672         mach->LoopMask = mach->LoopStack[mach->LoopStackTop];
3673
3674         mach->ContStackTop = mach->CallStack[mach->CallStackTop].ContStackTop;
3675         mach->ContMask = mach->ContStack[mach->ContStackTop];
3676
3677         mach->SwitchStackTop = mach->CallStack[mach->CallStackTop].SwitchStackTop;
3678         mach->Switch = mach->SwitchStack[mach->SwitchStackTop];
3679
3680         mach->BreakStackTop = mach->CallStack[mach->CallStackTop].BreakStackTop;
3681         mach->BreakType = mach->BreakStack[mach->BreakStackTop];
3682
3683         assert(mach->FuncStackTop > 0);
3684         mach->FuncMask = mach->FuncStack[--mach->FuncStackTop];
3685
3686         *pc = mach->CallStack[mach->CallStackTop].ReturnAddr;
3687
3688         UPDATE_EXEC_MASK(mach);
3689      }
3690      break;
3691
3692   case TGSI_OPCODE_SSG:
3693      exec_vector_unary(mach, inst, micro_sgn, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3694      break;
3695
3696   case TGSI_OPCODE_CMP:
3697      exec_vector_trinary(mach, inst, micro_cmp, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3698      break;
3699
3700   case TGSI_OPCODE_SCS:
3701      exec_scs(mach, inst);
3702      break;
3703
3704   case TGSI_OPCODE_NRM:
3705      exec_nrm3(mach, inst);
3706      break;
3707
3708   case TGSI_OPCODE_NRM4:
3709      exec_nrm4(mach, inst);
3710      break;
3711
3712   case TGSI_OPCODE_DIV:
3713      exec_vector_binary(mach, inst, micro_div, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3714      break;
3715
3716   case TGSI_OPCODE_DP2:
3717      exec_dp2(mach, inst);
3718      break;
3719
3720   case TGSI_OPCODE_IF:
3721      /* push CondMask */
3722      assert(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING);
3723      mach->CondStack[mach->CondStackTop++] = mach->CondMask;
3724      FETCH( &r[0], 0, CHAN_X );
3725      /* update CondMask */
3726      if( ! r[0].u[0] ) {
3727         mach->CondMask &= ~0x1;
3728      }
3729      if( ! r[0].u[1] ) {
3730         mach->CondMask &= ~0x2;
3731      }
3732      if( ! r[0].u[2] ) {
3733         mach->CondMask &= ~0x4;
3734      }
3735      if( ! r[0].u[3] ) {
3736         mach->CondMask &= ~0x8;
3737      }
3738      UPDATE_EXEC_MASK(mach);
3739      /* Todo: If CondMask==0, jump to ELSE */
3740      break;
3741
3742   case TGSI_OPCODE_ELSE:
3743      /* invert CondMask wrt previous mask */
3744      {
3745         uint prevMask;
3746         assert(mach->CondStackTop > 0);
3747         prevMask = mach->CondStack[mach->CondStackTop - 1];
3748         mach->CondMask = ~mach->CondMask & prevMask;
3749         UPDATE_EXEC_MASK(mach);
3750         /* Todo: If CondMask==0, jump to ENDIF */
3751      }
3752      break;
3753
3754   case TGSI_OPCODE_ENDIF:
3755      /* pop CondMask */
3756      assert(mach->CondStackTop > 0);
3757      mach->CondMask = mach->CondStack[--mach->CondStackTop];
3758      UPDATE_EXEC_MASK(mach);
3759      break;
3760
3761   case TGSI_OPCODE_END:
3762      /* make sure we end primitives which haven't
3763       * been explicitly emitted */
3764      conditional_emit_primitive(mach);
3765      /* halt execution */
3766      *pc = -1;
3767      break;
3768
3769   case TGSI_OPCODE_PUSHA:
3770      assert (0);
3771      break;
3772
3773   case TGSI_OPCODE_POPA:
3774      assert (0);
3775      break;
3776
3777   case TGSI_OPCODE_CEIL:
3778      exec_vector_unary(mach, inst, micro_ceil, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3779      break;
3780
3781   case TGSI_OPCODE_I2F:
3782      exec_vector_unary(mach, inst, micro_i2f, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_INT);
3783      break;
3784
3785   case TGSI_OPCODE_NOT:
3786      exec_vector_unary(mach, inst, micro_not, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3787      break;
3788
3789   case TGSI_OPCODE_TRUNC:
3790      exec_vector_unary(mach, inst, micro_trunc, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3791      break;
3792
3793   case TGSI_OPCODE_SHL:
3794      exec_vector_binary(mach, inst, micro_shl, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3795      break;
3796
3797   case TGSI_OPCODE_AND:
3798      exec_vector_binary(mach, inst, micro_and, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3799      break;
3800
3801   case TGSI_OPCODE_OR:
3802      exec_vector_binary(mach, inst, micro_or, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3803      break;
3804
3805   case TGSI_OPCODE_MOD:
3806      exec_vector_binary(mach, inst, micro_mod, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
3807      break;
3808
3809   case TGSI_OPCODE_XOR:
3810      exec_vector_binary(mach, inst, micro_xor, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3811      break;
3812
3813   case TGSI_OPCODE_SAD:
3814      assert (0);
3815      break;
3816
3817   case TGSI_OPCODE_TXF:
3818      exec_txf(mach, inst);
3819      break;
3820
3821   case TGSI_OPCODE_TXQ:
3822      exec_txq(mach, inst);
3823      break;
3824
3825   case TGSI_OPCODE_EMIT:
3826      emit_vertex(mach);
3827      break;
3828
3829   case TGSI_OPCODE_ENDPRIM:
3830      emit_primitive(mach);
3831      break;
3832
3833   case TGSI_OPCODE_BGNLOOP:
3834      /* push LoopMask and ContMasks */
3835      assert(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
3836      assert(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
3837      assert(mach->LoopLabelStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
3838      assert(mach->BreakStackTop < TGSI_EXEC_MAX_BREAK_STACK);
3839
3840      mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask;
3841      mach->ContStack[mach->ContStackTop++] = mach->ContMask;
3842      mach->LoopLabelStack[mach->LoopLabelStackTop++] = *pc - 1;
3843      mach->BreakStack[mach->BreakStackTop++] = mach->BreakType;
3844      mach->BreakType = TGSI_EXEC_BREAK_INSIDE_LOOP;
3845      break;
3846
3847   case TGSI_OPCODE_ENDLOOP:
3848      /* Restore ContMask, but don't pop */
3849      assert(mach->ContStackTop > 0);
3850      mach->ContMask = mach->ContStack[mach->ContStackTop - 1];
3851      UPDATE_EXEC_MASK(mach);
3852      if (mach->ExecMask) {
3853         /* repeat loop: jump to instruction just past BGNLOOP */
3854         assert(mach->LoopLabelStackTop > 0);
3855         *pc = mach->LoopLabelStack[mach->LoopLabelStackTop - 1] + 1;
3856      }
3857      else {
3858         /* exit loop: pop LoopMask */
3859         assert(mach->LoopStackTop > 0);
3860         mach->LoopMask = mach->LoopStack[--mach->LoopStackTop];
3861         /* pop ContMask */
3862         assert(mach->ContStackTop > 0);
3863         mach->ContMask = mach->ContStack[--mach->ContStackTop];
3864         assert(mach->LoopLabelStackTop > 0);
3865         --mach->LoopLabelStackTop;
3866
3867         mach->BreakType = mach->BreakStack[--mach->BreakStackTop];
3868      }
3869      UPDATE_EXEC_MASK(mach);
3870      break;
3871
3872   case TGSI_OPCODE_BRK:
3873      exec_break(mach);
3874      break;
3875
3876   case TGSI_OPCODE_CONT:
3877      /* turn off cont channels for each enabled exec channel */
3878      mach->ContMask &= ~mach->ExecMask;
3879      /* Todo: if mach->LoopMask == 0, jump to end of loop */
3880      UPDATE_EXEC_MASK(mach);
3881      break;
3882
3883   case TGSI_OPCODE_BGNSUB:
3884      /* no-op */
3885      break;
3886
3887   case TGSI_OPCODE_ENDSUB:
3888      /*
3889       * XXX: This really should be a no-op. We should never reach this opcode.
3890       */
3891
3892      assert(mach->CallStackTop > 0);
3893      mach->CallStackTop--;
3894
3895      mach->CondStackTop = mach->CallStack[mach->CallStackTop].CondStackTop;
3896      mach->CondMask = mach->CondStack[mach->CondStackTop];
3897
3898      mach->LoopStackTop = mach->CallStack[mach->CallStackTop].LoopStackTop;
3899      mach->LoopMask = mach->LoopStack[mach->LoopStackTop];
3900
3901      mach->ContStackTop = mach->CallStack[mach->CallStackTop].ContStackTop;
3902      mach->ContMask = mach->ContStack[mach->ContStackTop];
3903
3904      mach->SwitchStackTop = mach->CallStack[mach->CallStackTop].SwitchStackTop;
3905      mach->Switch = mach->SwitchStack[mach->SwitchStackTop];
3906
3907      mach->BreakStackTop = mach->CallStack[mach->CallStackTop].BreakStackTop;
3908      mach->BreakType = mach->BreakStack[mach->BreakStackTop];
3909
3910      assert(mach->FuncStackTop > 0);
3911      mach->FuncMask = mach->FuncStack[--mach->FuncStackTop];
3912
3913      *pc = mach->CallStack[mach->CallStackTop].ReturnAddr;
3914
3915      UPDATE_EXEC_MASK(mach);
3916      break;
3917
3918   case TGSI_OPCODE_NOP:
3919      break;
3920
3921   case TGSI_OPCODE_BREAKC:
3922      FETCH(&r[0], 0, CHAN_X);
3923      /* update CondMask */
3924      if (r[0].u[0] && (mach->ExecMask & 0x1)) {
3925         mach->LoopMask &= ~0x1;
3926      }
3927      if (r[0].u[1] && (mach->ExecMask & 0x2)) {
3928         mach->LoopMask &= ~0x2;
3929      }
3930      if (r[0].u[2] && (mach->ExecMask & 0x4)) {
3931         mach->LoopMask &= ~0x4;
3932      }
3933      if (r[0].u[3] && (mach->ExecMask & 0x8)) {
3934         mach->LoopMask &= ~0x8;
3935      }
3936      /* Todo: if mach->LoopMask == 0, jump to end of loop */
3937      UPDATE_EXEC_MASK(mach);
3938      break;
3939
3940   case TGSI_OPCODE_F2I:
3941      exec_vector_unary(mach, inst, micro_f2i, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_FLOAT);
3942      break;
3943
3944   case TGSI_OPCODE_IDIV:
3945      exec_vector_binary(mach, inst, micro_idiv, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
3946      break;
3947
3948   case TGSI_OPCODE_IMAX:
3949      exec_vector_binary(mach, inst, micro_imax, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
3950      break;
3951
3952   case TGSI_OPCODE_IMIN:
3953      exec_vector_binary(mach, inst, micro_imin, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
3954      break;
3955
3956   case TGSI_OPCODE_INEG:
3957      exec_vector_unary(mach, inst, micro_ineg, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
3958      break;
3959
3960   case TGSI_OPCODE_ISGE:
3961      exec_vector_binary(mach, inst, micro_isge, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
3962      break;
3963
3964   case TGSI_OPCODE_ISHR:
3965      exec_vector_binary(mach, inst, micro_ishr, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
3966      break;
3967
3968   case TGSI_OPCODE_ISLT:
3969      exec_vector_binary(mach, inst, micro_islt, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
3970      break;
3971
3972   case TGSI_OPCODE_F2U:
3973      exec_vector_unary(mach, inst, micro_f2u, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_FLOAT);
3974      break;
3975
3976   case TGSI_OPCODE_U2F:
3977      exec_vector_unary(mach, inst, micro_u2f, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_UINT);
3978      break;
3979
3980   case TGSI_OPCODE_UADD:
3981      exec_vector_binary(mach, inst, micro_uadd, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3982      break;
3983
3984   case TGSI_OPCODE_UDIV:
3985      exec_vector_binary(mach, inst, micro_udiv, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3986      break;
3987
3988   case TGSI_OPCODE_UMAD:
3989      exec_vector_trinary(mach, inst, micro_umad, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3990      break;
3991
3992   case TGSI_OPCODE_UMAX:
3993      exec_vector_binary(mach, inst, micro_umax, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3994      break;
3995
3996   case TGSI_OPCODE_UMIN:
3997      exec_vector_binary(mach, inst, micro_umin, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3998      break;
3999
4000   case TGSI_OPCODE_UMOD:
4001      exec_vector_binary(mach, inst, micro_umod, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
4002      break;
4003
4004   case TGSI_OPCODE_UMUL:
4005      exec_vector_binary(mach, inst, micro_umul, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
4006      break;
4007
4008   case TGSI_OPCODE_USEQ:
4009      exec_vector_binary(mach, inst, micro_useq, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
4010      break;
4011
4012   case TGSI_OPCODE_USGE:
4013      exec_vector_binary(mach, inst, micro_usge, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
4014      break;
4015
4016   case TGSI_OPCODE_USHR:
4017      exec_vector_binary(mach, inst, micro_ushr, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
4018      break;
4019
4020   case TGSI_OPCODE_USLT:
4021      exec_vector_binary(mach, inst, micro_uslt, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
4022      break;
4023
4024   case TGSI_OPCODE_USNE:
4025      exec_vector_binary(mach, inst, micro_usne, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
4026      break;
4027
4028   case TGSI_OPCODE_SWITCH:
4029      exec_switch(mach, inst);
4030      break;
4031
4032   case TGSI_OPCODE_CASE:
4033      exec_case(mach, inst);
4034      break;
4035
4036   case TGSI_OPCODE_DEFAULT:
4037      exec_default(mach);
4038      break;
4039
4040   case TGSI_OPCODE_ENDSWITCH:
4041      exec_endswitch(mach);
4042      break;
4043
4044   case TGSI_OPCODE_LOAD:
4045      assert(0);
4046      break;
4047
4048   case TGSI_OPCODE_LOAD_MS:
4049      assert(0);
4050      break;
4051
4052   case TGSI_OPCODE_SAMPLE:
4053      exec_sample(mach, inst, TEX_MODIFIER_NONE);
4054      break;
4055
4056   case TGSI_OPCODE_SAMPLE_B:
4057      exec_sample(mach, inst, TEX_MODIFIER_LOD_BIAS);
4058      break;
4059
4060   case TGSI_OPCODE_SAMPLE_C:
4061      exec_sample(mach, inst, TEX_MODIFIER_NONE);
4062      break;
4063
4064   case TGSI_OPCODE_SAMPLE_C_LZ:
4065      exec_sample(mach, inst, TEX_MODIFIER_LOD_BIAS);
4066      break;
4067
4068   case TGSI_OPCODE_SAMPLE_D:
4069      exec_sample_d(mach, inst);
4070      break;
4071
4072   case TGSI_OPCODE_SAMPLE_L:
4073      exec_sample(mach, inst, TEX_MODIFIER_EXPLICIT_LOD);
4074      break;
4075
4076   case TGSI_OPCODE_GATHER4:
4077      assert(0);
4078      break;
4079
4080   case TGSI_OPCODE_RESINFO:
4081      assert(0);
4082      break;
4083
4084   case TGSI_OPCODE_SAMPLE_POS:
4085      assert(0);
4086      break;
4087
4088   case TGSI_OPCODE_SAMPLE_INFO:
4089      assert(0);
4090      break;
4091
4092   default:
4093      assert( 0 );
4094   }
4095}
4096
4097
4098#define DEBUG_EXECUTION 0
4099
4100
4101/**
4102 * Run TGSI interpreter.
4103 * \return bitmask of "alive" quad components
4104 */
4105uint
4106tgsi_exec_machine_run( struct tgsi_exec_machine *mach )
4107{
4108   uint i;
4109   int pc = 0;
4110
4111   mach->CondMask = 0xf;
4112   mach->LoopMask = 0xf;
4113   mach->ContMask = 0xf;
4114   mach->FuncMask = 0xf;
4115   mach->ExecMask = 0xf;
4116
4117   mach->Switch.mask = 0xf;
4118
4119   assert(mach->CondStackTop == 0);
4120   assert(mach->LoopStackTop == 0);
4121   assert(mach->ContStackTop == 0);
4122   assert(mach->SwitchStackTop == 0);
4123   assert(mach->BreakStackTop == 0);
4124   assert(mach->CallStackTop == 0);
4125
4126   mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] = 0;
4127   mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] = 0;
4128
4129   if( mach->Processor == TGSI_PROCESSOR_GEOMETRY ) {
4130      mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0] = 0;
4131      mach->Primitives[0] = 0;
4132   }
4133
4134   /* execute declarations (interpolants) */
4135   for (i = 0; i < mach->NumDeclarations; i++) {
4136      exec_declaration( mach, mach->Declarations+i );
4137   }
4138
4139   {
4140#if DEBUG_EXECUTION
4141      struct tgsi_exec_vector temps[TGSI_EXEC_NUM_TEMPS + TGSI_EXEC_NUM_TEMP_EXTRAS];
4142      struct tgsi_exec_vector outputs[PIPE_MAX_ATTRIBS];
4143      uint inst = 1;
4144
4145      memcpy(temps, mach->Temps, sizeof(temps));
4146      memcpy(outputs, mach->Outputs, sizeof(outputs));
4147#endif
4148
4149      /* execute instructions, until pc is set to -1 */
4150      while (pc != -1) {
4151
4152#if DEBUG_EXECUTION
4153         uint i;
4154
4155         tgsi_dump_instruction(&mach->Instructions[pc], inst++);
4156#endif
4157
4158         assert(pc < (int) mach->NumInstructions);
4159         exec_instruction(mach, mach->Instructions + pc, &pc);
4160
4161#if DEBUG_EXECUTION
4162         for (i = 0; i < TGSI_EXEC_NUM_TEMPS + TGSI_EXEC_NUM_TEMP_EXTRAS; i++) {
4163            if (memcmp(&temps[i], &mach->Temps[i], sizeof(temps[i]))) {
4164               uint j;
4165
4166               memcpy(&temps[i], &mach->Temps[i], sizeof(temps[i]));
4167               debug_printf("TEMP[%2u] = ", i);
4168               for (j = 0; j < 4; j++) {
4169                  if (j > 0) {
4170                     debug_printf("           ");
4171                  }
4172                  debug_printf("(%6f %u, %6f %u, %6f %u, %6f %u)\n",
4173                               temps[i].xyzw[0].f[j], temps[i].xyzw[0].u[j],
4174                               temps[i].xyzw[1].f[j], temps[i].xyzw[1].u[j],
4175                               temps[i].xyzw[2].f[j], temps[i].xyzw[2].u[j],
4176                               temps[i].xyzw[3].f[j], temps[i].xyzw[3].u[j]);
4177               }
4178            }
4179         }
4180         for (i = 0; i < PIPE_MAX_ATTRIBS; i++) {
4181            if (memcmp(&outputs[i], &mach->Outputs[i], sizeof(outputs[i]))) {
4182               uint j;
4183
4184               memcpy(&outputs[i], &mach->Outputs[i], sizeof(outputs[i]));
4185               debug_printf("OUT[%2u] =  ", i);
4186               for (j = 0; j < 4; j++) {
4187                  if (j > 0) {
4188                     debug_printf("           ");
4189                  }
4190                  debug_printf("(%6f %u, %6f %u, %6f %u, %6f %u)\n",
4191                               outputs[i].xyzw[0].f[j], outputs[i].xyzw[0].u[j],
4192                               outputs[i].xyzw[1].f[j], outputs[i].xyzw[1].u[j],
4193                               outputs[i].xyzw[2].f[j], outputs[i].xyzw[2].u[j],
4194                               outputs[i].xyzw[3].f[j], outputs[i].xyzw[3].u[j]);
4195               }
4196            }
4197         }
4198#endif
4199      }
4200   }
4201
4202#if 0
4203   /* we scale from floats in [0,1] to Zbuffer ints in sp_quad_depth_test.c */
4204   if (mach->Processor == TGSI_PROCESSOR_FRAGMENT) {
4205      /*
4206       * Scale back depth component.
4207       */
4208      for (i = 0; i < 4; i++)
4209         mach->Outputs[0].xyzw[2].f[i] *= ctx->DrawBuffer->_DepthMaxF;
4210   }
4211#endif
4212
4213   /* Strictly speaking, these assertions aren't really needed but they
4214    * can potentially catch some bugs in the control flow code.
4215    */
4216   assert(mach->CondStackTop == 0);
4217   assert(mach->LoopStackTop == 0);
4218   assert(mach->ContStackTop == 0);
4219   assert(mach->SwitchStackTop == 0);
4220   assert(mach->BreakStackTop == 0);
4221   assert(mach->CallStackTop == 0);
4222
4223   return ~mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
4224}
4225