tgsi_exec.c revision 265f55e6273aafc8e7607cd70a4b9756f7cb6bff
1/**************************************************************************
2 *
3 * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas.
4 * All Rights Reserved.
5 * Copyright 2009-2010 VMware, Inc.  All rights Reserved.
6 *
7 * Permission is hereby granted, free of charge, to any person obtaining a
8 * copy of this software and associated documentation files (the
9 * "Software"), to deal in the Software without restriction, including
10 * without limitation the rights to use, copy, modify, merge, publish,
11 * distribute, sub license, and/or sell copies of the Software, and to
12 * permit persons to whom the Software is furnished to do so, subject to
13 * the following conditions:
14 *
15 * The above copyright notice and this permission notice (including the
16 * next paragraph) shall be included in all copies or substantial portions
17 * of the Software.
18 *
19 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
20 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
21 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
22 * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
23 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
24 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
25 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 *
27 **************************************************************************/
28
29/**
30 * TGSI interpreter/executor.
31 *
32 * Flow control information:
33 *
34 * Since we operate on 'quads' (4 pixels or 4 vertices in parallel)
35 * flow control statements (IF/ELSE/ENDIF, LOOP/ENDLOOP) require special
36 * care since a condition may be true for some quad components but false
37 * for other components.
38 *
39 * We basically execute all statements (even if they're in the part of
40 * an IF/ELSE clause that's "not taken") and use a special mask to
41 * control writing to destination registers.  This is the ExecMask.
42 * See store_dest().
43 *
 * The ExecMask is computed from several other masks (CondMask, LoopMask,
 * ContMask, Switch.mask and FuncMask; see UPDATE_EXEC_MASK) which are
 * controlled by the flow control instructions (for example IF/ELSE/ENDIF,
 * LOOP/ENDLOOP and CONT).
47 *
48 *
49 * Authors:
50 *   Michal Krol
51 *   Brian Paul
52 */
53
54#include "pipe/p_compiler.h"
55#include "pipe/p_state.h"
56#include "pipe/p_shader_tokens.h"
57#include "tgsi/tgsi_dump.h"
58#include "tgsi/tgsi_parse.h"
59#include "tgsi/tgsi_util.h"
60#include "tgsi_exec.h"
61#include "util/u_memory.h"
62#include "util/u_math.h"
63
64
/**
 * When non-zero, micro_exp2(), micro_lg2() and micro_pow() call the
 * util_fast_*() routines instead of the C math library functions.
 */
#define FAST_MATH 1

/**
 * Lane indices within a four-element channel, named by tile position.
 * micro_ddx() and micro_ddy() use them to select the lanes they subtract.
 */
#define TILE_TOP_LEFT     0
#define TILE_TOP_RIGHT    1
#define TILE_BOTTOM_LEFT  2
#define TILE_BOTTOM_RIGHT 3
71
72static void
73micro_abs(union tgsi_exec_channel *dst,
74          const union tgsi_exec_channel *src)
75{
76   dst->f[0] = fabsf(src->f[0]);
77   dst->f[1] = fabsf(src->f[1]);
78   dst->f[2] = fabsf(src->f[2]);
79   dst->f[3] = fabsf(src->f[3]);
80}
81
82static void
83micro_arl(union tgsi_exec_channel *dst,
84          const union tgsi_exec_channel *src)
85{
86   dst->i[0] = (int)floorf(src->f[0]);
87   dst->i[1] = (int)floorf(src->f[1]);
88   dst->i[2] = (int)floorf(src->f[2]);
89   dst->i[3] = (int)floorf(src->f[3]);
90}
91
92static void
93micro_arr(union tgsi_exec_channel *dst,
94          const union tgsi_exec_channel *src)
95{
96   dst->i[0] = (int)floorf(src->f[0] + 0.5f);
97   dst->i[1] = (int)floorf(src->f[1] + 0.5f);
98   dst->i[2] = (int)floorf(src->f[2] + 0.5f);
99   dst->i[3] = (int)floorf(src->f[3] + 0.5f);
100}
101
102static void
103micro_ceil(union tgsi_exec_channel *dst,
104           const union tgsi_exec_channel *src)
105{
106   dst->f[0] = ceilf(src->f[0]);
107   dst->f[1] = ceilf(src->f[1]);
108   dst->f[2] = ceilf(src->f[2]);
109   dst->f[3] = ceilf(src->f[3]);
110}
111
112static void
113micro_clamp(union tgsi_exec_channel *dst,
114            const union tgsi_exec_channel *src0,
115            const union tgsi_exec_channel *src1,
116            const union tgsi_exec_channel *src2)
117{
118   dst->f[0] = src0->f[0] < src1->f[0] ? src1->f[0] : src0->f[0] > src2->f[0] ? src2->f[0] : src0->f[0];
119   dst->f[1] = src0->f[1] < src1->f[1] ? src1->f[1] : src0->f[1] > src2->f[1] ? src2->f[1] : src0->f[1];
120   dst->f[2] = src0->f[2] < src1->f[2] ? src1->f[2] : src0->f[2] > src2->f[2] ? src2->f[2] : src0->f[2];
121   dst->f[3] = src0->f[3] < src1->f[3] ? src1->f[3] : src0->f[3] > src2->f[3] ? src2->f[3] : src0->f[3];
122}
123
124static void
125micro_cmp(union tgsi_exec_channel *dst,
126          const union tgsi_exec_channel *src0,
127          const union tgsi_exec_channel *src1,
128          const union tgsi_exec_channel *src2)
129{
130   dst->f[0] = src0->f[0] < 0.0f ? src1->f[0] : src2->f[0];
131   dst->f[1] = src0->f[1] < 0.0f ? src1->f[1] : src2->f[1];
132   dst->f[2] = src0->f[2] < 0.0f ? src1->f[2] : src2->f[2];
133   dst->f[3] = src0->f[3] < 0.0f ? src1->f[3] : src2->f[3];
134}
135
136static void
137micro_cnd(union tgsi_exec_channel *dst,
138          const union tgsi_exec_channel *src0,
139          const union tgsi_exec_channel *src1,
140          const union tgsi_exec_channel *src2)
141{
142   dst->f[0] = src2->f[0] > 0.5f ? src0->f[0] : src1->f[0];
143   dst->f[1] = src2->f[1] > 0.5f ? src0->f[1] : src1->f[1];
144   dst->f[2] = src2->f[2] > 0.5f ? src0->f[2] : src1->f[2];
145   dst->f[3] = src2->f[3] > 0.5f ? src0->f[3] : src1->f[3];
146}
147
148static void
149micro_cos(union tgsi_exec_channel *dst,
150          const union tgsi_exec_channel *src)
151{
152   dst->f[0] = cosf(src->f[0]);
153   dst->f[1] = cosf(src->f[1]);
154   dst->f[2] = cosf(src->f[2]);
155   dst->f[3] = cosf(src->f[3]);
156}
157
158static void
159micro_ddx(union tgsi_exec_channel *dst,
160          const union tgsi_exec_channel *src)
161{
162   dst->f[0] =
163   dst->f[1] =
164   dst->f[2] =
165   dst->f[3] = src->f[TILE_BOTTOM_RIGHT] - src->f[TILE_BOTTOM_LEFT];
166}
167
168static void
169micro_ddy(union tgsi_exec_channel *dst,
170          const union tgsi_exec_channel *src)
171{
172   dst->f[0] =
173   dst->f[1] =
174   dst->f[2] =
175   dst->f[3] = src->f[TILE_BOTTOM_LEFT] - src->f[TILE_TOP_LEFT];
176}
177
178static void
179micro_exp2(union tgsi_exec_channel *dst,
180           const union tgsi_exec_channel *src)
181{
182#if FAST_MATH
183   dst->f[0] = util_fast_exp2(src->f[0]);
184   dst->f[1] = util_fast_exp2(src->f[1]);
185   dst->f[2] = util_fast_exp2(src->f[2]);
186   dst->f[3] = util_fast_exp2(src->f[3]);
187#else
188#if DEBUG
189   /* Inf is okay for this instruction, so clamp it to silence assertions. */
190   uint i;
191   union tgsi_exec_channel clamped;
192
193   for (i = 0; i < 4; i++) {
194      if (src->f[i] > 127.99999f) {
195         clamped.f[i] = 127.99999f;
196      } else if (src->f[i] < -126.99999f) {
197         clamped.f[i] = -126.99999f;
198      } else {
199         clamped.f[i] = src->f[i];
200      }
201   }
202   src = &clamped;
203#endif /* DEBUG */
204
205   dst->f[0] = powf(2.0f, src->f[0]);
206   dst->f[1] = powf(2.0f, src->f[1]);
207   dst->f[2] = powf(2.0f, src->f[2]);
208   dst->f[3] = powf(2.0f, src->f[3]);
209#endif /* FAST_MATH */
210}
211
212static void
213micro_flr(union tgsi_exec_channel *dst,
214          const union tgsi_exec_channel *src)
215{
216   dst->f[0] = floorf(src->f[0]);
217   dst->f[1] = floorf(src->f[1]);
218   dst->f[2] = floorf(src->f[2]);
219   dst->f[3] = floorf(src->f[3]);
220}
221
222static void
223micro_frc(union tgsi_exec_channel *dst,
224          const union tgsi_exec_channel *src)
225{
226   dst->f[0] = src->f[0] - floorf(src->f[0]);
227   dst->f[1] = src->f[1] - floorf(src->f[1]);
228   dst->f[2] = src->f[2] - floorf(src->f[2]);
229   dst->f[3] = src->f[3] - floorf(src->f[3]);
230}
231
232static void
233micro_iabs(union tgsi_exec_channel *dst,
234           const union tgsi_exec_channel *src)
235{
236   dst->i[0] = src->i[0] >= 0 ? src->i[0] : -src->i[0];
237   dst->i[1] = src->i[1] >= 0 ? src->i[1] : -src->i[1];
238   dst->i[2] = src->i[2] >= 0 ? src->i[2] : -src->i[2];
239   dst->i[3] = src->i[3] >= 0 ? src->i[3] : -src->i[3];
240}
241
242static void
243micro_ineg(union tgsi_exec_channel *dst,
244           const union tgsi_exec_channel *src)
245{
246   dst->i[0] = -src->i[0];
247   dst->i[1] = -src->i[1];
248   dst->i[2] = -src->i[2];
249   dst->i[3] = -src->i[3];
250}
251
252static void
253micro_lg2(union tgsi_exec_channel *dst,
254          const union tgsi_exec_channel *src)
255{
256#if FAST_MATH
257   dst->f[0] = util_fast_log2(src->f[0]);
258   dst->f[1] = util_fast_log2(src->f[1]);
259   dst->f[2] = util_fast_log2(src->f[2]);
260   dst->f[3] = util_fast_log2(src->f[3]);
261#else
262   dst->f[0] = logf(src->f[0]) * 1.442695f;
263   dst->f[1] = logf(src->f[1]) * 1.442695f;
264   dst->f[2] = logf(src->f[2]) * 1.442695f;
265   dst->f[3] = logf(src->f[3]) * 1.442695f;
266#endif
267}
268
269static void
270micro_lrp(union tgsi_exec_channel *dst,
271          const union tgsi_exec_channel *src0,
272          const union tgsi_exec_channel *src1,
273          const union tgsi_exec_channel *src2)
274{
275   dst->f[0] = src0->f[0] * (src1->f[0] - src2->f[0]) + src2->f[0];
276   dst->f[1] = src0->f[1] * (src1->f[1] - src2->f[1]) + src2->f[1];
277   dst->f[2] = src0->f[2] * (src1->f[2] - src2->f[2]) + src2->f[2];
278   dst->f[3] = src0->f[3] * (src1->f[3] - src2->f[3]) + src2->f[3];
279}
280
281static void
282micro_mad(union tgsi_exec_channel *dst,
283          const union tgsi_exec_channel *src0,
284          const union tgsi_exec_channel *src1,
285          const union tgsi_exec_channel *src2)
286{
287   dst->f[0] = src0->f[0] * src1->f[0] + src2->f[0];
288   dst->f[1] = src0->f[1] * src1->f[1] + src2->f[1];
289   dst->f[2] = src0->f[2] * src1->f[2] + src2->f[2];
290   dst->f[3] = src0->f[3] * src1->f[3] + src2->f[3];
291}
292
293static void
294micro_mov(union tgsi_exec_channel *dst,
295          const union tgsi_exec_channel *src)
296{
297   dst->u[0] = src->u[0];
298   dst->u[1] = src->u[1];
299   dst->u[2] = src->u[2];
300   dst->u[3] = src->u[3];
301}
302
303static void
304micro_rcp(union tgsi_exec_channel *dst,
305          const union tgsi_exec_channel *src)
306{
307#if 0 /* for debugging */
308   assert(src->f[0] != 0.0f);
309   assert(src->f[1] != 0.0f);
310   assert(src->f[2] != 0.0f);
311   assert(src->f[3] != 0.0f);
312#endif
313   dst->f[0] = 1.0f / src->f[0];
314   dst->f[1] = 1.0f / src->f[1];
315   dst->f[2] = 1.0f / src->f[2];
316   dst->f[3] = 1.0f / src->f[3];
317}
318
319static void
320micro_rnd(union tgsi_exec_channel *dst,
321          const union tgsi_exec_channel *src)
322{
323   dst->f[0] = floorf(src->f[0] + 0.5f);
324   dst->f[1] = floorf(src->f[1] + 0.5f);
325   dst->f[2] = floorf(src->f[2] + 0.5f);
326   dst->f[3] = floorf(src->f[3] + 0.5f);
327}
328
329static void
330micro_rsq(union tgsi_exec_channel *dst,
331          const union tgsi_exec_channel *src)
332{
333#if 0 /* for debugging */
334   assert(src->f[0] != 0.0f);
335   assert(src->f[1] != 0.0f);
336   assert(src->f[2] != 0.0f);
337   assert(src->f[3] != 0.0f);
338#endif
339   dst->f[0] = 1.0f / sqrtf(fabsf(src->f[0]));
340   dst->f[1] = 1.0f / sqrtf(fabsf(src->f[1]));
341   dst->f[2] = 1.0f / sqrtf(fabsf(src->f[2]));
342   dst->f[3] = 1.0f / sqrtf(fabsf(src->f[3]));
343}
344
345static void
346micro_seq(union tgsi_exec_channel *dst,
347          const union tgsi_exec_channel *src0,
348          const union tgsi_exec_channel *src1)
349{
350   dst->f[0] = src0->f[0] == src1->f[0] ? 1.0f : 0.0f;
351   dst->f[1] = src0->f[1] == src1->f[1] ? 1.0f : 0.0f;
352   dst->f[2] = src0->f[2] == src1->f[2] ? 1.0f : 0.0f;
353   dst->f[3] = src0->f[3] == src1->f[3] ? 1.0f : 0.0f;
354}
355
356static void
357micro_sge(union tgsi_exec_channel *dst,
358          const union tgsi_exec_channel *src0,
359          const union tgsi_exec_channel *src1)
360{
361   dst->f[0] = src0->f[0] >= src1->f[0] ? 1.0f : 0.0f;
362   dst->f[1] = src0->f[1] >= src1->f[1] ? 1.0f : 0.0f;
363   dst->f[2] = src0->f[2] >= src1->f[2] ? 1.0f : 0.0f;
364   dst->f[3] = src0->f[3] >= src1->f[3] ? 1.0f : 0.0f;
365}
366
367static void
368micro_sgn(union tgsi_exec_channel *dst,
369          const union tgsi_exec_channel *src)
370{
371   dst->f[0] = src->f[0] < 0.0f ? -1.0f : src->f[0] > 0.0f ? 1.0f : 0.0f;
372   dst->f[1] = src->f[1] < 0.0f ? -1.0f : src->f[1] > 0.0f ? 1.0f : 0.0f;
373   dst->f[2] = src->f[2] < 0.0f ? -1.0f : src->f[2] > 0.0f ? 1.0f : 0.0f;
374   dst->f[3] = src->f[3] < 0.0f ? -1.0f : src->f[3] > 0.0f ? 1.0f : 0.0f;
375}
376
377static void
378micro_sgt(union tgsi_exec_channel *dst,
379          const union tgsi_exec_channel *src0,
380          const union tgsi_exec_channel *src1)
381{
382   dst->f[0] = src0->f[0] > src1->f[0] ? 1.0f : 0.0f;
383   dst->f[1] = src0->f[1] > src1->f[1] ? 1.0f : 0.0f;
384   dst->f[2] = src0->f[2] > src1->f[2] ? 1.0f : 0.0f;
385   dst->f[3] = src0->f[3] > src1->f[3] ? 1.0f : 0.0f;
386}
387
388static void
389micro_sin(union tgsi_exec_channel *dst,
390          const union tgsi_exec_channel *src)
391{
392   dst->f[0] = sinf(src->f[0]);
393   dst->f[1] = sinf(src->f[1]);
394   dst->f[2] = sinf(src->f[2]);
395   dst->f[3] = sinf(src->f[3]);
396}
397
398static void
399micro_sle(union tgsi_exec_channel *dst,
400          const union tgsi_exec_channel *src0,
401          const union tgsi_exec_channel *src1)
402{
403   dst->f[0] = src0->f[0] <= src1->f[0] ? 1.0f : 0.0f;
404   dst->f[1] = src0->f[1] <= src1->f[1] ? 1.0f : 0.0f;
405   dst->f[2] = src0->f[2] <= src1->f[2] ? 1.0f : 0.0f;
406   dst->f[3] = src0->f[3] <= src1->f[3] ? 1.0f : 0.0f;
407}
408
409static void
410micro_slt(union tgsi_exec_channel *dst,
411          const union tgsi_exec_channel *src0,
412          const union tgsi_exec_channel *src1)
413{
414   dst->f[0] = src0->f[0] < src1->f[0] ? 1.0f : 0.0f;
415   dst->f[1] = src0->f[1] < src1->f[1] ? 1.0f : 0.0f;
416   dst->f[2] = src0->f[2] < src1->f[2] ? 1.0f : 0.0f;
417   dst->f[3] = src0->f[3] < src1->f[3] ? 1.0f : 0.0f;
418}
419
420static void
421micro_sne(union tgsi_exec_channel *dst,
422          const union tgsi_exec_channel *src0,
423          const union tgsi_exec_channel *src1)
424{
425   dst->f[0] = src0->f[0] != src1->f[0] ? 1.0f : 0.0f;
426   dst->f[1] = src0->f[1] != src1->f[1] ? 1.0f : 0.0f;
427   dst->f[2] = src0->f[2] != src1->f[2] ? 1.0f : 0.0f;
428   dst->f[3] = src0->f[3] != src1->f[3] ? 1.0f : 0.0f;
429}
430
431static void
432micro_sfl(union tgsi_exec_channel *dst)
433{
434   dst->f[0] = 0.0f;
435   dst->f[1] = 0.0f;
436   dst->f[2] = 0.0f;
437   dst->f[3] = 0.0f;
438}
439
440static void
441micro_str(union tgsi_exec_channel *dst)
442{
443   dst->f[0] = 1.0f;
444   dst->f[1] = 1.0f;
445   dst->f[2] = 1.0f;
446   dst->f[3] = 1.0f;
447}
448
449static void
450micro_trunc(union tgsi_exec_channel *dst,
451            const union tgsi_exec_channel *src)
452{
453   dst->f[0] = (float)(int)src->f[0];
454   dst->f[1] = (float)(int)src->f[1];
455   dst->f[2] = (float)(int)src->f[2];
456   dst->f[3] = (float)(int)src->f[3];
457}
458
459
/*
 * Numeric indices for the X, Y, Z and W channels.
 * NOTE(review): presumably used to index the xyzw[] array of a
 * tgsi_exec_vector; the users are not visible in this part of the file.
 */
#define CHAN_X  0
#define CHAN_Y  1
#define CHAN_Z  2
#define CHAN_W  3

/*
 * Tags naming how a channel's lanes are to be interpreted: float, signed
 * integer or unsigned integer.  The code that consumes these tags is not
 * visible in this part of the file.
 */
enum tgsi_exec_datatype {
   TGSI_EXEC_DATA_FLOAT,
   TGSI_EXEC_DATA_INT,
   TGSI_EXEC_DATA_UINT
};
470
/*
 * Shorthand locations of various utility registers (_I = Index, _C = Channel)
 *
 * Each name is a plain alias for the TGSI_EXEC_TEMP_* constant of the same
 * suffix; the values themselves are defined outside this file section.
 */
#define TEMP_KILMASK_I     TGSI_EXEC_TEMP_KILMASK_I
#define TEMP_KILMASK_C     TGSI_EXEC_TEMP_KILMASK_C
#define TEMP_OUTPUT_I      TGSI_EXEC_TEMP_OUTPUT_I
#define TEMP_OUTPUT_C      TGSI_EXEC_TEMP_OUTPUT_C
#define TEMP_PRIMITIVE_I   TGSI_EXEC_TEMP_PRIMITIVE_I
#define TEMP_PRIMITIVE_C   TGSI_EXEC_TEMP_PRIMITIVE_C
480
481
/**
 * Recompute the execution mask of a machine.
 *
 * ExecMask is the bitwise AND of CondMask, LoopMask, ContMask, Switch.mask
 * and FuncMask, so a bit is set only when it is set in all five.
 *
 * MACH is any expression yielding a pointer to the machine; it is
 * parenthesized at every use so expressions such as `&m[1]` or `p + 1`
 * expand correctly.  Note that MACH is evaluated several times, so it must
 * not have side effects.  The expansion is a single parenthesized assignment
 * expression.
 */
#define UPDATE_EXEC_MASK(MACH) \
      ((MACH)->ExecMask = (MACH)->CondMask & (MACH)->LoopMask & \
                          (MACH)->ContMask & (MACH)->Switch.mask & \
                          (MACH)->FuncMask)
485
486
/*
 * Constant channels with all four lanes set to the same value
 * (0, 1, 128 and -128 respectively).
 * NOTE(review): the initializers fill the union's first member, which is
 * assumed to be the float array f[] -- confirm against tgsi_exec.h.
 */
static const union tgsi_exec_channel ZeroVec =
   { { 0.0, 0.0, 0.0, 0.0 } };

static const union tgsi_exec_channel OneVec = {
   {1.0f, 1.0f, 1.0f, 1.0f}
};

static const union tgsi_exec_channel P128Vec = {
   {128.0f, 128.0f, 128.0f, 128.0f}
};

static const union tgsi_exec_channel M128Vec = {
   {-128.0f, -128.0f, -128.0f, -128.0f}
};
501
502
/**
 * Assert that none of the float values in 'chan' are infinite or NaN.
 * NaN and Inf may occur normally during program execution and should
 * not lead to crashes, etc.  But when debugging, it's helpful to catch
 * them.
 *
 * Each of the four lanes is checked with its own assert() so that a failure
 * report identifies the offending lane.
 *
 * \param chan  channel whose four float lanes are checked
 */
static INLINE void
check_inf_or_nan(const union tgsi_exec_channel *chan)
{
   assert(!util_is_inf_or_nan((chan)->f[0]));
   assert(!util_is_inf_or_nan((chan)->f[1]));
   assert(!util_is_inf_or_nan((chan)->f[2]));
   assert(!util_is_inf_or_nan((chan)->f[3]));
}
517
518
#ifdef DEBUG
/**
 * Debug helper: print the four float lanes of a channel on one line,
 * prefixed by 'msg'.
 */
static void
print_chan(const char *msg, const union tgsi_exec_channel *chan)
{
   const float *lanes = chan->f;

   debug_printf("%s = {%f, %f, %f, %f}\n",
                msg, lanes[0], lanes[1], lanes[2], lanes[3]);
}
#endif
527
528
#ifdef DEBUG
/**
 * Debug helper: print temporary register 'index' of the machine, one line
 * per channel (X, Y, Z, W) with that channel's four float lanes.
 */
static void
print_temp(const struct tgsi_exec_machine *mach, uint index)
{
   const struct tgsi_exec_vector *vec = &mach->Temps[index];
   int chan = 0;

   debug_printf("Temp[%u] =\n", index);
   while (chan < 4) {
      debug_printf("  %c: { %f, %f, %f, %f }\n",
                   "XYZW"[chan],
                   vec->xyzw[chan].f[0],
                   vec->xyzw[chan].f[1],
                   vec->xyzw[chan].f[2],
                   vec->xyzw[chan].f[3]);
      chan++;
   }
}
#endif
546
547
548void
549tgsi_exec_set_constant_buffers(struct tgsi_exec_machine *mach,
550                               unsigned num_bufs,
551                               const void **bufs,
552                               const unsigned *buf_sizes)
553{
554   unsigned i;
555
556   for (i = 0; i < num_bufs; i++) {
557      mach->Consts[i] = bufs[i];
558      mach->ConstsSize[i] = buf_sizes[i];
559   }
560}
561
562
563/**
564 * Check if there's a potential src/dst register data dependency when
565 * using SOA execution.
566 * Example:
567 *   MOV T, T.yxwz;
568 * This would expand into:
569 *   MOV t0, t1;
570 *   MOV t1, t0;
571 *   MOV t2, t3;
572 *   MOV t3, t2;
573 * The second instruction will have the wrong value for t0 if executed as-is.
574 */
575boolean
576tgsi_check_soa_dependencies(const struct tgsi_full_instruction *inst)
577{
578   uint i, chan;
579
580   uint writemask = inst->Dst[0].Register.WriteMask;
581   if (writemask == TGSI_WRITEMASK_X ||
582       writemask == TGSI_WRITEMASK_Y ||
583       writemask == TGSI_WRITEMASK_Z ||
584       writemask == TGSI_WRITEMASK_W ||
585       writemask == TGSI_WRITEMASK_NONE) {
586      /* no chance of data dependency */
587      return FALSE;
588   }
589
590   /* loop over src regs */
591   for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
592      if ((inst->Src[i].Register.File ==
593           inst->Dst[0].Register.File) &&
594          ((inst->Src[i].Register.Index ==
595            inst->Dst[0].Register.Index) ||
596           inst->Src[i].Register.Indirect ||
597           inst->Dst[0].Register.Indirect)) {
598         /* loop over dest channels */
599         uint channelsWritten = 0x0;
600         for (chan = 0; chan < NUM_CHANNELS; chan++) {
601            if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
602               /* check if we're reading a channel that's been written */
603               uint swizzle = tgsi_util_get_full_src_register_swizzle(&inst->Src[i], chan);
604               if (channelsWritten & (1 << swizzle)) {
605                  return TRUE;
606               }
607
608               channelsWritten |= (1 << chan);
609            }
610         }
611      }
612   }
613   return FALSE;
614}
615
616
617/**
618 * Initialize machine state by expanding tokens to full instructions,
619 * allocating temporary storage, setting up constants, etc.
620 * After this, we can call tgsi_exec_machine_run() many times.
621 */
622void
623tgsi_exec_machine_bind_shader(
624   struct tgsi_exec_machine *mach,
625   const struct tgsi_token *tokens,
626   uint numSamplers,
627   struct tgsi_sampler **samplers)
628{
629   uint k;
630   struct tgsi_parse_context parse;
631   struct tgsi_full_instruction *instructions;
632   struct tgsi_full_declaration *declarations;
633   uint maxInstructions = 10, numInstructions = 0;
634   uint maxDeclarations = 10, numDeclarations = 0;
635
636#if 0
637   tgsi_dump(tokens, 0);
638#endif
639
640   util_init_math();
641
642   if (numSamplers) {
643      assert(samplers);
644   }
645
646   mach->Tokens = tokens;
647   mach->Samplers = samplers;
648
649   if (!tokens) {
650      /* unbind and free all */
651      if (mach->Declarations) {
652         FREE( mach->Declarations );
653      }
654      mach->Declarations = NULL;
655      mach->NumDeclarations = 0;
656
657      if (mach->Instructions) {
658         FREE( mach->Instructions );
659      }
660      mach->Instructions = NULL;
661      mach->NumInstructions = 0;
662
663      return;
664   }
665
666   k = tgsi_parse_init (&parse, mach->Tokens);
667   if (k != TGSI_PARSE_OK) {
668      debug_printf( "Problem parsing!\n" );
669      return;
670   }
671
672   mach->Processor = parse.FullHeader.Processor.Processor;
673   mach->ImmLimit = 0;
674
675   if (mach->Processor == TGSI_PROCESSOR_GEOMETRY &&
676       !mach->UsedGeometryShader) {
677      struct tgsi_exec_vector *inputs;
678      struct tgsi_exec_vector *outputs;
679
680      inputs = align_malloc(sizeof(struct tgsi_exec_vector) *
681                            TGSI_MAX_PRIM_VERTICES * PIPE_MAX_ATTRIBS,
682                            16);
683
684      if (!inputs)
685         return;
686
687      outputs = align_malloc(sizeof(struct tgsi_exec_vector) *
688                             TGSI_MAX_TOTAL_VERTICES, 16);
689
690      if (!outputs) {
691         align_free(inputs);
692         return;
693      }
694
695      align_free(mach->Inputs);
696      align_free(mach->Outputs);
697
698      mach->Inputs = inputs;
699      mach->Outputs = outputs;
700      mach->UsedGeometryShader = TRUE;
701   }
702
703   declarations = (struct tgsi_full_declaration *)
704      MALLOC( maxDeclarations * sizeof(struct tgsi_full_declaration) );
705
706   if (!declarations) {
707      return;
708   }
709
710   instructions = (struct tgsi_full_instruction *)
711      MALLOC( maxInstructions * sizeof(struct tgsi_full_instruction) );
712
713   if (!instructions) {
714      FREE( declarations );
715      return;
716   }
717
718   while( !tgsi_parse_end_of_tokens( &parse ) ) {
719      uint i;
720
721      tgsi_parse_token( &parse );
722      switch( parse.FullToken.Token.Type ) {
723      case TGSI_TOKEN_TYPE_DECLARATION:
724         /* save expanded declaration */
725         if (numDeclarations == maxDeclarations) {
726            declarations = REALLOC(declarations,
727                                   maxDeclarations
728                                   * sizeof(struct tgsi_full_declaration),
729                                   (maxDeclarations + 10)
730                                   * sizeof(struct tgsi_full_declaration));
731            maxDeclarations += 10;
732         }
733         if (parse.FullToken.FullDeclaration.Declaration.File == TGSI_FILE_OUTPUT) {
734            unsigned reg;
735            for (reg = parse.FullToken.FullDeclaration.Range.First;
736                 reg <= parse.FullToken.FullDeclaration.Range.Last;
737                 ++reg) {
738               ++mach->NumOutputs;
739            }
740         }
741         if (parse.FullToken.FullDeclaration.Declaration.File ==
742             TGSI_FILE_IMMEDIATE_ARRAY) {
743            unsigned reg;
744            struct tgsi_full_declaration *decl =
745               &parse.FullToken.FullDeclaration;
746            debug_assert(decl->Range.Last < TGSI_EXEC_NUM_IMMEDIATES);
747            for (reg = decl->Range.First; reg <= decl->Range.Last; ++reg) {
748               for( i = 0; i < 4; i++ ) {
749                  int idx = reg * 4 + i;
750                  mach->ImmArray[reg][i] = decl->ImmediateData.u[idx].Float;
751               }
752            }
753         }
754         memcpy(declarations + numDeclarations,
755                &parse.FullToken.FullDeclaration,
756                sizeof(declarations[0]));
757         numDeclarations++;
758         break;
759
760      case TGSI_TOKEN_TYPE_IMMEDIATE:
761         {
762            uint size = parse.FullToken.FullImmediate.Immediate.NrTokens - 1;
763            assert( size <= 4 );
764            assert( mach->ImmLimit + 1 <= TGSI_EXEC_NUM_IMMEDIATES );
765
766            for( i = 0; i < size; i++ ) {
767               mach->Imms[mach->ImmLimit][i] =
768		  parse.FullToken.FullImmediate.u[i].Float;
769            }
770            mach->ImmLimit += 1;
771         }
772         break;
773
774      case TGSI_TOKEN_TYPE_INSTRUCTION:
775
776         /* save expanded instruction */
777         if (numInstructions == maxInstructions) {
778            instructions = REALLOC(instructions,
779                                   maxInstructions
780                                   * sizeof(struct tgsi_full_instruction),
781                                   (maxInstructions + 10)
782                                   * sizeof(struct tgsi_full_instruction));
783            maxInstructions += 10;
784         }
785
786         memcpy(instructions + numInstructions,
787                &parse.FullToken.FullInstruction,
788                sizeof(instructions[0]));
789
790         numInstructions++;
791         break;
792
793      case TGSI_TOKEN_TYPE_PROPERTY:
794         break;
795
796      default:
797         assert( 0 );
798      }
799   }
800   tgsi_parse_free (&parse);
801
802   if (mach->Declarations) {
803      FREE( mach->Declarations );
804   }
805   mach->Declarations = declarations;
806   mach->NumDeclarations = numDeclarations;
807
808   if (mach->Instructions) {
809      FREE( mach->Instructions );
810   }
811   mach->Instructions = instructions;
812   mach->NumInstructions = numInstructions;
813}
814
815
816struct tgsi_exec_machine *
817tgsi_exec_machine_create( void )
818{
819   struct tgsi_exec_machine *mach;
820   uint i;
821
822   mach = align_malloc( sizeof *mach, 16 );
823   if (!mach)
824      goto fail;
825
826   memset(mach, 0, sizeof(*mach));
827
828   mach->Addrs = &mach->Temps[TGSI_EXEC_TEMP_ADDR];
829   mach->MaxGeometryShaderOutputs = TGSI_MAX_TOTAL_VERTICES;
830   mach->Predicates = &mach->Temps[TGSI_EXEC_TEMP_P0];
831
832   mach->Inputs = align_malloc(sizeof(struct tgsi_exec_vector) * PIPE_MAX_ATTRIBS, 16);
833   mach->Outputs = align_malloc(sizeof(struct tgsi_exec_vector) * PIPE_MAX_ATTRIBS, 16);
834   if (!mach->Inputs || !mach->Outputs)
835      goto fail;
836
837   /* Setup constants needed by the SSE2 executor. */
838   for( i = 0; i < 4; i++ ) {
839      mach->Temps[TGSI_EXEC_TEMP_00000000_I].xyzw[TGSI_EXEC_TEMP_00000000_C].u[i] = 0x00000000;
840      mach->Temps[TGSI_EXEC_TEMP_7FFFFFFF_I].xyzw[TGSI_EXEC_TEMP_7FFFFFFF_C].u[i] = 0x7FFFFFFF;
841      mach->Temps[TGSI_EXEC_TEMP_80000000_I].xyzw[TGSI_EXEC_TEMP_80000000_C].u[i] = 0x80000000;
842      mach->Temps[TGSI_EXEC_TEMP_FFFFFFFF_I].xyzw[TGSI_EXEC_TEMP_FFFFFFFF_C].u[i] = 0xFFFFFFFF;    /* not used */
843      mach->Temps[TGSI_EXEC_TEMP_ONE_I].xyzw[TGSI_EXEC_TEMP_ONE_C].f[i] = 1.0f;
844      mach->Temps[TGSI_EXEC_TEMP_TWO_I].xyzw[TGSI_EXEC_TEMP_TWO_C].f[i] = 2.0f;    /* not used */
845      mach->Temps[TGSI_EXEC_TEMP_128_I].xyzw[TGSI_EXEC_TEMP_128_C].f[i] = 128.0f;
846      mach->Temps[TGSI_EXEC_TEMP_MINUS_128_I].xyzw[TGSI_EXEC_TEMP_MINUS_128_C].f[i] = -128.0f;
847      mach->Temps[TGSI_EXEC_TEMP_THREE_I].xyzw[TGSI_EXEC_TEMP_THREE_C].f[i] = 3.0f;
848      mach->Temps[TGSI_EXEC_TEMP_HALF_I].xyzw[TGSI_EXEC_TEMP_HALF_C].f[i] = 0.5f;
849   }
850
851#ifdef DEBUG
852   /* silence warnings */
853   (void) print_chan;
854   (void) print_temp;
855#endif
856
857   return mach;
858
859fail:
860   if (mach) {
861      align_free(mach->Inputs);
862      align_free(mach->Outputs);
863      align_free(mach);
864   }
865   return NULL;
866}
867
868
869void
870tgsi_exec_machine_destroy(struct tgsi_exec_machine *mach)
871{
872   if (mach) {
873      if (mach->Instructions)
874         FREE(mach->Instructions);
875      if (mach->Declarations)
876         FREE(mach->Declarations);
877
878      align_free(mach->Inputs);
879      align_free(mach->Outputs);
880
881      align_free(mach);
882   }
883}
884
885static void
886micro_add(union tgsi_exec_channel *dst,
887          const union tgsi_exec_channel *src0,
888          const union tgsi_exec_channel *src1)
889{
890   dst->f[0] = src0->f[0] + src1->f[0];
891   dst->f[1] = src0->f[1] + src1->f[1];
892   dst->f[2] = src0->f[2] + src1->f[2];
893   dst->f[3] = src0->f[3] + src1->f[3];
894}
895
896static void
897micro_div(
898   union tgsi_exec_channel *dst,
899   const union tgsi_exec_channel *src0,
900   const union tgsi_exec_channel *src1 )
901{
902   if (src1->f[0] != 0) {
903      dst->f[0] = src0->f[0] / src1->f[0];
904   }
905   if (src1->f[1] != 0) {
906      dst->f[1] = src0->f[1] / src1->f[1];
907   }
908   if (src1->f[2] != 0) {
909      dst->f[2] = src0->f[2] / src1->f[2];
910   }
911   if (src1->f[3] != 0) {
912      dst->f[3] = src0->f[3] / src1->f[3];
913   }
914}
915
916static void
917micro_rcc(union tgsi_exec_channel *dst,
918          const union tgsi_exec_channel *src)
919{
920   uint i;
921
922   for (i = 0; i < 4; i++) {
923      float recip = 1.0f / src->f[i];
924
925      if (recip > 0.0f) {
926         if (recip > 1.884467e+019f) {
927            dst->f[i] = 1.884467e+019f;
928         }
929         else if (recip < 5.42101e-020f) {
930            dst->f[i] = 5.42101e-020f;
931         }
932         else {
933            dst->f[i] = recip;
934         }
935      }
936      else {
937         if (recip < -1.884467e+019f) {
938            dst->f[i] = -1.884467e+019f;
939         }
940         else if (recip > -5.42101e-020f) {
941            dst->f[i] = -5.42101e-020f;
942         }
943         else {
944            dst->f[i] = recip;
945         }
946      }
947   }
948}
949
950static void
951micro_lt(
952   union tgsi_exec_channel *dst,
953   const union tgsi_exec_channel *src0,
954   const union tgsi_exec_channel *src1,
955   const union tgsi_exec_channel *src2,
956   const union tgsi_exec_channel *src3 )
957{
958   dst->f[0] = src0->f[0] < src1->f[0] ? src2->f[0] : src3->f[0];
959   dst->f[1] = src0->f[1] < src1->f[1] ? src2->f[1] : src3->f[1];
960   dst->f[2] = src0->f[2] < src1->f[2] ? src2->f[2] : src3->f[2];
961   dst->f[3] = src0->f[3] < src1->f[3] ? src2->f[3] : src3->f[3];
962}
963
964static void
965micro_max(union tgsi_exec_channel *dst,
966          const union tgsi_exec_channel *src0,
967          const union tgsi_exec_channel *src1)
968{
969   dst->f[0] = src0->f[0] > src1->f[0] ? src0->f[0] : src1->f[0];
970   dst->f[1] = src0->f[1] > src1->f[1] ? src0->f[1] : src1->f[1];
971   dst->f[2] = src0->f[2] > src1->f[2] ? src0->f[2] : src1->f[2];
972   dst->f[3] = src0->f[3] > src1->f[3] ? src0->f[3] : src1->f[3];
973}
974
975static void
976micro_min(union tgsi_exec_channel *dst,
977          const union tgsi_exec_channel *src0,
978          const union tgsi_exec_channel *src1)
979{
980   dst->f[0] = src0->f[0] < src1->f[0] ? src0->f[0] : src1->f[0];
981   dst->f[1] = src0->f[1] < src1->f[1] ? src0->f[1] : src1->f[1];
982   dst->f[2] = src0->f[2] < src1->f[2] ? src0->f[2] : src1->f[2];
983   dst->f[3] = src0->f[3] < src1->f[3] ? src0->f[3] : src1->f[3];
984}
985
986static void
987micro_mul(union tgsi_exec_channel *dst,
988          const union tgsi_exec_channel *src0,
989          const union tgsi_exec_channel *src1)
990{
991   dst->f[0] = src0->f[0] * src1->f[0];
992   dst->f[1] = src0->f[1] * src1->f[1];
993   dst->f[2] = src0->f[2] * src1->f[2];
994   dst->f[3] = src0->f[3] * src1->f[3];
995}
996
997static void
998micro_neg(
999   union tgsi_exec_channel *dst,
1000   const union tgsi_exec_channel *src )
1001{
1002   dst->f[0] = -src->f[0];
1003   dst->f[1] = -src->f[1];
1004   dst->f[2] = -src->f[2];
1005   dst->f[3] = -src->f[3];
1006}
1007
1008static void
1009micro_pow(
1010   union tgsi_exec_channel *dst,
1011   const union tgsi_exec_channel *src0,
1012   const union tgsi_exec_channel *src1 )
1013{
1014#if FAST_MATH
1015   dst->f[0] = util_fast_pow( src0->f[0], src1->f[0] );
1016   dst->f[1] = util_fast_pow( src0->f[1], src1->f[1] );
1017   dst->f[2] = util_fast_pow( src0->f[2], src1->f[2] );
1018   dst->f[3] = util_fast_pow( src0->f[3], src1->f[3] );
1019#else
1020   dst->f[0] = powf( src0->f[0], src1->f[0] );
1021   dst->f[1] = powf( src0->f[1], src1->f[1] );
1022   dst->f[2] = powf( src0->f[2], src1->f[2] );
1023   dst->f[3] = powf( src0->f[3], src1->f[3] );
1024#endif
1025}
1026
1027static void
1028micro_sub(union tgsi_exec_channel *dst,
1029          const union tgsi_exec_channel *src0,
1030          const union tgsi_exec_channel *src1)
1031{
1032   dst->f[0] = src0->f[0] - src1->f[0];
1033   dst->f[1] = src0->f[1] - src1->f[1];
1034   dst->f[2] = src0->f[2] - src1->f[2];
1035   dst->f[3] = src0->f[3] - src1->f[3];
1036}
1037
1038static void
1039fetch_src_file_channel(const struct tgsi_exec_machine *mach,
1040                       const uint file,
1041                       const uint swizzle,
1042                       const union tgsi_exec_channel *index,
1043                       const union tgsi_exec_channel *index2D,
1044                       union tgsi_exec_channel *chan)
1045{
1046   uint i;
1047
1048   assert(swizzle < 4);
1049
1050   switch (file) {
1051   case TGSI_FILE_CONSTANT:
1052      for (i = 0; i < QUAD_SIZE; i++) {
1053         assert(index2D->i[i] >= 0 && index2D->i[i] < PIPE_MAX_CONSTANT_BUFFERS);
1054         assert(mach->Consts[index2D->i[i]]);
1055
1056         if (index->i[i] < 0) {
1057            chan->u[i] = 0;
1058         } else {
1059            /* NOTE: copying the const value as a uint instead of float */
1060            const uint constbuf = index2D->i[i];
1061            const uint *buf = (const uint *)mach->Consts[constbuf];
1062            const int pos = index->i[i] * 4 + swizzle;
1063            /* const buffer bounds check */
1064            if (pos < 0 || pos >= mach->ConstsSize[constbuf]) {
1065               if (0) {
1066                  /* Debug: print warning */
1067                  static int count = 0;
1068                  if (count++ < 100)
1069                     debug_printf("TGSI Exec: const buffer index %d"
1070                                  " out of bounds\n", pos);
1071               }
1072               chan->u[i] = 0;
1073            }
1074            else
1075               chan->u[i] = buf[pos];
1076         }
1077      }
1078      break;
1079
1080   case TGSI_FILE_INPUT:
1081      for (i = 0; i < QUAD_SIZE; i++) {
1082         /*
1083         if (TGSI_PROCESSOR_GEOMETRY == mach->Processor) {
1084            debug_printf("Fetching Input[%d] (2d=%d, 1d=%d)\n",
1085                         index2D->i[i] * TGSI_EXEC_MAX_INPUT_ATTRIBS + index->i[i],
1086                         index2D->i[i], index->i[i]);
1087                         }*/
1088         int pos = index2D->i[i] * TGSI_EXEC_MAX_INPUT_ATTRIBS + index->i[i];
1089         assert(pos >= 0);
1090         assert(pos < TGSI_MAX_PRIM_VERTICES * PIPE_MAX_ATTRIBS);
1091         chan->u[i] = mach->Inputs[pos].xyzw[swizzle].u[i];
1092      }
1093      break;
1094
1095   case TGSI_FILE_SYSTEM_VALUE:
1096      /* XXX no swizzling at this point.  Will be needed if we put
1097       * gl_FragCoord, for example, in a sys value register.
1098       */
1099      for (i = 0; i < QUAD_SIZE; i++) {
1100         chan->f[i] = mach->SystemValue[index->i[i]][0];
1101      }
1102      break;
1103
1104   case TGSI_FILE_TEMPORARY:
1105      for (i = 0; i < QUAD_SIZE; i++) {
1106         assert(index->i[i] < TGSI_EXEC_NUM_TEMPS);
1107         assert(index2D->i[i] == 0);
1108
1109         chan->u[i] = mach->Temps[index->i[i]].xyzw[swizzle].u[i];
1110      }
1111      break;
1112
1113   case TGSI_FILE_TEMPORARY_ARRAY:
1114      for (i = 0; i < QUAD_SIZE; i++) {
1115         assert(index->i[i] < TGSI_EXEC_NUM_TEMPS);
1116         assert(index2D->i[i] < TGSI_EXEC_NUM_TEMP_ARRAYS);
1117
1118         chan->u[i] =
1119            mach->TempArray[index2D->i[i]][index->i[i]].xyzw[swizzle].u[i];
1120      }
1121      break;
1122
1123   case TGSI_FILE_IMMEDIATE:
1124      for (i = 0; i < QUAD_SIZE; i++) {
1125         assert(index->i[i] >= 0 && index->i[i] < (int)mach->ImmLimit);
1126         assert(index2D->i[i] == 0);
1127
1128         chan->f[i] = mach->Imms[index->i[i]][swizzle];
1129      }
1130      break;
1131
1132   case TGSI_FILE_IMMEDIATE_ARRAY:
1133      for (i = 0; i < QUAD_SIZE; i++) {
1134         assert(index2D->i[i] == 0);
1135
1136         chan->f[i] = mach->ImmArray[index->i[i]][swizzle];
1137      }
1138      break;
1139
1140   case TGSI_FILE_ADDRESS:
1141      for (i = 0; i < QUAD_SIZE; i++) {
1142         assert(index->i[i] >= 0);
1143         assert(index2D->i[i] == 0);
1144
1145         chan->u[i] = mach->Addrs[index->i[i]].xyzw[swizzle].u[i];
1146      }
1147      break;
1148
1149   case TGSI_FILE_PREDICATE:
1150      for (i = 0; i < QUAD_SIZE; i++) {
1151         assert(index->i[i] >= 0 && index->i[i] < TGSI_EXEC_NUM_PREDS);
1152         assert(index2D->i[i] == 0);
1153
1154         chan->u[i] = mach->Predicates[0].xyzw[swizzle].u[i];
1155      }
1156      break;
1157
1158   case TGSI_FILE_OUTPUT:
1159      /* vertex/fragment output vars can be read too */
1160      for (i = 0; i < QUAD_SIZE; i++) {
1161         assert(index->i[i] >= 0);
1162         assert(index2D->i[i] == 0);
1163
1164         chan->u[i] = mach->Outputs[index->i[i]].xyzw[swizzle].u[i];
1165      }
1166      break;
1167
1168   default:
1169      assert(0);
1170      for (i = 0; i < QUAD_SIZE; i++) {
1171         chan->u[i] = 0;
1172      }
1173   }
1174}
1175
1176static void
1177fetch_source(const struct tgsi_exec_machine *mach,
1178             union tgsi_exec_channel *chan,
1179             const struct tgsi_full_src_register *reg,
1180             const uint chan_index,
1181             enum tgsi_exec_datatype src_datatype)
1182{
1183   union tgsi_exec_channel index;
1184   union tgsi_exec_channel index2D;
1185   uint swizzle;
1186
1187   /* We start with a direct index into a register file.
1188    *
1189    *    file[1],
1190    *    where:
1191    *       file = Register.File
1192    *       [1] = Register.Index
1193    */
1194   index.i[0] =
1195   index.i[1] =
1196   index.i[2] =
1197   index.i[3] = reg->Register.Index;
1198
1199   /* There is an extra source register that indirectly subscripts
1200    * a register file. The direct index now becomes an offset
1201    * that is being added to the indirect register.
1202    *
1203    *    file[ind[2].x+1],
1204    *    where:
1205    *       ind = Indirect.File
1206    *       [2] = Indirect.Index
1207    *       .x = Indirect.SwizzleX
1208    */
1209   if (reg->Register.Indirect) {
1210      union tgsi_exec_channel index2;
1211      union tgsi_exec_channel indir_index;
1212      const uint execmask = mach->ExecMask;
1213      uint i;
1214
1215      /* which address register (always zero now) */
1216      index2.i[0] =
1217      index2.i[1] =
1218      index2.i[2] =
1219      index2.i[3] = reg->Indirect.Index;
1220      assert(reg->Indirect.File == TGSI_FILE_ADDRESS);
1221      /* get current value of address register[swizzle] */
1222      swizzle = tgsi_util_get_src_register_swizzle( &reg->Indirect, CHAN_X );
1223      fetch_src_file_channel(mach,
1224                             reg->Indirect.File,
1225                             swizzle,
1226                             &index2,
1227                             &ZeroVec,
1228                             &indir_index);
1229
1230      /* add value of address register to the offset */
1231      index.i[0] += indir_index.i[0];
1232      index.i[1] += indir_index.i[1];
1233      index.i[2] += indir_index.i[2];
1234      index.i[3] += indir_index.i[3];
1235
1236      /* for disabled execution channels, zero-out the index to
1237       * avoid using a potential garbage value.
1238       */
1239      for (i = 0; i < QUAD_SIZE; i++) {
1240         if ((execmask & (1 << i)) == 0)
1241            index.i[i] = 0;
1242      }
1243   }
1244
1245   /* There is an extra source register that is a second
1246    * subscript to a register file. Effectively it means that
1247    * the register file is actually a 2D array of registers.
1248    *
1249    *    file[3][1],
1250    *    where:
1251    *       [3] = Dimension.Index
1252    */
1253   if (reg->Register.Dimension) {
1254      index2D.i[0] =
1255      index2D.i[1] =
1256      index2D.i[2] =
1257      index2D.i[3] = reg->Dimension.Index;
1258
1259      /* Again, the second subscript index can be addressed indirectly
1260       * identically to the first one.
1261       * Nothing stops us from indirectly addressing the indirect register,
1262       * but there is no need for that, so we won't exercise it.
1263       *
1264       *    file[ind[4].y+3][1],
1265       *    where:
1266       *       ind = DimIndirect.File
1267       *       [4] = DimIndirect.Index
1268       *       .y = DimIndirect.SwizzleX
1269       */
1270      if (reg->Dimension.Indirect) {
1271         union tgsi_exec_channel index2;
1272         union tgsi_exec_channel indir_index;
1273         const uint execmask = mach->ExecMask;
1274         uint i;
1275
1276         index2.i[0] =
1277         index2.i[1] =
1278         index2.i[2] =
1279         index2.i[3] = reg->DimIndirect.Index;
1280
1281         swizzle = tgsi_util_get_src_register_swizzle( &reg->DimIndirect, CHAN_X );
1282         fetch_src_file_channel(mach,
1283                                reg->DimIndirect.File,
1284                                swizzle,
1285                                &index2,
1286                                &ZeroVec,
1287                                &indir_index);
1288
1289         index2D.i[0] += indir_index.i[0];
1290         index2D.i[1] += indir_index.i[1];
1291         index2D.i[2] += indir_index.i[2];
1292         index2D.i[3] += indir_index.i[3];
1293
1294         /* for disabled execution channels, zero-out the index to
1295          * avoid using a potential garbage value.
1296          */
1297         for (i = 0; i < QUAD_SIZE; i++) {
1298            if ((execmask & (1 << i)) == 0) {
1299               index2D.i[i] = 0;
1300            }
1301         }
1302      }
1303
1304      /* If by any chance there was a need for a 3D array of register
1305       * files, we would have to check whether Dimension is followed
1306       * by a dimension register and continue the saga.
1307       */
1308   } else {
1309      index2D.i[0] =
1310      index2D.i[1] =
1311      index2D.i[2] =
1312      index2D.i[3] = 0;
1313   }
1314
1315   swizzle = tgsi_util_get_full_src_register_swizzle( reg, chan_index );
1316   fetch_src_file_channel(mach,
1317                          reg->Register.File,
1318                          swizzle,
1319                          &index,
1320                          &index2D,
1321                          chan);
1322
1323   if (reg->Register.Absolute) {
1324      if (src_datatype == TGSI_EXEC_DATA_FLOAT) {
1325         micro_abs(chan, chan);
1326      } else {
1327         micro_iabs(chan, chan);
1328      }
1329   }
1330
1331   if (reg->Register.Negate) {
1332      if (src_datatype == TGSI_EXEC_DATA_FLOAT) {
1333         micro_neg(chan, chan);
1334      } else {
1335         micro_ineg(chan, chan);
1336      }
1337   }
1338}
1339
1340static void
1341store_dest(struct tgsi_exec_machine *mach,
1342           const union tgsi_exec_channel *chan,
1343           const struct tgsi_full_dst_register *reg,
1344           const struct tgsi_full_instruction *inst,
1345           uint chan_index,
1346           enum tgsi_exec_datatype dst_datatype)
1347{
1348   uint i;
1349   union tgsi_exec_channel null;
1350   union tgsi_exec_channel *dst;
1351   union tgsi_exec_channel index2D;
1352   uint execmask = mach->ExecMask;
1353   int offset = 0;  /* indirection offset */
1354   int index;
1355
1356   /* for debugging */
1357   if (0 && dst_datatype == TGSI_EXEC_DATA_FLOAT) {
1358      check_inf_or_nan(chan);
1359   }
1360
1361   /* There is an extra source register that indirectly subscripts
1362    * a register file. The direct index now becomes an offset
1363    * that is being added to the indirect register.
1364    *
1365    *    file[ind[2].x+1],
1366    *    where:
1367    *       ind = Indirect.File
1368    *       [2] = Indirect.Index
1369    *       .x = Indirect.SwizzleX
1370    */
1371   if (reg->Register.Indirect) {
1372      union tgsi_exec_channel index;
1373      union tgsi_exec_channel indir_index;
1374      uint swizzle;
1375
1376      /* which address register (always zero for now) */
1377      index.i[0] =
1378      index.i[1] =
1379      index.i[2] =
1380      index.i[3] = reg->Indirect.Index;
1381
1382      /* get current value of address register[swizzle] */
1383      swizzle = tgsi_util_get_src_register_swizzle( &reg->Indirect, CHAN_X );
1384
1385      /* fetch values from the address/indirection register */
1386      fetch_src_file_channel(mach,
1387                             reg->Indirect.File,
1388                             swizzle,
1389                             &index,
1390                             &ZeroVec,
1391                             &indir_index);
1392
1393      /* save indirection offset */
1394      offset = indir_index.i[0];
1395   }
1396
1397   /* There is an extra source register that is a second
1398    * subscript to a register file. Effectively it means that
1399    * the register file is actually a 2D array of registers.
1400    *
1401    *    file[3][1],
1402    *    where:
1403    *       [3] = Dimension.Index
1404    */
1405   if (reg->Register.Dimension) {
1406      index2D.i[0] =
1407      index2D.i[1] =
1408      index2D.i[2] =
1409      index2D.i[3] = reg->Dimension.Index;
1410
1411      /* Again, the second subscript index can be addressed indirectly
1412       * identically to the first one.
1413       * Nothing stops us from indirectly addressing the indirect register,
1414       * but there is no need for that, so we won't exercise it.
1415       *
1416       *    file[ind[4].y+3][1],
1417       *    where:
1418       *       ind = DimIndirect.File
1419       *       [4] = DimIndirect.Index
1420       *       .y = DimIndirect.SwizzleX
1421       */
1422      if (reg->Dimension.Indirect) {
1423         union tgsi_exec_channel index2;
1424         union tgsi_exec_channel indir_index;
1425         const uint execmask = mach->ExecMask;
1426         unsigned swizzle;
1427         uint i;
1428
1429         index2.i[0] =
1430         index2.i[1] =
1431         index2.i[2] =
1432         index2.i[3] = reg->DimIndirect.Index;
1433
1434         swizzle = tgsi_util_get_src_register_swizzle( &reg->DimIndirect, CHAN_X );
1435         fetch_src_file_channel(mach,
1436                                reg->DimIndirect.File,
1437                                swizzle,
1438                                &index2,
1439                                &ZeroVec,
1440                                &indir_index);
1441
1442         index2D.i[0] += indir_index.i[0];
1443         index2D.i[1] += indir_index.i[1];
1444         index2D.i[2] += indir_index.i[2];
1445         index2D.i[3] += indir_index.i[3];
1446
1447         /* for disabled execution channels, zero-out the index to
1448          * avoid using a potential garbage value.
1449          */
1450         for (i = 0; i < QUAD_SIZE; i++) {
1451            if ((execmask & (1 << i)) == 0) {
1452               index2D.i[i] = 0;
1453            }
1454         }
1455      }
1456
1457      /* If by any chance there was a need for a 3D array of register
1458       * files, we would have to check whether Dimension is followed
1459       * by a dimension register and continue the saga.
1460       */
1461   } else {
1462      index2D.i[0] =
1463      index2D.i[1] =
1464      index2D.i[2] =
1465      index2D.i[3] = 0;
1466   }
1467
1468   switch (reg->Register.File) {
1469   case TGSI_FILE_NULL:
1470      dst = &null;
1471      break;
1472
1473   case TGSI_FILE_OUTPUT:
1474      index = mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0]
1475         + reg->Register.Index;
1476      dst = &mach->Outputs[offset + index].xyzw[chan_index];
1477#if 0
1478      if (TGSI_PROCESSOR_GEOMETRY == mach->Processor) {
1479         fprintf(stderr, "STORING OUT[%d] mask(%d), = (", offset + index, execmask);
1480         for (i = 0; i < QUAD_SIZE; i++)
1481            if (execmask & (1 << i))
1482               fprintf(stderr, "%f, ", chan->f[i]);
1483         fprintf(stderr, ")\n");
1484      }
1485#endif
1486      break;
1487
1488   case TGSI_FILE_TEMPORARY:
1489      index = reg->Register.Index;
1490      assert( index < TGSI_EXEC_NUM_TEMPS );
1491      dst = &mach->Temps[offset + index].xyzw[chan_index];
1492      break;
1493
1494   case TGSI_FILE_TEMPORARY_ARRAY:
1495      index = reg->Register.Index;
1496      assert( index < TGSI_EXEC_NUM_TEMPS );
1497      assert( index2D.i[0] < TGSI_EXEC_NUM_TEMP_ARRAYS );
1498      /* XXX we use index2D.i[0] here but somehow we might
1499       * end up with someone trying to store indirectly in
1500       * different buffers */
1501      dst = &mach->TempArray[index2D.i[0]][offset + index].xyzw[chan_index];
1502      break;
1503
1504   case TGSI_FILE_ADDRESS:
1505      index = reg->Register.Index;
1506      dst = &mach->Addrs[index].xyzw[chan_index];
1507      break;
1508
1509   case TGSI_FILE_PREDICATE:
1510      index = reg->Register.Index;
1511      assert(index < TGSI_EXEC_NUM_PREDS);
1512      dst = &mach->Predicates[index].xyzw[chan_index];
1513      break;
1514
1515   default:
1516      assert( 0 );
1517      return;
1518   }
1519
1520   if (inst->Instruction.Predicate) {
1521      uint swizzle;
1522      union tgsi_exec_channel *pred;
1523
1524      switch (chan_index) {
1525      case CHAN_X:
1526         swizzle = inst->Predicate.SwizzleX;
1527         break;
1528      case CHAN_Y:
1529         swizzle = inst->Predicate.SwizzleY;
1530         break;
1531      case CHAN_Z:
1532         swizzle = inst->Predicate.SwizzleZ;
1533         break;
1534      case CHAN_W:
1535         swizzle = inst->Predicate.SwizzleW;
1536         break;
1537      default:
1538         assert(0);
1539         return;
1540      }
1541
1542      assert(inst->Predicate.Index == 0);
1543
1544      pred = &mach->Predicates[inst->Predicate.Index].xyzw[swizzle];
1545
1546      if (inst->Predicate.Negate) {
1547         for (i = 0; i < QUAD_SIZE; i++) {
1548            if (pred->u[i]) {
1549               execmask &= ~(1 << i);
1550            }
1551         }
1552      } else {
1553         for (i = 0; i < QUAD_SIZE; i++) {
1554            if (!pred->u[i]) {
1555               execmask &= ~(1 << i);
1556            }
1557         }
1558      }
1559   }
1560
1561   switch (inst->Instruction.Saturate) {
1562   case TGSI_SAT_NONE:
1563      for (i = 0; i < QUAD_SIZE; i++)
1564         if (execmask & (1 << i))
1565            dst->i[i] = chan->i[i];
1566      break;
1567
1568   case TGSI_SAT_ZERO_ONE:
1569      for (i = 0; i < QUAD_SIZE; i++)
1570         if (execmask & (1 << i)) {
1571            if (chan->f[i] < 0.0f)
1572               dst->f[i] = 0.0f;
1573            else if (chan->f[i] > 1.0f)
1574               dst->f[i] = 1.0f;
1575            else
1576               dst->i[i] = chan->i[i];
1577         }
1578      break;
1579
1580   case TGSI_SAT_MINUS_PLUS_ONE:
1581      for (i = 0; i < QUAD_SIZE; i++)
1582         if (execmask & (1 << i)) {
1583            if (chan->f[i] < -1.0f)
1584               dst->f[i] = -1.0f;
1585            else if (chan->f[i] > 1.0f)
1586               dst->f[i] = 1.0f;
1587            else
1588               dst->i[i] = chan->i[i];
1589         }
1590      break;
1591
1592   default:
1593      assert( 0 );
1594   }
1595}
1596
/**
 * Fetch channel CHAN of source operand INDEX of the current instruction
 * into VAL, treating the data as float.
 *
 * Expands to a fetch_source() call and relies on variables named
 * 'mach' and 'inst' being in scope at the point of use.
 */
#define FETCH(VAL,INDEX,CHAN)\
    fetch_source(mach, (VAL), &inst->Src[(INDEX)], (CHAN), TGSI_EXEC_DATA_FLOAT)

/**
 * Same as FETCH, but treats the data as integer (affects how the
 * operand's abs/negate modifiers are applied).
 */
#define IFETCH(VAL,INDEX,CHAN)\
    fetch_source(mach, (VAL), &inst->Src[(INDEX)], (CHAN), TGSI_EXEC_DATA_INT)
1602
1603
1604/**
1605 * Execute ARB-style KIL which is predicated by a src register.
1606 * Kill fragment if any of the four values is less than zero.
1607 */
1608static void
1609exec_kil(struct tgsi_exec_machine *mach,
1610         const struct tgsi_full_instruction *inst)
1611{
1612   uint uniquemask;
1613   uint chan_index;
1614   uint kilmask = 0; /* bit 0 = pixel 0, bit 1 = pixel 1, etc */
1615   union tgsi_exec_channel r[1];
1616
1617   /* This mask stores component bits that were already tested. */
1618   uniquemask = 0;
1619
1620   for (chan_index = 0; chan_index < 4; chan_index++)
1621   {
1622      uint swizzle;
1623      uint i;
1624
1625      /* unswizzle channel */
1626      swizzle = tgsi_util_get_full_src_register_swizzle (
1627                        &inst->Src[0],
1628                        chan_index);
1629
1630      /* check if the component has not been already tested */
1631      if (uniquemask & (1 << swizzle))
1632         continue;
1633      uniquemask |= 1 << swizzle;
1634
1635      FETCH(&r[0], 0, chan_index);
1636      for (i = 0; i < 4; i++)
1637         if (r[0].f[i] < 0.0f)
1638            kilmask |= 1 << i;
1639   }
1640
1641   mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] |= kilmask;
1642}
1643
1644/**
1645 * Execute NVIDIA-style KIL which is predicated by a condition code.
1646 * Kill fragment if the condition code is TRUE.
1647 */
1648static void
1649exec_kilp(struct tgsi_exec_machine *mach,
1650          const struct tgsi_full_instruction *inst)
1651{
1652   uint kilmask; /* bit 0 = pixel 0, bit 1 = pixel 1, etc */
1653
1654   /* "unconditional" kil */
1655   kilmask = mach->ExecMask;
1656   mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] |= kilmask;
1657}
1658
1659static void
1660emit_vertex(struct tgsi_exec_machine *mach)
1661{
1662   /* FIXME: check for exec mask correctly
1663   unsigned i;
1664   for (i = 0; i < QUAD_SIZE; ++i) {
1665         if ((mach->ExecMask & (1 << i)))
1666   */
1667   if (mach->ExecMask) {
1668      mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] += mach->NumOutputs;
1669      mach->Primitives[mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]]++;
1670   }
1671}
1672
1673static void
1674emit_primitive(struct tgsi_exec_machine *mach)
1675{
1676   unsigned *prim_count = &mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0];
1677   /* FIXME: check for exec mask correctly
1678   unsigned i;
1679   for (i = 0; i < QUAD_SIZE; ++i) {
1680         if ((mach->ExecMask & (1 << i)))
1681   */
1682   if (mach->ExecMask) {
1683      ++(*prim_count);
1684      debug_assert((*prim_count * mach->NumOutputs) < mach->MaxGeometryShaderOutputs);
1685      mach->Primitives[*prim_count] = 0;
1686   }
1687}
1688
1689static void
1690conditional_emit_primitive(struct tgsi_exec_machine *mach)
1691{
1692   if (TGSI_PROCESSOR_GEOMETRY == mach->Processor) {
1693      int emitted_verts =
1694         mach->Primitives[mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]];
1695      if (emitted_verts) {
1696         emit_primitive(mach);
1697      }
1698   }
1699}
1700
1701
1702/*
1703 * Fetch four texture samples using STR texture coordinates.
1704 */
1705static void
1706fetch_texel( struct tgsi_sampler *sampler,
1707             const union tgsi_exec_channel *s,
1708             const union tgsi_exec_channel *t,
1709             const union tgsi_exec_channel *p,
1710             const union tgsi_exec_channel *c0,
1711             enum tgsi_sampler_control control,
1712             union tgsi_exec_channel *r,
1713             union tgsi_exec_channel *g,
1714             union tgsi_exec_channel *b,
1715             union tgsi_exec_channel *a )
1716{
1717   uint j;
1718   float rgba[NUM_CHANNELS][QUAD_SIZE];
1719
1720   sampler->get_samples(sampler, s->f, t->f, p->f, c0->f, control, rgba);
1721
1722   for (j = 0; j < 4; j++) {
1723      r->f[j] = rgba[0][j];
1724      g->f[j] = rgba[1][j];
1725      b->f[j] = rgba[2][j];
1726      a->f[j] = rgba[3][j];
1727   }
1728}
1729
1730
/* Values for the 'modifier' argument of exec_tex(). */
#define TEX_MODIFIER_NONE         0  /* plain lookup, W is not fetched */
#define TEX_MODIFIER_PROJECTED    1  /* coordinates are divided by W */
#define TEX_MODIFIER_LOD_BIAS     2  /* W is passed as an LOD bias */
#define TEX_MODIFIER_EXPLICIT_LOD 3  /* W is passed as an explicit LOD */
1735
1736
1737static void
1738exec_tex(struct tgsi_exec_machine *mach,
1739         const struct tgsi_full_instruction *inst,
1740         uint modifier)
1741{
1742   const uint unit = inst->Src[1].Register.Index;
1743   union tgsi_exec_channel r[4];
1744   const union tgsi_exec_channel *lod = &ZeroVec;
1745   enum tgsi_sampler_control control;
1746   uint chan;
1747
1748   if (modifier != TEX_MODIFIER_NONE) {
1749      FETCH(&r[3], 0, CHAN_W);
1750      if (modifier != TEX_MODIFIER_PROJECTED) {
1751         lod = &r[3];
1752      }
1753   }
1754
1755   if (modifier == TEX_MODIFIER_EXPLICIT_LOD) {
1756      control = tgsi_sampler_lod_explicit;
1757   } else {
1758      control = tgsi_sampler_lod_bias;
1759   }
1760
1761   switch (inst->Texture.Texture) {
1762   case TGSI_TEXTURE_1D:
1763   case TGSI_TEXTURE_SHADOW1D:
1764      FETCH(&r[0], 0, CHAN_X);
1765
1766      if (modifier == TEX_MODIFIER_PROJECTED) {
1767         micro_div(&r[0], &r[0], &r[3]);
1768      }
1769
1770      fetch_texel(mach->Samplers[unit],
1771                  &r[0], &ZeroVec, &ZeroVec, lod,  /* S, T, P, LOD */
1772                  control,
1773                  &r[0], &r[1], &r[2], &r[3]);     /* R, G, B, A */
1774      break;
1775
1776   case TGSI_TEXTURE_2D:
1777   case TGSI_TEXTURE_RECT:
1778   case TGSI_TEXTURE_SHADOW2D:
1779   case TGSI_TEXTURE_SHADOWRECT:
1780      FETCH(&r[0], 0, CHAN_X);
1781      FETCH(&r[1], 0, CHAN_Y);
1782      FETCH(&r[2], 0, CHAN_Z);
1783
1784      if (modifier == TEX_MODIFIER_PROJECTED) {
1785         micro_div(&r[0], &r[0], &r[3]);
1786         micro_div(&r[1], &r[1], &r[3]);
1787         micro_div(&r[2], &r[2], &r[3]);
1788      }
1789
1790      fetch_texel(mach->Samplers[unit],
1791                  &r[0], &r[1], &r[2], lod,     /* S, T, P, LOD */
1792                  control,
1793                  &r[0], &r[1], &r[2], &r[3]);  /* outputs */
1794      break;
1795
1796   case TGSI_TEXTURE_1D_ARRAY:
1797   case TGSI_TEXTURE_SHADOW1D_ARRAY:
1798      FETCH(&r[0], 0, CHAN_X);
1799      FETCH(&r[1], 0, CHAN_Y);
1800
1801      if (modifier == TEX_MODIFIER_PROJECTED) {
1802         micro_div(&r[0], &r[0], &r[3]);
1803      }
1804
1805      fetch_texel(mach->Samplers[unit],
1806                  &r[0], &r[1], &r[2], lod,     /* S, T, P, LOD */
1807                  control,
1808                  &r[0], &r[1], &r[2], &r[3]);  /* outputs */
1809      break;
1810
1811   case TGSI_TEXTURE_2D_ARRAY:
1812   case TGSI_TEXTURE_SHADOW2D_ARRAY:
1813      FETCH(&r[0], 0, CHAN_X);
1814      FETCH(&r[1], 0, CHAN_Y);
1815      FETCH(&r[2], 0, CHAN_Z);
1816
1817      if (modifier == TEX_MODIFIER_PROJECTED) {
1818         micro_div(&r[0], &r[0], &r[3]);
1819         micro_div(&r[1], &r[1], &r[3]);
1820      }
1821
1822      fetch_texel(mach->Samplers[unit],
1823                  &r[0], &r[1], &r[2], lod,     /* S, T, P, LOD */
1824                  control,
1825                  &r[0], &r[1], &r[2], &r[3]);  /* outputs */
1826      break;
1827
1828   case TGSI_TEXTURE_3D:
1829   case TGSI_TEXTURE_CUBE:
1830      FETCH(&r[0], 0, CHAN_X);
1831      FETCH(&r[1], 0, CHAN_Y);
1832      FETCH(&r[2], 0, CHAN_Z);
1833
1834      if (modifier == TEX_MODIFIER_PROJECTED) {
1835         micro_div(&r[0], &r[0], &r[3]);
1836         micro_div(&r[1], &r[1], &r[3]);
1837         micro_div(&r[2], &r[2], &r[3]);
1838      }
1839
1840      fetch_texel(mach->Samplers[unit],
1841                  &r[0], &r[1], &r[2], lod,
1842                  control,
1843                  &r[0], &r[1], &r[2], &r[3]);
1844      break;
1845
1846   default:
1847      assert(0);
1848   }
1849
1850#if 0
1851   debug_printf("fetch r: %g %g %g %g\n",
1852         r[0].f[0], r[0].f[1], r[0].f[2], r[0].f[3]);
1853   debug_printf("fetch g: %g %g %g %g\n",
1854         r[1].f[0], r[1].f[1], r[1].f[2], r[1].f[3]);
1855   debug_printf("fetch b: %g %g %g %g\n",
1856         r[2].f[0], r[2].f[1], r[2].f[2], r[2].f[3]);
1857   debug_printf("fetch a: %g %g %g %g\n",
1858         r[3].f[0], r[3].f[1], r[3].f[2], r[3].f[3]);
1859#endif
1860
1861   for (chan = 0; chan < NUM_CHANNELS; chan++) {
1862      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
1863         store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
1864      }
1865   }
1866}
1867
1868static void
1869exec_txd(struct tgsi_exec_machine *mach,
1870         const struct tgsi_full_instruction *inst)
1871{
1872   const uint unit = inst->Src[3].Register.Index;
1873   union tgsi_exec_channel r[4];
1874   uint chan;
1875
1876   /*
1877    * XXX: This is fake TXD -- the derivatives are not taken into account, yet.
1878    */
1879
1880   switch (inst->Texture.Texture) {
1881   case TGSI_TEXTURE_1D:
1882   case TGSI_TEXTURE_SHADOW1D:
1883
1884      FETCH(&r[0], 0, CHAN_X);
1885
1886      fetch_texel(mach->Samplers[unit],
1887                  &r[0], &ZeroVec, &ZeroVec, &ZeroVec,   /* S, T, P, BIAS */
1888                  tgsi_sampler_lod_bias,
1889                  &r[0], &r[1], &r[2], &r[3]);           /* R, G, B, A */
1890      break;
1891
1892   case TGSI_TEXTURE_1D_ARRAY:
1893   case TGSI_TEXTURE_2D:
1894   case TGSI_TEXTURE_RECT:
1895   case TGSI_TEXTURE_SHADOW1D_ARRAY:
1896   case TGSI_TEXTURE_SHADOW2D:
1897   case TGSI_TEXTURE_SHADOWRECT:
1898
1899      FETCH(&r[0], 0, CHAN_X);
1900      FETCH(&r[1], 0, CHAN_Y);
1901      FETCH(&r[2], 0, CHAN_Z);
1902
1903      fetch_texel(mach->Samplers[unit],
1904                  &r[0], &r[1], &r[2], &ZeroVec,   /* inputs */
1905                  tgsi_sampler_lod_bias,
1906                  &r[0], &r[1], &r[2], &r[3]);     /* outputs */
1907      break;
1908
1909   case TGSI_TEXTURE_2D_ARRAY:
1910   case TGSI_TEXTURE_3D:
1911   case TGSI_TEXTURE_CUBE:
1912
1913      FETCH(&r[0], 0, CHAN_X);
1914      FETCH(&r[1], 0, CHAN_Y);
1915      FETCH(&r[2], 0, CHAN_Z);
1916
1917      fetch_texel(mach->Samplers[unit],
1918                  &r[0], &r[1], &r[2], &ZeroVec,
1919                  tgsi_sampler_lod_bias,
1920                  &r[0], &r[1], &r[2], &r[3]);
1921      break;
1922
1923   case TGSI_TEXTURE_SHADOW2D_ARRAY:
1924
1925      FETCH(&r[0], 0, CHAN_X);
1926      FETCH(&r[1], 0, CHAN_Y);
1927      FETCH(&r[2], 0, CHAN_Z);
1928      FETCH(&r[3], 0, CHAN_W);
1929
1930      fetch_texel(mach->Samplers[unit],
1931                  &r[0], &r[1], &r[2], &r[3],
1932                  tgsi_sampler_lod_bias,
1933                  &r[0], &r[1], &r[2], &r[3]);
1934      break;
1935
1936   default:
1937      assert(0);
1938   }
1939
1940   for (chan = 0; chan < NUM_CHANNELS; chan++) {
1941      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
1942         store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
1943      }
1944   }
1945}
1946
1947
1948static void
1949exec_txf(struct tgsi_exec_machine *mach,
1950	 const struct tgsi_full_instruction *inst)
1951{
1952   struct tgsi_sampler *sampler;
1953   const uint unit = inst->Src[2].Register.Index;
1954   union tgsi_exec_channel r[4];
1955   union tgsi_exec_channel offset[3];
1956   uint chan;
1957   float rgba[NUM_CHANNELS][QUAD_SIZE];
1958   int j;
1959   int8_t offsets[3];
1960
1961   if (inst->Texture.NumOffsets == 1) {
1962      union tgsi_exec_channel index;
1963      index.i[0] = index.i[1] = index.i[2] = index.i[3] = inst->TexOffsets[0].Index;
1964      fetch_src_file_channel(mach, inst->TexOffsets[0].File,
1965                             inst->TexOffsets[0].SwizzleX, &index, &ZeroVec, &offset[0]);
1966      fetch_src_file_channel(mach, inst->TexOffsets[0].File,
1967                             inst->TexOffsets[0].SwizzleY, &index, &ZeroVec, &offset[1]);
1968      fetch_src_file_channel(mach, inst->TexOffsets[0].File,
1969                             inst->TexOffsets[0].SwizzleZ, &index, &ZeroVec, &offset[2]);
1970     offsets[0] = offset[0].i[0];
1971     offsets[1] = offset[1].i[0];
1972     offsets[2] = offset[2].i[0];
1973   } else
1974     offsets[0] = offsets[1] = offsets[2] = 0;
1975
1976   IFETCH(&r[3], 0, CHAN_W);
1977
1978   switch(inst->Texture.Texture) {
1979   case TGSI_TEXTURE_3D:
1980   case TGSI_TEXTURE_2D_ARRAY:
1981   case TGSI_TEXTURE_SHADOW2D_ARRAY:
1982      IFETCH(&r[2], 0, CHAN_Z);
1983      /* fallthrough */
1984   case TGSI_TEXTURE_2D:
1985   case TGSI_TEXTURE_RECT:
1986   case TGSI_TEXTURE_SHADOW1D_ARRAY:
1987   case TGSI_TEXTURE_SHADOW2D:
1988   case TGSI_TEXTURE_SHADOWRECT:
1989   case TGSI_TEXTURE_1D_ARRAY:
1990      IFETCH(&r[1], 0, CHAN_Y);
1991      /* fallthrough */
1992   case TGSI_TEXTURE_1D:
1993   case TGSI_TEXTURE_SHADOW1D:
1994      IFETCH(&r[0], 0, CHAN_X);
1995      break;
1996   default:
1997      assert(0);
1998      break;
1999   }
2000
2001   sampler = mach->Samplers[unit];
2002   sampler->get_texel(sampler, r[0].i, r[1].i, r[2].i, r[3].i,
2003		      offsets, rgba);
2004
2005   for (j = 0; j < QUAD_SIZE; j++) {
2006      r[0].f[j] = rgba[0][j];
2007      r[1].f[j] = rgba[1][j];
2008      r[2].f[j] = rgba[2][j];
2009      r[3].f[j] = rgba[3][j];
2010   }
2011
2012   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2013      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2014         store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2015      }
2016   }
2017}
2018
2019static void
2020exec_txq(struct tgsi_exec_machine *mach,
2021         const struct tgsi_full_instruction *inst)
2022{
2023   struct tgsi_sampler *sampler;
2024   const uint unit = inst->Src[1].Register.Index;
2025   int result[4];
2026   union tgsi_exec_channel r[4], src;
2027   uint chan;
2028   int i,j;
2029
2030   fetch_source(mach, &src, &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_INT);
2031   sampler = mach->Samplers[unit];
2032
2033   sampler->get_dims(sampler, src.i[0], result);
2034
2035   for (i = 0; i < QUAD_SIZE; i++) {
2036      for (j = 0; j < 4; j++) {
2037	 r[j].i[i] = result[j];
2038      }
2039   }
2040
2041   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2042      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2043	 store_dest(mach, &r[chan], &inst->Dst[0], inst, chan,
2044		    TGSI_EXEC_DATA_INT);
2045      }
2046   }
2047}
2048
2049static void
2050exec_sample(struct tgsi_exec_machine *mach,
2051            const struct tgsi_full_instruction *inst,
2052            uint modifier)
2053{
2054   const uint resource_unit = inst->Src[1].Register.Index;
2055   const uint sampler_unit = inst->Src[2].Register.Index;
2056   union tgsi_exec_channel r[4];
2057   const union tgsi_exec_channel *lod = &ZeroVec;
2058   enum tgsi_sampler_control control;
2059   uint chan;
2060
2061   if (modifier != TEX_MODIFIER_NONE) {
2062      if (modifier == TEX_MODIFIER_LOD_BIAS)
2063         FETCH(&r[3], 3, CHAN_X);
2064      else /*TEX_MODIFIER_LOD*/
2065         FETCH(&r[3], 0, CHAN_W);
2066
2067      if (modifier != TEX_MODIFIER_PROJECTED) {
2068         lod = &r[3];
2069      }
2070   }
2071
2072   if (modifier == TEX_MODIFIER_EXPLICIT_LOD) {
2073      control = tgsi_sampler_lod_explicit;
2074   } else {
2075      control = tgsi_sampler_lod_bias;
2076   }
2077
2078   switch (mach->Resources[resource_unit].Resource) {
2079   case TGSI_TEXTURE_1D:
2080   case TGSI_TEXTURE_SHADOW1D:
2081      FETCH(&r[0], 0, CHAN_X);
2082
2083      if (modifier == TEX_MODIFIER_PROJECTED) {
2084         micro_div(&r[0], &r[0], &r[3]);
2085      }
2086
2087      fetch_texel(mach->Samplers[sampler_unit],
2088                  &r[0], &ZeroVec, &ZeroVec, lod,  /* S, T, P, LOD */
2089                  control,
2090                  &r[0], &r[1], &r[2], &r[3]);     /* R, G, B, A */
2091      break;
2092
2093   case TGSI_TEXTURE_1D_ARRAY:
2094   case TGSI_TEXTURE_2D:
2095   case TGSI_TEXTURE_RECT:
2096   case TGSI_TEXTURE_SHADOW1D_ARRAY:
2097   case TGSI_TEXTURE_SHADOW2D:
2098   case TGSI_TEXTURE_SHADOWRECT:
2099      FETCH(&r[0], 0, CHAN_X);
2100      FETCH(&r[1], 0, CHAN_Y);
2101      FETCH(&r[2], 0, CHAN_Z);
2102
2103      if (modifier == TEX_MODIFIER_PROJECTED) {
2104         micro_div(&r[0], &r[0], &r[3]);
2105         micro_div(&r[1], &r[1], &r[3]);
2106         micro_div(&r[2], &r[2], &r[3]);
2107      }
2108
2109      fetch_texel(mach->Samplers[sampler_unit],
2110                  &r[0], &r[1], &r[2], lod,     /* S, T, P, LOD */
2111                  control,
2112                  &r[0], &r[1], &r[2], &r[3]);  /* outputs */
2113      break;
2114
2115   case TGSI_TEXTURE_2D_ARRAY:
2116   case TGSI_TEXTURE_3D:
2117   case TGSI_TEXTURE_CUBE:
2118      FETCH(&r[0], 0, CHAN_X);
2119      FETCH(&r[1], 0, CHAN_Y);
2120      FETCH(&r[2], 0, CHAN_Z);
2121
2122      if (modifier == TEX_MODIFIER_PROJECTED) {
2123         micro_div(&r[0], &r[0], &r[3]);
2124         micro_div(&r[1], &r[1], &r[3]);
2125         micro_div(&r[2], &r[2], &r[3]);
2126      }
2127
2128      fetch_texel(mach->Samplers[sampler_unit],
2129                  &r[0], &r[1], &r[2], lod,
2130                  control,
2131                  &r[0], &r[1], &r[2], &r[3]);
2132      break;
2133
2134   case TGSI_TEXTURE_SHADOW2D_ARRAY:
2135      FETCH(&r[0], 0, CHAN_X);
2136      FETCH(&r[1], 0, CHAN_Y);
2137      FETCH(&r[2], 0, CHAN_Z);
2138      FETCH(&r[3], 0, CHAN_W);
2139
2140      assert(modifier != TEX_MODIFIER_PROJECTED);
2141
2142      fetch_texel(mach->Samplers[sampler_unit],
2143                  &r[0], &r[1], &r[2], &r[3],
2144                  control,
2145                  &r[0], &r[1], &r[2], &r[3]);
2146      break;
2147
2148   default:
2149      assert(0);
2150   }
2151
2152   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2153      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2154         store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2155      }
2156   }
2157}
2158
2159static void
2160exec_sample_d(struct tgsi_exec_machine *mach,
2161              const struct tgsi_full_instruction *inst)
2162{
2163   const uint resource_unit = inst->Src[1].Register.Index;
2164   const uint sampler_unit = inst->Src[2].Register.Index;
2165   union tgsi_exec_channel r[4];
2166   uint chan;
2167   /*
2168    * XXX: This is fake SAMPLE_D -- the derivatives are not taken into account, yet.
2169    */
2170
2171   switch (mach->Resources[resource_unit].Resource) {
2172   case TGSI_TEXTURE_1D:
2173   case TGSI_TEXTURE_SHADOW1D:
2174
2175      FETCH(&r[0], 0, CHAN_X);
2176
2177      fetch_texel(mach->Samplers[sampler_unit],
2178                  &r[0], &ZeroVec, &ZeroVec, &ZeroVec,   /* S, T, P, BIAS */
2179                  tgsi_sampler_lod_bias,
2180                  &r[0], &r[1], &r[2], &r[3]);           /* R, G, B, A */
2181      break;
2182
2183   case TGSI_TEXTURE_2D:
2184   case TGSI_TEXTURE_RECT:
2185   case TGSI_TEXTURE_SHADOW2D:
2186   case TGSI_TEXTURE_SHADOWRECT:
2187
2188      FETCH(&r[0], 0, CHAN_X);
2189      FETCH(&r[1], 0, CHAN_Y);
2190      FETCH(&r[2], 0, CHAN_Z);
2191
2192      fetch_texel(mach->Samplers[sampler_unit],
2193                  &r[0], &r[1], &r[2], &ZeroVec,   /* inputs */
2194                  tgsi_sampler_lod_bias,
2195                  &r[0], &r[1], &r[2], &r[3]);     /* outputs */
2196      break;
2197
2198   case TGSI_TEXTURE_3D:
2199   case TGSI_TEXTURE_CUBE:
2200
2201      FETCH(&r[0], 0, CHAN_X);
2202      FETCH(&r[1], 0, CHAN_Y);
2203      FETCH(&r[2], 0, CHAN_Z);
2204
2205      fetch_texel(mach->Samplers[sampler_unit],
2206                  &r[0], &r[1], &r[2], &ZeroVec,
2207                  tgsi_sampler_lod_bias,
2208                  &r[0], &r[1], &r[2], &r[3]);
2209      break;
2210
2211   default:
2212      assert(0);
2213   }
2214
2215   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2216      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2217         store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2218      }
2219   }
2220}
2221
2222
2223/**
2224 * Evaluate a constant-valued coefficient at the position of the
2225 * current quad.
2226 */
2227static void
2228eval_constant_coef(
2229   struct tgsi_exec_machine *mach,
2230   unsigned attrib,
2231   unsigned chan )
2232{
2233   unsigned i;
2234
2235   for( i = 0; i < QUAD_SIZE; i++ ) {
2236      mach->Inputs[attrib].xyzw[chan].f[i] = mach->InterpCoefs[attrib].a0[chan];
2237   }
2238}
2239
2240/**
2241 * Evaluate a linear-valued coefficient at the position of the
2242 * current quad.
2243 */
2244static void
2245eval_linear_coef(
2246   struct tgsi_exec_machine *mach,
2247   unsigned attrib,
2248   unsigned chan )
2249{
2250   const float x = mach->QuadPos.xyzw[0].f[0];
2251   const float y = mach->QuadPos.xyzw[1].f[0];
2252   const float dadx = mach->InterpCoefs[attrib].dadx[chan];
2253   const float dady = mach->InterpCoefs[attrib].dady[chan];
2254   const float a0 = mach->InterpCoefs[attrib].a0[chan] + dadx * x + dady * y;
2255   mach->Inputs[attrib].xyzw[chan].f[0] = a0;
2256   mach->Inputs[attrib].xyzw[chan].f[1] = a0 + dadx;
2257   mach->Inputs[attrib].xyzw[chan].f[2] = a0 + dady;
2258   mach->Inputs[attrib].xyzw[chan].f[3] = a0 + dadx + dady;
2259}
2260
2261/**
2262 * Evaluate a perspective-valued coefficient at the position of the
2263 * current quad.
2264 */
2265static void
2266eval_perspective_coef(
2267   struct tgsi_exec_machine *mach,
2268   unsigned attrib,
2269   unsigned chan )
2270{
2271   const float x = mach->QuadPos.xyzw[0].f[0];
2272   const float y = mach->QuadPos.xyzw[1].f[0];
2273   const float dadx = mach->InterpCoefs[attrib].dadx[chan];
2274   const float dady = mach->InterpCoefs[attrib].dady[chan];
2275   const float a0 = mach->InterpCoefs[attrib].a0[chan] + dadx * x + dady * y;
2276   const float *w = mach->QuadPos.xyzw[3].f;
2277   /* divide by W here */
2278   mach->Inputs[attrib].xyzw[chan].f[0] = a0 / w[0];
2279   mach->Inputs[attrib].xyzw[chan].f[1] = (a0 + dadx) / w[1];
2280   mach->Inputs[attrib].xyzw[chan].f[2] = (a0 + dady) / w[2];
2281   mach->Inputs[attrib].xyzw[chan].f[3] = (a0 + dadx + dady) / w[3];
2282}
2283
2284
2285typedef void (* eval_coef_func)(
2286   struct tgsi_exec_machine *mach,
2287   unsigned attrib,
2288   unsigned chan );
2289
2290static void
2291exec_declaration(struct tgsi_exec_machine *mach,
2292                 const struct tgsi_full_declaration *decl)
2293{
2294   if (decl->Declaration.File == TGSI_FILE_RESOURCE) {
2295      mach->Resources[decl->Range.First] = decl->Resource;
2296      return;
2297   }
2298
2299   if (mach->Processor == TGSI_PROCESSOR_FRAGMENT) {
2300      if (decl->Declaration.File == TGSI_FILE_INPUT) {
2301         uint first, last, mask;
2302
2303         first = decl->Range.First;
2304         last = decl->Range.Last;
2305         mask = decl->Declaration.UsageMask;
2306
2307         /* XXX we could remove this special-case code since
2308          * mach->InterpCoefs[first].a0 should already have the
2309          * front/back-face value.  But we should first update the
2310          * ureg code to emit the right UsageMask value (WRITEMASK_X).
2311          * Then, we could remove the tgsi_exec_machine::Face field.
2312          */
2313         /* XXX make FACE a system value */
2314         if (decl->Semantic.Name == TGSI_SEMANTIC_FACE) {
2315            uint i;
2316
2317            assert(decl->Semantic.Index == 0);
2318            assert(first == last);
2319
2320            for (i = 0; i < QUAD_SIZE; i++) {
2321               mach->Inputs[first].xyzw[0].f[i] = mach->Face;
2322            }
2323         } else {
2324            eval_coef_func eval;
2325            uint i, j;
2326
2327            switch (decl->Declaration.Interpolate) {
2328            case TGSI_INTERPOLATE_CONSTANT:
2329               eval = eval_constant_coef;
2330               break;
2331
2332            case TGSI_INTERPOLATE_LINEAR:
2333               eval = eval_linear_coef;
2334               break;
2335
2336            case TGSI_INTERPOLATE_PERSPECTIVE:
2337               eval = eval_perspective_coef;
2338               break;
2339
2340            default:
2341               assert(0);
2342               return;
2343            }
2344
2345            for (j = 0; j < NUM_CHANNELS; j++) {
2346               if (mask & (1 << j)) {
2347                  for (i = first; i <= last; i++) {
2348                     eval(mach, i, j);
2349                  }
2350               }
2351            }
2352         }
2353      }
2354   }
2355
2356   if (decl->Declaration.File == TGSI_FILE_SYSTEM_VALUE) {
2357      mach->SysSemanticToIndex[decl->Declaration.Semantic] = decl->Range.First;
2358   }
2359}
2360
2361
2362typedef void (* micro_op)(union tgsi_exec_channel *dst);
2363
2364static void
2365exec_vector(struct tgsi_exec_machine *mach,
2366            const struct tgsi_full_instruction *inst,
2367            micro_op op,
2368            enum tgsi_exec_datatype dst_datatype)
2369{
2370   unsigned int chan;
2371
2372   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2373      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2374         union tgsi_exec_channel dst;
2375
2376         op(&dst);
2377         store_dest(mach, &dst, &inst->Dst[0], inst, chan, dst_datatype);
2378      }
2379   }
2380}
2381
2382typedef void (* micro_unary_op)(union tgsi_exec_channel *dst,
2383                                const union tgsi_exec_channel *src);
2384
2385static void
2386exec_scalar_unary(struct tgsi_exec_machine *mach,
2387                  const struct tgsi_full_instruction *inst,
2388                  micro_unary_op op,
2389                  enum tgsi_exec_datatype dst_datatype,
2390                  enum tgsi_exec_datatype src_datatype)
2391{
2392   unsigned int chan;
2393   union tgsi_exec_channel src;
2394   union tgsi_exec_channel dst;
2395
2396   fetch_source(mach, &src, &inst->Src[0], CHAN_X, src_datatype);
2397   op(&dst, &src);
2398   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2399      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2400         store_dest(mach, &dst, &inst->Dst[0], inst, chan, dst_datatype);
2401      }
2402   }
2403}
2404
2405static void
2406exec_vector_unary(struct tgsi_exec_machine *mach,
2407                  const struct tgsi_full_instruction *inst,
2408                  micro_unary_op op,
2409                  enum tgsi_exec_datatype dst_datatype,
2410                  enum tgsi_exec_datatype src_datatype)
2411{
2412   unsigned int chan;
2413   struct tgsi_exec_vector dst;
2414
2415   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2416      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2417         union tgsi_exec_channel src;
2418
2419         fetch_source(mach, &src, &inst->Src[0], chan, src_datatype);
2420         op(&dst.xyzw[chan], &src);
2421      }
2422   }
2423   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2424      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2425         store_dest(mach, &dst.xyzw[chan], &inst->Dst[0], inst, chan, dst_datatype);
2426      }
2427   }
2428}
2429
2430typedef void (* micro_binary_op)(union tgsi_exec_channel *dst,
2431                                 const union tgsi_exec_channel *src0,
2432                                 const union tgsi_exec_channel *src1);
2433
2434static void
2435exec_scalar_binary(struct tgsi_exec_machine *mach,
2436                   const struct tgsi_full_instruction *inst,
2437                   micro_binary_op op,
2438                   enum tgsi_exec_datatype dst_datatype,
2439                   enum tgsi_exec_datatype src_datatype)
2440{
2441   unsigned int chan;
2442   union tgsi_exec_channel src[2];
2443   union tgsi_exec_channel dst;
2444
2445   fetch_source(mach, &src[0], &inst->Src[0], CHAN_X, src_datatype);
2446   fetch_source(mach, &src[1], &inst->Src[1], CHAN_Y, src_datatype);
2447   op(&dst, &src[0], &src[1]);
2448   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2449      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2450         store_dest(mach, &dst, &inst->Dst[0], inst, chan, dst_datatype);
2451      }
2452   }
2453}
2454
2455static void
2456exec_vector_binary(struct tgsi_exec_machine *mach,
2457                   const struct tgsi_full_instruction *inst,
2458                   micro_binary_op op,
2459                   enum tgsi_exec_datatype dst_datatype,
2460                   enum tgsi_exec_datatype src_datatype)
2461{
2462   unsigned int chan;
2463   struct tgsi_exec_vector dst;
2464
2465   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2466      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2467         union tgsi_exec_channel src[2];
2468
2469         fetch_source(mach, &src[0], &inst->Src[0], chan, src_datatype);
2470         fetch_source(mach, &src[1], &inst->Src[1], chan, src_datatype);
2471         op(&dst.xyzw[chan], &src[0], &src[1]);
2472      }
2473   }
2474   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2475      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2476         store_dest(mach, &dst.xyzw[chan], &inst->Dst[0], inst, chan, dst_datatype);
2477      }
2478   }
2479}
2480
2481typedef void (* micro_trinary_op)(union tgsi_exec_channel *dst,
2482                                  const union tgsi_exec_channel *src0,
2483                                  const union tgsi_exec_channel *src1,
2484                                  const union tgsi_exec_channel *src2);
2485
2486static void
2487exec_vector_trinary(struct tgsi_exec_machine *mach,
2488                    const struct tgsi_full_instruction *inst,
2489                    micro_trinary_op op,
2490                    enum tgsi_exec_datatype dst_datatype,
2491                    enum tgsi_exec_datatype src_datatype)
2492{
2493   unsigned int chan;
2494   struct tgsi_exec_vector dst;
2495
2496   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2497      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2498         union tgsi_exec_channel src[3];
2499
2500         fetch_source(mach, &src[0], &inst->Src[0], chan, src_datatype);
2501         fetch_source(mach, &src[1], &inst->Src[1], chan, src_datatype);
2502         fetch_source(mach, &src[2], &inst->Src[2], chan, src_datatype);
2503         op(&dst.xyzw[chan], &src[0], &src[1], &src[2]);
2504      }
2505   }
2506   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2507      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2508         store_dest(mach, &dst.xyzw[chan], &inst->Dst[0], inst, chan, dst_datatype);
2509      }
2510   }
2511}
2512
2513static void
2514exec_dp3(struct tgsi_exec_machine *mach,
2515         const struct tgsi_full_instruction *inst)
2516{
2517   unsigned int chan;
2518   union tgsi_exec_channel arg[3];
2519
2520   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2521   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2522   micro_mul(&arg[2], &arg[0], &arg[1]);
2523
2524   for (chan = CHAN_Y; chan <= CHAN_Z; chan++) {
2525      fetch_source(mach, &arg[0], &inst->Src[0], chan, TGSI_EXEC_DATA_FLOAT);
2526      fetch_source(mach, &arg[1], &inst->Src[1], chan, TGSI_EXEC_DATA_FLOAT);
2527      micro_mad(&arg[2], &arg[0], &arg[1], &arg[2]);
2528   }
2529
2530   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2531      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2532         store_dest(mach, &arg[2], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2533      }
2534   }
2535}
2536
2537static void
2538exec_dp4(struct tgsi_exec_machine *mach,
2539         const struct tgsi_full_instruction *inst)
2540{
2541   unsigned int chan;
2542   union tgsi_exec_channel arg[3];
2543
2544   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2545   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2546   micro_mul(&arg[2], &arg[0], &arg[1]);
2547
2548   for (chan = CHAN_Y; chan <= CHAN_W; chan++) {
2549      fetch_source(mach, &arg[0], &inst->Src[0], chan, TGSI_EXEC_DATA_FLOAT);
2550      fetch_source(mach, &arg[1], &inst->Src[1], chan, TGSI_EXEC_DATA_FLOAT);
2551      micro_mad(&arg[2], &arg[0], &arg[1], &arg[2]);
2552   }
2553
2554   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2555      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2556         store_dest(mach, &arg[2], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2557      }
2558   }
2559}
2560
2561static void
2562exec_dp2a(struct tgsi_exec_machine *mach,
2563          const struct tgsi_full_instruction *inst)
2564{
2565   unsigned int chan;
2566   union tgsi_exec_channel arg[3];
2567
2568   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2569   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2570   micro_mul(&arg[2], &arg[0], &arg[1]);
2571
2572   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2573   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2574   micro_mad(&arg[0], &arg[0], &arg[1], &arg[2]);
2575
2576   fetch_source(mach, &arg[1], &inst->Src[2], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2577   micro_add(&arg[0], &arg[0], &arg[1]);
2578
2579   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2580      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2581         store_dest(mach, &arg[0], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2582      }
2583   }
2584}
2585
2586static void
2587exec_dph(struct tgsi_exec_machine *mach,
2588         const struct tgsi_full_instruction *inst)
2589{
2590   unsigned int chan;
2591   union tgsi_exec_channel arg[3];
2592
2593   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2594   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2595   micro_mul(&arg[2], &arg[0], &arg[1]);
2596
2597   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2598   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2599   micro_mad(&arg[2], &arg[0], &arg[1], &arg[2]);
2600
2601   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2602   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2603   micro_mad(&arg[0], &arg[0], &arg[1], &arg[2]);
2604
2605   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_W, TGSI_EXEC_DATA_FLOAT);
2606   micro_add(&arg[0], &arg[0], &arg[1]);
2607
2608   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2609      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2610         store_dest(mach, &arg[0], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2611      }
2612   }
2613}
2614
2615static void
2616exec_dp2(struct tgsi_exec_machine *mach,
2617         const struct tgsi_full_instruction *inst)
2618{
2619   unsigned int chan;
2620   union tgsi_exec_channel arg[3];
2621
2622   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2623   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2624   micro_mul(&arg[2], &arg[0], &arg[1]);
2625
2626   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2627   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2628   micro_mad(&arg[2], &arg[0], &arg[1], &arg[2]);
2629
2630   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2631      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2632         store_dest(mach, &arg[2], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2633      }
2634   }
2635}
2636
2637static void
2638exec_nrm4(struct tgsi_exec_machine *mach,
2639          const struct tgsi_full_instruction *inst)
2640{
2641   unsigned int chan;
2642   union tgsi_exec_channel arg[4];
2643   union tgsi_exec_channel scale;
2644
2645   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2646   micro_mul(&scale, &arg[0], &arg[0]);
2647
2648   for (chan = CHAN_Y; chan <= CHAN_W; chan++) {
2649      union tgsi_exec_channel product;
2650
2651      fetch_source(mach, &arg[chan], &inst->Src[0], chan, TGSI_EXEC_DATA_FLOAT);
2652      micro_mul(&product, &arg[chan], &arg[chan]);
2653      micro_add(&scale, &scale, &product);
2654   }
2655
2656   micro_rsq(&scale, &scale);
2657
2658   for (chan = CHAN_X; chan <= CHAN_W; chan++) {
2659      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2660         micro_mul(&arg[chan], &arg[chan], &scale);
2661         store_dest(mach, &arg[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2662      }
2663   }
2664}
2665
2666static void
2667exec_nrm3(struct tgsi_exec_machine *mach,
2668          const struct tgsi_full_instruction *inst)
2669{
2670   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_XYZ) {
2671      unsigned int chan;
2672      union tgsi_exec_channel arg[3];
2673      union tgsi_exec_channel scale;
2674
2675      fetch_source(mach, &arg[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2676      micro_mul(&scale, &arg[0], &arg[0]);
2677
2678      for (chan = CHAN_Y; chan <= CHAN_Z; chan++) {
2679         union tgsi_exec_channel product;
2680
2681         fetch_source(mach, &arg[chan], &inst->Src[0], chan, TGSI_EXEC_DATA_FLOAT);
2682         micro_mul(&product, &arg[chan], &arg[chan]);
2683         micro_add(&scale, &scale, &product);
2684      }
2685
2686      micro_rsq(&scale, &scale);
2687
2688      for (chan = CHAN_X; chan <= CHAN_Z; chan++) {
2689         if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2690            micro_mul(&arg[chan], &arg[chan], &scale);
2691            store_dest(mach, &arg[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2692         }
2693      }
2694   }
2695
2696   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
2697      store_dest(mach, &OneVec, &inst->Dst[0], inst, CHAN_W, TGSI_EXEC_DATA_FLOAT);
2698   }
2699}
2700
2701static void
2702exec_scs(struct tgsi_exec_machine *mach,
2703         const struct tgsi_full_instruction *inst)
2704{
2705   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_XY) {
2706      union tgsi_exec_channel arg;
2707      union tgsi_exec_channel result;
2708
2709      fetch_source(mach, &arg, &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2710
2711      if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
2712         micro_cos(&result, &arg);
2713         store_dest(mach, &result, &inst->Dst[0], inst, CHAN_X, TGSI_EXEC_DATA_FLOAT);
2714      }
2715      if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
2716         micro_sin(&result, &arg);
2717         store_dest(mach, &result, &inst->Dst[0], inst, CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2718      }
2719   }
2720   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
2721      store_dest(mach, &ZeroVec, &inst->Dst[0], inst, CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2722   }
2723   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
2724      store_dest(mach, &OneVec, &inst->Dst[0], inst, CHAN_W, TGSI_EXEC_DATA_FLOAT);
2725   }
2726}
2727
2728static void
2729exec_x2d(struct tgsi_exec_machine *mach,
2730         const struct tgsi_full_instruction *inst)
2731{
2732   union tgsi_exec_channel r[4];
2733   union tgsi_exec_channel d[2];
2734
2735   fetch_source(mach, &r[0], &inst->Src[1], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2736   fetch_source(mach, &r[1], &inst->Src[1], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2737   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_XZ) {
2738      fetch_source(mach, &r[2], &inst->Src[2], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2739      micro_mul(&r[2], &r[2], &r[0]);
2740      fetch_source(mach, &r[3], &inst->Src[2], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2741      micro_mul(&r[3], &r[3], &r[1]);
2742      micro_add(&r[2], &r[2], &r[3]);
2743      fetch_source(mach, &r[3], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2744      micro_add(&d[0], &r[2], &r[3]);
2745   }
2746   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_YW) {
2747      fetch_source(mach, &r[2], &inst->Src[2], CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2748      micro_mul(&r[2], &r[2], &r[0]);
2749      fetch_source(mach, &r[3], &inst->Src[2], CHAN_W, TGSI_EXEC_DATA_FLOAT);
2750      micro_mul(&r[3], &r[3], &r[1]);
2751      micro_add(&r[2], &r[2], &r[3]);
2752      fetch_source(mach, &r[3], &inst->Src[0], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2753      micro_add(&d[1], &r[2], &r[3]);
2754   }
2755   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
2756      store_dest(mach, &d[0], &inst->Dst[0], inst, CHAN_X, TGSI_EXEC_DATA_FLOAT);
2757   }
2758   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
2759      store_dest(mach, &d[1], &inst->Dst[0], inst, CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2760   }
2761   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
2762      store_dest(mach, &d[0], &inst->Dst[0], inst, CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2763   }
2764   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
2765      store_dest(mach, &d[1], &inst->Dst[0], inst, CHAN_W, TGSI_EXEC_DATA_FLOAT);
2766   }
2767}
2768
2769static void
2770exec_rfl(struct tgsi_exec_machine *mach,
2771         const struct tgsi_full_instruction *inst)
2772{
2773   union tgsi_exec_channel r[9];
2774
2775   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_XYZ) {
2776      /* r0 = dp3(src0, src0) */
2777      fetch_source(mach, &r[2], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2778      micro_mul(&r[0], &r[2], &r[2]);
2779      fetch_source(mach, &r[4], &inst->Src[0], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2780      micro_mul(&r[8], &r[4], &r[4]);
2781      micro_add(&r[0], &r[0], &r[8]);
2782      fetch_source(mach, &r[6], &inst->Src[0], CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2783      micro_mul(&r[8], &r[6], &r[6]);
2784      micro_add(&r[0], &r[0], &r[8]);
2785
2786      /* r1 = dp3(src0, src1) */
2787      fetch_source(mach, &r[3], &inst->Src[1], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2788      micro_mul(&r[1], &r[2], &r[3]);
2789      fetch_source(mach, &r[5], &inst->Src[1], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2790      micro_mul(&r[8], &r[4], &r[5]);
2791      micro_add(&r[1], &r[1], &r[8]);
2792      fetch_source(mach, &r[7], &inst->Src[1], CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2793      micro_mul(&r[8], &r[6], &r[7]);
2794      micro_add(&r[1], &r[1], &r[8]);
2795
2796      /* r1 = 2 * r1 / r0 */
2797      micro_add(&r[1], &r[1], &r[1]);
2798      micro_div(&r[1], &r[1], &r[0]);
2799
2800      if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
2801         micro_mul(&r[2], &r[2], &r[1]);
2802         micro_sub(&r[2], &r[2], &r[3]);
2803         store_dest(mach, &r[2], &inst->Dst[0], inst, CHAN_X, TGSI_EXEC_DATA_FLOAT);
2804      }
2805      if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
2806         micro_mul(&r[4], &r[4], &r[1]);
2807         micro_sub(&r[4], &r[4], &r[5]);
2808         store_dest(mach, &r[4], &inst->Dst[0], inst, CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2809      }
2810      if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
2811         micro_mul(&r[6], &r[6], &r[1]);
2812         micro_sub(&r[6], &r[6], &r[7]);
2813         store_dest(mach, &r[6], &inst->Dst[0], inst, CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2814      }
2815   }
2816   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
2817      store_dest(mach, &OneVec, &inst->Dst[0], inst, CHAN_W, TGSI_EXEC_DATA_FLOAT);
2818   }
2819}
2820
2821static void
2822exec_xpd(struct tgsi_exec_machine *mach,
2823         const struct tgsi_full_instruction *inst)
2824{
2825   union tgsi_exec_channel r[6];
2826   union tgsi_exec_channel d[3];
2827
2828   fetch_source(mach, &r[0], &inst->Src[0], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2829   fetch_source(mach, &r[1], &inst->Src[1], CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2830
2831   micro_mul(&r[2], &r[0], &r[1]);
2832
2833   fetch_source(mach, &r[3], &inst->Src[0], CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2834   fetch_source(mach, &r[4], &inst->Src[1], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2835
2836   micro_mul(&r[5], &r[3], &r[4] );
2837   micro_sub(&d[CHAN_X], &r[2], &r[5]);
2838
2839   fetch_source(mach, &r[2], &inst->Src[1], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2840
2841   micro_mul(&r[3], &r[3], &r[2]);
2842
2843   fetch_source(mach, &r[5], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2844
2845   micro_mul(&r[1], &r[1], &r[5]);
2846   micro_sub(&d[CHAN_Y], &r[3], &r[1]);
2847
2848   micro_mul(&r[5], &r[5], &r[4]);
2849   micro_mul(&r[0], &r[0], &r[2]);
2850   micro_sub(&d[CHAN_Z], &r[5], &r[0]);
2851
2852   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
2853      store_dest(mach, &d[CHAN_X], &inst->Dst[0], inst, CHAN_X, TGSI_EXEC_DATA_FLOAT);
2854   }
2855   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
2856      store_dest(mach, &d[CHAN_Y], &inst->Dst[0], inst, CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2857   }
2858   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
2859      store_dest(mach, &d[CHAN_Z], &inst->Dst[0], inst, CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2860   }
2861   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
2862      store_dest(mach, &OneVec, &inst->Dst[0], inst, CHAN_W, TGSI_EXEC_DATA_FLOAT);
2863   }
2864}
2865
2866static void
2867exec_dst(struct tgsi_exec_machine *mach,
2868         const struct tgsi_full_instruction *inst)
2869{
2870   union tgsi_exec_channel r[2];
2871   union tgsi_exec_channel d[4];
2872
2873   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
2874      fetch_source(mach, &r[0], &inst->Src[0], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2875      fetch_source(mach, &r[1], &inst->Src[1], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2876      micro_mul(&d[CHAN_Y], &r[0], &r[1]);
2877   }
2878   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
2879      fetch_source(mach, &d[CHAN_Z], &inst->Src[0], CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2880   }
2881   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
2882      fetch_source(mach, &d[CHAN_W], &inst->Src[1], CHAN_W, TGSI_EXEC_DATA_FLOAT);
2883   }
2884
2885   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
2886      store_dest(mach, &OneVec, &inst->Dst[0], inst, CHAN_X, TGSI_EXEC_DATA_FLOAT);
2887   }
2888   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
2889      store_dest(mach, &d[CHAN_Y], &inst->Dst[0], inst, CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2890   }
2891   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
2892      store_dest(mach, &d[CHAN_Z], &inst->Dst[0], inst, CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2893   }
2894   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
2895      store_dest(mach, &d[CHAN_W], &inst->Dst[0], inst, CHAN_W, TGSI_EXEC_DATA_FLOAT);
2896   }
2897}
2898
2899static void
2900exec_log(struct tgsi_exec_machine *mach,
2901         const struct tgsi_full_instruction *inst)
2902{
2903   union tgsi_exec_channel r[3];
2904
2905   fetch_source(mach, &r[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2906   micro_abs(&r[2], &r[0]);  /* r2 = abs(r0) */
2907   micro_lg2(&r[1], &r[2]);  /* r1 = lg2(r2) */
2908   micro_flr(&r[0], &r[1]);  /* r0 = floor(r1) */
2909   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
2910      store_dest(mach, &r[0], &inst->Dst[0], inst, CHAN_X, TGSI_EXEC_DATA_FLOAT);
2911   }
2912   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
2913      micro_exp2(&r[0], &r[0]);       /* r0 = 2 ^ r0 */
2914      micro_div(&r[0], &r[2], &r[0]); /* r0 = r2 / r0 */
2915      store_dest(mach, &r[0], &inst->Dst[0], inst, CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2916   }
2917   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
2918      store_dest(mach, &r[1], &inst->Dst[0], inst, CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2919   }
2920   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
2921      store_dest(mach, &OneVec, &inst->Dst[0], inst, CHAN_W, TGSI_EXEC_DATA_FLOAT);
2922   }
2923}
2924
2925static void
2926exec_exp(struct tgsi_exec_machine *mach,
2927         const struct tgsi_full_instruction *inst)
2928{
2929   union tgsi_exec_channel r[3];
2930
2931   fetch_source(mach, &r[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2932   micro_flr(&r[1], &r[0]);  /* r1 = floor(r0) */
2933   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
2934      micro_exp2(&r[2], &r[1]);       /* r2 = 2 ^ r1 */
2935      store_dest(mach, &r[2], &inst->Dst[0], inst, CHAN_X, TGSI_EXEC_DATA_FLOAT);
2936   }
2937   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
2938      micro_sub(&r[2], &r[0], &r[1]); /* r2 = r0 - r1 */
2939      store_dest(mach, &r[2], &inst->Dst[0], inst, CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2940   }
2941   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
2942      micro_exp2(&r[2], &r[0]);       /* r2 = 2 ^ r0 */
2943      store_dest(mach, &r[2], &inst->Dst[0], inst, CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2944   }
2945   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
2946      store_dest(mach, &OneVec, &inst->Dst[0], inst, CHAN_W, TGSI_EXEC_DATA_FLOAT);
2947   }
2948}
2949
2950static void
2951exec_lit(struct tgsi_exec_machine *mach,
2952         const struct tgsi_full_instruction *inst)
2953{
2954   union tgsi_exec_channel r[3];
2955   union tgsi_exec_channel d[3];
2956
2957   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
2958      store_dest(mach, &OneVec, &inst->Dst[0], inst, CHAN_X, TGSI_EXEC_DATA_FLOAT);
2959   }
2960   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_YZ) {
2961      fetch_source(mach, &r[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2962      if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
2963         micro_max(&d[CHAN_Y], &r[0], &ZeroVec);
2964         store_dest(mach, &d[CHAN_Y], &inst->Dst[0], inst, CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2965      }
2966
2967      if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
2968         fetch_source(mach, &r[1], &inst->Src[0], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2969         micro_max(&r[1], &r[1], &ZeroVec);
2970
2971         fetch_source(mach, &r[2], &inst->Src[0], CHAN_W, TGSI_EXEC_DATA_FLOAT);
2972         micro_min(&r[2], &r[2], &P128Vec);
2973         micro_max(&r[2], &r[2], &M128Vec);
2974         micro_pow(&r[1], &r[1], &r[2]);
2975         micro_lt(&d[CHAN_Z], &ZeroVec, &r[0], &r[1], &ZeroVec);
2976         store_dest(mach, &d[CHAN_Z], &inst->Dst[0], inst, CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2977      }
2978   }
2979   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
2980      store_dest(mach, &OneVec, &inst->Dst[0], inst, CHAN_W, TGSI_EXEC_DATA_FLOAT);
2981   }
2982}
2983
2984static void
2985exec_break(struct tgsi_exec_machine *mach)
2986{
2987   if (mach->BreakType == TGSI_EXEC_BREAK_INSIDE_LOOP) {
2988      /* turn off loop channels for each enabled exec channel */
2989      mach->LoopMask &= ~mach->ExecMask;
2990      /* Todo: if mach->LoopMask == 0, jump to end of loop */
2991      UPDATE_EXEC_MASK(mach);
2992   } else {
2993      assert(mach->BreakType == TGSI_EXEC_BREAK_INSIDE_SWITCH);
2994
2995      mach->Switch.mask = 0x0;
2996
2997      UPDATE_EXEC_MASK(mach);
2998   }
2999}
3000
3001static void
3002exec_switch(struct tgsi_exec_machine *mach,
3003            const struct tgsi_full_instruction *inst)
3004{
3005   assert(mach->SwitchStackTop < TGSI_EXEC_MAX_SWITCH_NESTING);
3006   assert(mach->BreakStackTop < TGSI_EXEC_MAX_BREAK_STACK);
3007
3008   mach->SwitchStack[mach->SwitchStackTop++] = mach->Switch;
3009   fetch_source(mach, &mach->Switch.selector, &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_UINT);
3010   mach->Switch.mask = 0x0;
3011   mach->Switch.defaultMask = 0x0;
3012
3013   mach->BreakStack[mach->BreakStackTop++] = mach->BreakType;
3014   mach->BreakType = TGSI_EXEC_BREAK_INSIDE_SWITCH;
3015
3016   UPDATE_EXEC_MASK(mach);
3017}
3018
3019static void
3020exec_case(struct tgsi_exec_machine *mach,
3021          const struct tgsi_full_instruction *inst)
3022{
3023   uint prevMask = mach->SwitchStack[mach->SwitchStackTop - 1].mask;
3024   union tgsi_exec_channel src;
3025   uint mask = 0;
3026
3027   fetch_source(mach, &src, &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_UINT);
3028
3029   if (mach->Switch.selector.u[0] == src.u[0]) {
3030      mask |= 0x1;
3031   }
3032   if (mach->Switch.selector.u[1] == src.u[1]) {
3033      mask |= 0x2;
3034   }
3035   if (mach->Switch.selector.u[2] == src.u[2]) {
3036      mask |= 0x4;
3037   }
3038   if (mach->Switch.selector.u[3] == src.u[3]) {
3039      mask |= 0x8;
3040   }
3041
3042   mach->Switch.defaultMask |= mask;
3043
3044   mach->Switch.mask |= mask & prevMask;
3045
3046   UPDATE_EXEC_MASK(mach);
3047}
3048
3049static void
3050exec_default(struct tgsi_exec_machine *mach)
3051{
3052   uint prevMask = mach->SwitchStack[mach->SwitchStackTop - 1].mask;
3053
3054   mach->Switch.mask |= ~mach->Switch.defaultMask & prevMask;
3055
3056   UPDATE_EXEC_MASK(mach);
3057}
3058
3059static void
3060exec_endswitch(struct tgsi_exec_machine *mach)
3061{
3062   mach->Switch = mach->SwitchStack[--mach->SwitchStackTop];
3063   mach->BreakType = mach->BreakStack[--mach->BreakStackTop];
3064
3065   UPDATE_EXEC_MASK(mach);
3066}
3067
3068static void
3069micro_i2f(union tgsi_exec_channel *dst,
3070          const union tgsi_exec_channel *src)
3071{
3072   dst->f[0] = (float)src->i[0];
3073   dst->f[1] = (float)src->i[1];
3074   dst->f[2] = (float)src->i[2];
3075   dst->f[3] = (float)src->i[3];
3076}
3077
3078static void
3079micro_not(union tgsi_exec_channel *dst,
3080          const union tgsi_exec_channel *src)
3081{
3082   dst->u[0] = ~src->u[0];
3083   dst->u[1] = ~src->u[1];
3084   dst->u[2] = ~src->u[2];
3085   dst->u[3] = ~src->u[3];
3086}
3087
3088static void
3089micro_shl(union tgsi_exec_channel *dst,
3090          const union tgsi_exec_channel *src0,
3091          const union tgsi_exec_channel *src1)
3092{
3093   dst->u[0] = src0->u[0] << src1->u[0];
3094   dst->u[1] = src0->u[1] << src1->u[1];
3095   dst->u[2] = src0->u[2] << src1->u[2];
3096   dst->u[3] = src0->u[3] << src1->u[3];
3097}
3098
3099static void
3100micro_and(union tgsi_exec_channel *dst,
3101          const union tgsi_exec_channel *src0,
3102          const union tgsi_exec_channel *src1)
3103{
3104   dst->u[0] = src0->u[0] & src1->u[0];
3105   dst->u[1] = src0->u[1] & src1->u[1];
3106   dst->u[2] = src0->u[2] & src1->u[2];
3107   dst->u[3] = src0->u[3] & src1->u[3];
3108}
3109
3110static void
3111micro_or(union tgsi_exec_channel *dst,
3112         const union tgsi_exec_channel *src0,
3113         const union tgsi_exec_channel *src1)
3114{
3115   dst->u[0] = src0->u[0] | src1->u[0];
3116   dst->u[1] = src0->u[1] | src1->u[1];
3117   dst->u[2] = src0->u[2] | src1->u[2];
3118   dst->u[3] = src0->u[3] | src1->u[3];
3119}
3120
3121static void
3122micro_xor(union tgsi_exec_channel *dst,
3123          const union tgsi_exec_channel *src0,
3124          const union tgsi_exec_channel *src1)
3125{
3126   dst->u[0] = src0->u[0] ^ src1->u[0];
3127   dst->u[1] = src0->u[1] ^ src1->u[1];
3128   dst->u[2] = src0->u[2] ^ src1->u[2];
3129   dst->u[3] = src0->u[3] ^ src1->u[3];
3130}
3131
3132static void
3133micro_mod(union tgsi_exec_channel *dst,
3134          const union tgsi_exec_channel *src0,
3135          const union tgsi_exec_channel *src1)
3136{
3137   dst->i[0] = src0->i[0] % src1->i[0];
3138   dst->i[1] = src0->i[1] % src1->i[1];
3139   dst->i[2] = src0->i[2] % src1->i[2];
3140   dst->i[3] = src0->i[3] % src1->i[3];
3141}
3142
3143static void
3144micro_f2i(union tgsi_exec_channel *dst,
3145          const union tgsi_exec_channel *src)
3146{
3147   dst->i[0] = (int)src->f[0];
3148   dst->i[1] = (int)src->f[1];
3149   dst->i[2] = (int)src->f[2];
3150   dst->i[3] = (int)src->f[3];
3151}
3152
3153static void
3154micro_idiv(union tgsi_exec_channel *dst,
3155           const union tgsi_exec_channel *src0,
3156           const union tgsi_exec_channel *src1)
3157{
3158   dst->i[0] = src0->i[0] / src1->i[0];
3159   dst->i[1] = src0->i[1] / src1->i[1];
3160   dst->i[2] = src0->i[2] / src1->i[2];
3161   dst->i[3] = src0->i[3] / src1->i[3];
3162}
3163
3164static void
3165micro_imax(union tgsi_exec_channel *dst,
3166           const union tgsi_exec_channel *src0,
3167           const union tgsi_exec_channel *src1)
3168{
3169   dst->i[0] = src0->i[0] > src1->i[0] ? src0->i[0] : src1->i[0];
3170   dst->i[1] = src0->i[1] > src1->i[1] ? src0->i[1] : src1->i[1];
3171   dst->i[2] = src0->i[2] > src1->i[2] ? src0->i[2] : src1->i[2];
3172   dst->i[3] = src0->i[3] > src1->i[3] ? src0->i[3] : src1->i[3];
3173}
3174
3175static void
3176micro_imin(union tgsi_exec_channel *dst,
3177           const union tgsi_exec_channel *src0,
3178           const union tgsi_exec_channel *src1)
3179{
3180   dst->i[0] = src0->i[0] < src1->i[0] ? src0->i[0] : src1->i[0];
3181   dst->i[1] = src0->i[1] < src1->i[1] ? src0->i[1] : src1->i[1];
3182   dst->i[2] = src0->i[2] < src1->i[2] ? src0->i[2] : src1->i[2];
3183   dst->i[3] = src0->i[3] < src1->i[3] ? src0->i[3] : src1->i[3];
3184}
3185
3186static void
3187micro_isge(union tgsi_exec_channel *dst,
3188           const union tgsi_exec_channel *src0,
3189           const union tgsi_exec_channel *src1)
3190{
3191   dst->i[0] = src0->i[0] >= src1->i[0] ? -1 : 0;
3192   dst->i[1] = src0->i[1] >= src1->i[1] ? -1 : 0;
3193   dst->i[2] = src0->i[2] >= src1->i[2] ? -1 : 0;
3194   dst->i[3] = src0->i[3] >= src1->i[3] ? -1 : 0;
3195}
3196
3197static void
3198micro_ishr(union tgsi_exec_channel *dst,
3199           const union tgsi_exec_channel *src0,
3200           const union tgsi_exec_channel *src1)
3201{
3202   dst->i[0] = src0->i[0] >> src1->i[0];
3203   dst->i[1] = src0->i[1] >> src1->i[1];
3204   dst->i[2] = src0->i[2] >> src1->i[2];
3205   dst->i[3] = src0->i[3] >> src1->i[3];
3206}
3207
3208static void
3209micro_islt(union tgsi_exec_channel *dst,
3210           const union tgsi_exec_channel *src0,
3211           const union tgsi_exec_channel *src1)
3212{
3213   dst->i[0] = src0->i[0] < src1->i[0] ? -1 : 0;
3214   dst->i[1] = src0->i[1] < src1->i[1] ? -1 : 0;
3215   dst->i[2] = src0->i[2] < src1->i[2] ? -1 : 0;
3216   dst->i[3] = src0->i[3] < src1->i[3] ? -1 : 0;
3217}
3218
3219static void
3220micro_f2u(union tgsi_exec_channel *dst,
3221          const union tgsi_exec_channel *src)
3222{
3223   dst->u[0] = (uint)src->f[0];
3224   dst->u[1] = (uint)src->f[1];
3225   dst->u[2] = (uint)src->f[2];
3226   dst->u[3] = (uint)src->f[3];
3227}
3228
3229static void
3230micro_u2f(union tgsi_exec_channel *dst,
3231          const union tgsi_exec_channel *src)
3232{
3233   dst->f[0] = (float)src->u[0];
3234   dst->f[1] = (float)src->u[1];
3235   dst->f[2] = (float)src->u[2];
3236   dst->f[3] = (float)src->u[3];
3237}
3238
3239static void
3240micro_uadd(union tgsi_exec_channel *dst,
3241           const union tgsi_exec_channel *src0,
3242           const union tgsi_exec_channel *src1)
3243{
3244   dst->u[0] = src0->u[0] + src1->u[0];
3245   dst->u[1] = src0->u[1] + src1->u[1];
3246   dst->u[2] = src0->u[2] + src1->u[2];
3247   dst->u[3] = src0->u[3] + src1->u[3];
3248}
3249
3250static void
3251micro_udiv(union tgsi_exec_channel *dst,
3252           const union tgsi_exec_channel *src0,
3253           const union tgsi_exec_channel *src1)
3254{
3255   dst->u[0] = src0->u[0] / src1->u[0];
3256   dst->u[1] = src0->u[1] / src1->u[1];
3257   dst->u[2] = src0->u[2] / src1->u[2];
3258   dst->u[3] = src0->u[3] / src1->u[3];
3259}
3260
3261static void
3262micro_umad(union tgsi_exec_channel *dst,
3263           const union tgsi_exec_channel *src0,
3264           const union tgsi_exec_channel *src1,
3265           const union tgsi_exec_channel *src2)
3266{
3267   dst->u[0] = src0->u[0] * src1->u[0] + src2->u[0];
3268   dst->u[1] = src0->u[1] * src1->u[1] + src2->u[1];
3269   dst->u[2] = src0->u[2] * src1->u[2] + src2->u[2];
3270   dst->u[3] = src0->u[3] * src1->u[3] + src2->u[3];
3271}
3272
3273static void
3274micro_umax(union tgsi_exec_channel *dst,
3275           const union tgsi_exec_channel *src0,
3276           const union tgsi_exec_channel *src1)
3277{
3278   dst->u[0] = src0->u[0] > src1->u[0] ? src0->u[0] : src1->u[0];
3279   dst->u[1] = src0->u[1] > src1->u[1] ? src0->u[1] : src1->u[1];
3280   dst->u[2] = src0->u[2] > src1->u[2] ? src0->u[2] : src1->u[2];
3281   dst->u[3] = src0->u[3] > src1->u[3] ? src0->u[3] : src1->u[3];
3282}
3283
3284static void
3285micro_umin(union tgsi_exec_channel *dst,
3286           const union tgsi_exec_channel *src0,
3287           const union tgsi_exec_channel *src1)
3288{
3289   dst->u[0] = src0->u[0] < src1->u[0] ? src0->u[0] : src1->u[0];
3290   dst->u[1] = src0->u[1] < src1->u[1] ? src0->u[1] : src1->u[1];
3291   dst->u[2] = src0->u[2] < src1->u[2] ? src0->u[2] : src1->u[2];
3292   dst->u[3] = src0->u[3] < src1->u[3] ? src0->u[3] : src1->u[3];
3293}
3294
3295static void
3296micro_umod(union tgsi_exec_channel *dst,
3297           const union tgsi_exec_channel *src0,
3298           const union tgsi_exec_channel *src1)
3299{
3300   dst->u[0] = src0->u[0] % src1->u[0];
3301   dst->u[1] = src0->u[1] % src1->u[1];
3302   dst->u[2] = src0->u[2] % src1->u[2];
3303   dst->u[3] = src0->u[3] % src1->u[3];
3304}
3305
3306static void
3307micro_umul(union tgsi_exec_channel *dst,
3308           const union tgsi_exec_channel *src0,
3309           const union tgsi_exec_channel *src1)
3310{
3311   dst->u[0] = src0->u[0] * src1->u[0];
3312   dst->u[1] = src0->u[1] * src1->u[1];
3313   dst->u[2] = src0->u[2] * src1->u[2];
3314   dst->u[3] = src0->u[3] * src1->u[3];
3315}
3316
3317static void
3318micro_useq(union tgsi_exec_channel *dst,
3319           const union tgsi_exec_channel *src0,
3320           const union tgsi_exec_channel *src1)
3321{
3322   dst->u[0] = src0->u[0] == src1->u[0] ? ~0 : 0;
3323   dst->u[1] = src0->u[1] == src1->u[1] ? ~0 : 0;
3324   dst->u[2] = src0->u[2] == src1->u[2] ? ~0 : 0;
3325   dst->u[3] = src0->u[3] == src1->u[3] ? ~0 : 0;
3326}
3327
3328static void
3329micro_usge(union tgsi_exec_channel *dst,
3330           const union tgsi_exec_channel *src0,
3331           const union tgsi_exec_channel *src1)
3332{
3333   dst->u[0] = src0->u[0] >= src1->u[0] ? ~0 : 0;
3334   dst->u[1] = src0->u[1] >= src1->u[1] ? ~0 : 0;
3335   dst->u[2] = src0->u[2] >= src1->u[2] ? ~0 : 0;
3336   dst->u[3] = src0->u[3] >= src1->u[3] ? ~0 : 0;
3337}
3338
3339static void
3340micro_ushr(union tgsi_exec_channel *dst,
3341           const union tgsi_exec_channel *src0,
3342           const union tgsi_exec_channel *src1)
3343{
3344   dst->u[0] = src0->u[0] >> src1->u[0];
3345   dst->u[1] = src0->u[1] >> src1->u[1];
3346   dst->u[2] = src0->u[2] >> src1->u[2];
3347   dst->u[3] = src0->u[3] >> src1->u[3];
3348}
3349
3350static void
3351micro_uslt(union tgsi_exec_channel *dst,
3352           const union tgsi_exec_channel *src0,
3353           const union tgsi_exec_channel *src1)
3354{
3355   dst->u[0] = src0->u[0] < src1->u[0] ? ~0 : 0;
3356   dst->u[1] = src0->u[1] < src1->u[1] ? ~0 : 0;
3357   dst->u[2] = src0->u[2] < src1->u[2] ? ~0 : 0;
3358   dst->u[3] = src0->u[3] < src1->u[3] ? ~0 : 0;
3359}
3360
3361static void
3362micro_usne(union tgsi_exec_channel *dst,
3363           const union tgsi_exec_channel *src0,
3364           const union tgsi_exec_channel *src1)
3365{
3366   dst->u[0] = src0->u[0] != src1->u[0] ? ~0 : 0;
3367   dst->u[1] = src0->u[1] != src1->u[1] ? ~0 : 0;
3368   dst->u[2] = src0->u[2] != src1->u[2] ? ~0 : 0;
3369   dst->u[3] = src0->u[3] != src1->u[3] ? ~0 : 0;
3370}
3371
3372static void
3373micro_uarl(union tgsi_exec_channel *dst,
3374           const union tgsi_exec_channel *src)
3375{
3376   dst->i[0] = src->u[0];
3377   dst->i[1] = src->u[1];
3378   dst->i[2] = src->u[2];
3379   dst->i[3] = src->u[3];
3380}
3381
3382static void
3383micro_ucmp(union tgsi_exec_channel *dst,
3384           const union tgsi_exec_channel *src0,
3385           const union tgsi_exec_channel *src1,
3386           const union tgsi_exec_channel *src2)
3387{
3388   dst->u[0] = src0->u[0] ? src1->u[0] : src2->u[0];
3389   dst->u[1] = src0->u[1] ? src1->u[1] : src2->u[1];
3390   dst->u[2] = src0->u[2] ? src1->u[2] : src2->u[2];
3391   dst->u[3] = src0->u[3] ? src1->u[3] : src2->u[3];
3392}
3393
3394static void
3395exec_instruction(
3396   struct tgsi_exec_machine *mach,
3397   const struct tgsi_full_instruction *inst,
3398   int *pc )
3399{
3400   union tgsi_exec_channel r[10];
3401
3402   (*pc)++;
3403
3404   switch (inst->Instruction.Opcode) {
3405   case TGSI_OPCODE_ARL:
3406      exec_vector_unary(mach, inst, micro_arl, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_FLOAT);
3407      break;
3408
3409   case TGSI_OPCODE_MOV:
3410      exec_vector_unary(mach, inst, micro_mov, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_FLOAT);
3411      break;
3412
3413   case TGSI_OPCODE_LIT:
3414      exec_lit(mach, inst);
3415      break;
3416
3417   case TGSI_OPCODE_RCP:
3418      exec_scalar_unary(mach, inst, micro_rcp, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3419      break;
3420
3421   case TGSI_OPCODE_RSQ:
3422      exec_scalar_unary(mach, inst, micro_rsq, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3423      break;
3424
3425   case TGSI_OPCODE_EXP:
3426      exec_exp(mach, inst);
3427      break;
3428
3429   case TGSI_OPCODE_LOG:
3430      exec_log(mach, inst);
3431      break;
3432
3433   case TGSI_OPCODE_MUL:
3434      exec_vector_binary(mach, inst, micro_mul, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3435      break;
3436
3437   case TGSI_OPCODE_ADD:
3438      exec_vector_binary(mach, inst, micro_add, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3439      break;
3440
3441   case TGSI_OPCODE_DP3:
3442      exec_dp3(mach, inst);
3443      break;
3444
3445   case TGSI_OPCODE_DP4:
3446      exec_dp4(mach, inst);
3447      break;
3448
3449   case TGSI_OPCODE_DST:
3450      exec_dst(mach, inst);
3451      break;
3452
3453   case TGSI_OPCODE_MIN:
3454      exec_vector_binary(mach, inst, micro_min, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3455      break;
3456
3457   case TGSI_OPCODE_MAX:
3458      exec_vector_binary(mach, inst, micro_max, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3459      break;
3460
3461   case TGSI_OPCODE_SLT:
3462      exec_vector_binary(mach, inst, micro_slt, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3463      break;
3464
3465   case TGSI_OPCODE_SGE:
3466      exec_vector_binary(mach, inst, micro_sge, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3467      break;
3468
3469   case TGSI_OPCODE_MAD:
3470      exec_vector_trinary(mach, inst, micro_mad, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3471      break;
3472
3473   case TGSI_OPCODE_SUB:
3474      exec_vector_binary(mach, inst, micro_sub, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3475      break;
3476
3477   case TGSI_OPCODE_LRP:
3478      exec_vector_trinary(mach, inst, micro_lrp, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3479      break;
3480
3481   case TGSI_OPCODE_CND:
3482      exec_vector_trinary(mach, inst, micro_cnd, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3483      break;
3484
3485   case TGSI_OPCODE_DP2A:
3486      exec_dp2a(mach, inst);
3487      break;
3488
3489   case TGSI_OPCODE_FRC:
3490      exec_vector_unary(mach, inst, micro_frc, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3491      break;
3492
3493   case TGSI_OPCODE_CLAMP:
3494      exec_vector_trinary(mach, inst, micro_clamp, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3495      break;
3496
3497   case TGSI_OPCODE_FLR:
3498      exec_vector_unary(mach, inst, micro_flr, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3499      break;
3500
3501   case TGSI_OPCODE_ROUND:
3502      exec_vector_unary(mach, inst, micro_rnd, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3503      break;
3504
3505   case TGSI_OPCODE_EX2:
3506      exec_scalar_unary(mach, inst, micro_exp2, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3507      break;
3508
3509   case TGSI_OPCODE_LG2:
3510      exec_scalar_unary(mach, inst, micro_lg2, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3511      break;
3512
3513   case TGSI_OPCODE_POW:
3514      exec_scalar_binary(mach, inst, micro_pow, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3515      break;
3516
3517   case TGSI_OPCODE_XPD:
3518      exec_xpd(mach, inst);
3519      break;
3520
3521   case TGSI_OPCODE_ABS:
3522      exec_vector_unary(mach, inst, micro_abs, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3523      break;
3524
3525   case TGSI_OPCODE_RCC:
3526      exec_scalar_unary(mach, inst, micro_rcc, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3527      break;
3528
3529   case TGSI_OPCODE_DPH:
3530      exec_dph(mach, inst);
3531      break;
3532
3533   case TGSI_OPCODE_COS:
3534      exec_scalar_unary(mach, inst, micro_cos, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3535      break;
3536
3537   case TGSI_OPCODE_DDX:
3538      exec_vector_unary(mach, inst, micro_ddx, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3539      break;
3540
3541   case TGSI_OPCODE_DDY:
3542      exec_vector_unary(mach, inst, micro_ddy, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3543      break;
3544
3545   case TGSI_OPCODE_KILP:
3546      exec_kilp (mach, inst);
3547      break;
3548
3549   case TGSI_OPCODE_KIL:
3550      exec_kil (mach, inst);
3551      break;
3552
3553   case TGSI_OPCODE_PK2H:
3554      assert (0);
3555      break;
3556
3557   case TGSI_OPCODE_PK2US:
3558      assert (0);
3559      break;
3560
3561   case TGSI_OPCODE_PK4B:
3562      assert (0);
3563      break;
3564
3565   case TGSI_OPCODE_PK4UB:
3566      assert (0);
3567      break;
3568
3569   case TGSI_OPCODE_RFL:
3570      exec_rfl(mach, inst);
3571      break;
3572
3573   case TGSI_OPCODE_SEQ:
3574      exec_vector_binary(mach, inst, micro_seq, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3575      break;
3576
3577   case TGSI_OPCODE_SFL:
3578      exec_vector(mach, inst, micro_sfl, TGSI_EXEC_DATA_FLOAT);
3579      break;
3580
3581   case TGSI_OPCODE_SGT:
3582      exec_vector_binary(mach, inst, micro_sgt, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3583      break;
3584
3585   case TGSI_OPCODE_SIN:
3586      exec_scalar_unary(mach, inst, micro_sin, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3587      break;
3588
3589   case TGSI_OPCODE_SLE:
3590      exec_vector_binary(mach, inst, micro_sle, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3591      break;
3592
3593   case TGSI_OPCODE_SNE:
3594      exec_vector_binary(mach, inst, micro_sne, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3595      break;
3596
3597   case TGSI_OPCODE_STR:
3598      exec_vector(mach, inst, micro_str, TGSI_EXEC_DATA_FLOAT);
3599      break;
3600
3601   case TGSI_OPCODE_TEX:
3602      /* simple texture lookup */
3603      /* src[0] = texcoord */
3604      /* src[1] = sampler unit */
3605      exec_tex(mach, inst, TEX_MODIFIER_NONE);
3606      break;
3607
3608   case TGSI_OPCODE_TXB:
3609      /* Texture lookup with lod bias */
3610      /* src[0] = texcoord (src[0].w = LOD bias) */
3611      /* src[1] = sampler unit */
3612      exec_tex(mach, inst, TEX_MODIFIER_LOD_BIAS);
3613      break;
3614
3615   case TGSI_OPCODE_TXD:
      /* Texture lookup with explicit partial derivatives */
3617      /* src[0] = texcoord */
3618      /* src[1] = d[strq]/dx */
3619      /* src[2] = d[strq]/dy */
3620      /* src[3] = sampler unit */
3621      exec_txd(mach, inst);
3622      break;
3623
3624   case TGSI_OPCODE_TXL:
      /* Texture lookup with explicit LOD */
3626      /* src[0] = texcoord (src[0].w = LOD) */
3627      /* src[1] = sampler unit */
3628      exec_tex(mach, inst, TEX_MODIFIER_EXPLICIT_LOD);
3629      break;
3630
3631   case TGSI_OPCODE_TXP:
3632      /* Texture lookup with projection */
3633      /* src[0] = texcoord (src[0].w = projection) */
3634      /* src[1] = sampler unit */
3635      exec_tex(mach, inst, TEX_MODIFIER_PROJECTED);
3636      break;
3637
3638   case TGSI_OPCODE_UP2H:
3639      assert (0);
3640      break;
3641
3642   case TGSI_OPCODE_UP2US:
3643      assert (0);
3644      break;
3645
3646   case TGSI_OPCODE_UP4B:
3647      assert (0);
3648      break;
3649
3650   case TGSI_OPCODE_UP4UB:
3651      assert (0);
3652      break;
3653
3654   case TGSI_OPCODE_X2D:
3655      exec_x2d(mach, inst);
3656      break;
3657
3658   case TGSI_OPCODE_ARA:
3659      assert (0);
3660      break;
3661
3662   case TGSI_OPCODE_ARR:
3663      exec_vector_unary(mach, inst, micro_arr, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_FLOAT);
3664      break;
3665
3666   case TGSI_OPCODE_BRA:
3667      assert (0);
3668      break;
3669
3670   case TGSI_OPCODE_CAL:
3671      /* skip the call if no execution channels are enabled */
3672      if (mach->ExecMask) {
3673         /* do the call */
3674
3675         /* First, record the depths of the execution stacks.
3676          * This is important for deeply nested/looped return statements.
3677          * We have to unwind the stacks by the correct amount.  For a
3678          * real code generator, we could determine the number of entries
3679          * to pop off each stack with simple static analysis and avoid
3680          * implementing this data structure at run time.
3681          */
3682         mach->CallStack[mach->CallStackTop].CondStackTop = mach->CondStackTop;
3683         mach->CallStack[mach->CallStackTop].LoopStackTop = mach->LoopStackTop;
3684         mach->CallStack[mach->CallStackTop].ContStackTop = mach->ContStackTop;
3685         mach->CallStack[mach->CallStackTop].SwitchStackTop = mach->SwitchStackTop;
3686         mach->CallStack[mach->CallStackTop].BreakStackTop = mach->BreakStackTop;
3687         /* note that PC was already incremented above */
3688         mach->CallStack[mach->CallStackTop].ReturnAddr = *pc;
3689
3690         mach->CallStackTop++;
3691
3692         /* Second, push the Cond, Loop, Cont, Func stacks */
3693         assert(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING);
3694         assert(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
3695         assert(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
3696         assert(mach->SwitchStackTop < TGSI_EXEC_MAX_SWITCH_NESTING);
3697         assert(mach->BreakStackTop < TGSI_EXEC_MAX_BREAK_STACK);
3698         assert(mach->FuncStackTop < TGSI_EXEC_MAX_CALL_NESTING);
3699
3700         mach->CondStack[mach->CondStackTop++] = mach->CondMask;
3701         mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask;
3702         mach->ContStack[mach->ContStackTop++] = mach->ContMask;
3703         mach->SwitchStack[mach->SwitchStackTop++] = mach->Switch;
3704         mach->BreakStack[mach->BreakStackTop++] = mach->BreakType;
3705         mach->FuncStack[mach->FuncStackTop++] = mach->FuncMask;
3706
3707         /* Finally, jump to the subroutine */
3708         *pc = inst->Label.Label;
3709      }
3710      break;
3711
3712   case TGSI_OPCODE_RET:
3713      mach->FuncMask &= ~mach->ExecMask;
3714      UPDATE_EXEC_MASK(mach);
3715
3716      if (mach->FuncMask == 0x0) {
         /* really return now (otherwise, keep executing) */
3718
3719         if (mach->CallStackTop == 0) {
3720            /* returning from main() */
3721            mach->CondStackTop = 0;
3722            mach->LoopStackTop = 0;
3723            *pc = -1;
3724            return;
3725         }
3726
3727         assert(mach->CallStackTop > 0);
3728         mach->CallStackTop--;
3729
3730         mach->CondStackTop = mach->CallStack[mach->CallStackTop].CondStackTop;
3731         mach->CondMask = mach->CondStack[mach->CondStackTop];
3732
3733         mach->LoopStackTop = mach->CallStack[mach->CallStackTop].LoopStackTop;
3734         mach->LoopMask = mach->LoopStack[mach->LoopStackTop];
3735
3736         mach->ContStackTop = mach->CallStack[mach->CallStackTop].ContStackTop;
3737         mach->ContMask = mach->ContStack[mach->ContStackTop];
3738
3739         mach->SwitchStackTop = mach->CallStack[mach->CallStackTop].SwitchStackTop;
3740         mach->Switch = mach->SwitchStack[mach->SwitchStackTop];
3741
3742         mach->BreakStackTop = mach->CallStack[mach->CallStackTop].BreakStackTop;
3743         mach->BreakType = mach->BreakStack[mach->BreakStackTop];
3744
3745         assert(mach->FuncStackTop > 0);
3746         mach->FuncMask = mach->FuncStack[--mach->FuncStackTop];
3747
3748         *pc = mach->CallStack[mach->CallStackTop].ReturnAddr;
3749
3750         UPDATE_EXEC_MASK(mach);
3751      }
3752      break;
3753
3754   case TGSI_OPCODE_SSG:
3755      exec_vector_unary(mach, inst, micro_sgn, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3756      break;
3757
3758   case TGSI_OPCODE_CMP:
3759      exec_vector_trinary(mach, inst, micro_cmp, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3760      break;
3761
3762   case TGSI_OPCODE_SCS:
3763      exec_scs(mach, inst);
3764      break;
3765
3766   case TGSI_OPCODE_NRM:
3767      exec_nrm3(mach, inst);
3768      break;
3769
3770   case TGSI_OPCODE_NRM4:
3771      exec_nrm4(mach, inst);
3772      break;
3773
3774   case TGSI_OPCODE_DIV:
3775      exec_vector_binary(mach, inst, micro_div, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3776      break;
3777
3778   case TGSI_OPCODE_DP2:
3779      exec_dp2(mach, inst);
3780      break;
3781
3782   case TGSI_OPCODE_IF:
3783      /* push CondMask */
3784      assert(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING);
3785      mach->CondStack[mach->CondStackTop++] = mach->CondMask;
3786      FETCH( &r[0], 0, CHAN_X );
3787      /* update CondMask */
3788      if( ! r[0].u[0] ) {
3789         mach->CondMask &= ~0x1;
3790      }
3791      if( ! r[0].u[1] ) {
3792         mach->CondMask &= ~0x2;
3793      }
3794      if( ! r[0].u[2] ) {
3795         mach->CondMask &= ~0x4;
3796      }
3797      if( ! r[0].u[3] ) {
3798         mach->CondMask &= ~0x8;
3799      }
3800      UPDATE_EXEC_MASK(mach);
3801      /* Todo: If CondMask==0, jump to ELSE */
3802      break;
3803
3804   case TGSI_OPCODE_ELSE:
3805      /* invert CondMask wrt previous mask */
3806      {
3807         uint prevMask;
3808         assert(mach->CondStackTop > 0);
3809         prevMask = mach->CondStack[mach->CondStackTop - 1];
3810         mach->CondMask = ~mach->CondMask & prevMask;
3811         UPDATE_EXEC_MASK(mach);
3812         /* Todo: If CondMask==0, jump to ENDIF */
3813      }
3814      break;
3815
3816   case TGSI_OPCODE_ENDIF:
3817      /* pop CondMask */
3818      assert(mach->CondStackTop > 0);
3819      mach->CondMask = mach->CondStack[--mach->CondStackTop];
3820      UPDATE_EXEC_MASK(mach);
3821      break;
3822
3823   case TGSI_OPCODE_END:
3824      /* make sure we end primitives which haven't
3825       * been explicitly emitted */
3826      conditional_emit_primitive(mach);
3827      /* halt execution */
3828      *pc = -1;
3829      break;
3830
3831   case TGSI_OPCODE_PUSHA:
3832      assert (0);
3833      break;
3834
3835   case TGSI_OPCODE_POPA:
3836      assert (0);
3837      break;
3838
3839   case TGSI_OPCODE_CEIL:
3840      exec_vector_unary(mach, inst, micro_ceil, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3841      break;
3842
3843   case TGSI_OPCODE_I2F:
3844      exec_vector_unary(mach, inst, micro_i2f, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_INT);
3845      break;
3846
3847   case TGSI_OPCODE_NOT:
3848      exec_vector_unary(mach, inst, micro_not, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3849      break;
3850
3851   case TGSI_OPCODE_TRUNC:
3852      exec_vector_unary(mach, inst, micro_trunc, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3853      break;
3854
3855   case TGSI_OPCODE_SHL:
3856      exec_vector_binary(mach, inst, micro_shl, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3857      break;
3858
3859   case TGSI_OPCODE_AND:
3860      exec_vector_binary(mach, inst, micro_and, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3861      break;
3862
3863   case TGSI_OPCODE_OR:
3864      exec_vector_binary(mach, inst, micro_or, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3865      break;
3866
3867   case TGSI_OPCODE_MOD:
3868      exec_vector_binary(mach, inst, micro_mod, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
3869      break;
3870
3871   case TGSI_OPCODE_XOR:
3872      exec_vector_binary(mach, inst, micro_xor, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3873      break;
3874
3875   case TGSI_OPCODE_SAD:
3876      assert (0);
3877      break;
3878
3879   case TGSI_OPCODE_TXF:
3880      exec_txf(mach, inst);
3881      break;
3882
3883   case TGSI_OPCODE_TXQ:
3884      exec_txq(mach, inst);
3885      break;
3886
3887   case TGSI_OPCODE_EMIT:
3888      emit_vertex(mach);
3889      break;
3890
3891   case TGSI_OPCODE_ENDPRIM:
3892      emit_primitive(mach);
3893      break;
3894
3895   case TGSI_OPCODE_BGNLOOP:
3896      /* push LoopMask and ContMasks */
3897      assert(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
3898      assert(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
3899      assert(mach->LoopLabelStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
3900      assert(mach->BreakStackTop < TGSI_EXEC_MAX_BREAK_STACK);
3901
3902      mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask;
3903      mach->ContStack[mach->ContStackTop++] = mach->ContMask;
3904      mach->LoopLabelStack[mach->LoopLabelStackTop++] = *pc - 1;
3905      mach->BreakStack[mach->BreakStackTop++] = mach->BreakType;
3906      mach->BreakType = TGSI_EXEC_BREAK_INSIDE_LOOP;
3907      break;
3908
3909   case TGSI_OPCODE_ENDLOOP:
3910      /* Restore ContMask, but don't pop */
3911      assert(mach->ContStackTop > 0);
3912      mach->ContMask = mach->ContStack[mach->ContStackTop - 1];
3913      UPDATE_EXEC_MASK(mach);
3914      if (mach->ExecMask) {
3915         /* repeat loop: jump to instruction just past BGNLOOP */
3916         assert(mach->LoopLabelStackTop > 0);
3917         *pc = mach->LoopLabelStack[mach->LoopLabelStackTop - 1] + 1;
3918      }
3919      else {
3920         /* exit loop: pop LoopMask */
3921         assert(mach->LoopStackTop > 0);
3922         mach->LoopMask = mach->LoopStack[--mach->LoopStackTop];
3923         /* pop ContMask */
3924         assert(mach->ContStackTop > 0);
3925         mach->ContMask = mach->ContStack[--mach->ContStackTop];
3926         assert(mach->LoopLabelStackTop > 0);
3927         --mach->LoopLabelStackTop;
3928
3929         mach->BreakType = mach->BreakStack[--mach->BreakStackTop];
3930      }
3931      UPDATE_EXEC_MASK(mach);
3932      break;
3933
3934   case TGSI_OPCODE_BRK:
3935      exec_break(mach);
3936      break;
3937
3938   case TGSI_OPCODE_CONT:
3939      /* turn off cont channels for each enabled exec channel */
3940      mach->ContMask &= ~mach->ExecMask;
3941      /* Todo: if mach->LoopMask == 0, jump to end of loop */
3942      UPDATE_EXEC_MASK(mach);
3943      break;
3944
3945   case TGSI_OPCODE_BGNSUB:
3946      /* no-op */
3947      break;
3948
3949   case TGSI_OPCODE_ENDSUB:
3950      /*
3951       * XXX: This really should be a no-op. We should never reach this opcode.
3952       */
3953
3954      assert(mach->CallStackTop > 0);
3955      mach->CallStackTop--;
3956
3957      mach->CondStackTop = mach->CallStack[mach->CallStackTop].CondStackTop;
3958      mach->CondMask = mach->CondStack[mach->CondStackTop];
3959
3960      mach->LoopStackTop = mach->CallStack[mach->CallStackTop].LoopStackTop;
3961      mach->LoopMask = mach->LoopStack[mach->LoopStackTop];
3962
3963      mach->ContStackTop = mach->CallStack[mach->CallStackTop].ContStackTop;
3964      mach->ContMask = mach->ContStack[mach->ContStackTop];
3965
3966      mach->SwitchStackTop = mach->CallStack[mach->CallStackTop].SwitchStackTop;
3967      mach->Switch = mach->SwitchStack[mach->SwitchStackTop];
3968
3969      mach->BreakStackTop = mach->CallStack[mach->CallStackTop].BreakStackTop;
3970      mach->BreakType = mach->BreakStack[mach->BreakStackTop];
3971
3972      assert(mach->FuncStackTop > 0);
3973      mach->FuncMask = mach->FuncStack[--mach->FuncStackTop];
3974
3975      *pc = mach->CallStack[mach->CallStackTop].ReturnAddr;
3976
3977      UPDATE_EXEC_MASK(mach);
3978      break;
3979
3980   case TGSI_OPCODE_NOP:
3981      break;
3982
3983   case TGSI_OPCODE_BREAKC:
3984      FETCH(&r[0], 0, CHAN_X);
3985      /* update CondMask */
3986      if (r[0].u[0] && (mach->ExecMask & 0x1)) {
3987         mach->LoopMask &= ~0x1;
3988      }
3989      if (r[0].u[1] && (mach->ExecMask & 0x2)) {
3990         mach->LoopMask &= ~0x2;
3991      }
3992      if (r[0].u[2] && (mach->ExecMask & 0x4)) {
3993         mach->LoopMask &= ~0x4;
3994      }
3995      if (r[0].u[3] && (mach->ExecMask & 0x8)) {
3996         mach->LoopMask &= ~0x8;
3997      }
3998      /* Todo: if mach->LoopMask == 0, jump to end of loop */
3999      UPDATE_EXEC_MASK(mach);
4000      break;
4001
4002   case TGSI_OPCODE_F2I:
4003      exec_vector_unary(mach, inst, micro_f2i, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_FLOAT);
4004      break;
4005
4006   case TGSI_OPCODE_IDIV:
4007      exec_vector_binary(mach, inst, micro_idiv, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
4008      break;
4009
4010   case TGSI_OPCODE_IMAX:
4011      exec_vector_binary(mach, inst, micro_imax, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
4012      break;
4013
4014   case TGSI_OPCODE_IMIN:
4015      exec_vector_binary(mach, inst, micro_imin, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
4016      break;
4017
4018   case TGSI_OPCODE_INEG:
4019      exec_vector_unary(mach, inst, micro_ineg, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
4020      break;
4021
4022   case TGSI_OPCODE_ISGE:
4023      exec_vector_binary(mach, inst, micro_isge, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
4024      break;
4025
4026   case TGSI_OPCODE_ISHR:
4027      exec_vector_binary(mach, inst, micro_ishr, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
4028      break;
4029
4030   case TGSI_OPCODE_ISLT:
4031      exec_vector_binary(mach, inst, micro_islt, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
4032      break;
4033
4034   case TGSI_OPCODE_F2U:
4035      exec_vector_unary(mach, inst, micro_f2u, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_FLOAT);
4036      break;
4037
4038   case TGSI_OPCODE_U2F:
4039      exec_vector_unary(mach, inst, micro_u2f, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_UINT);
4040      break;
4041
4042   case TGSI_OPCODE_UADD:
4043      exec_vector_binary(mach, inst, micro_uadd, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
4044      break;
4045
4046   case TGSI_OPCODE_UDIV:
4047      exec_vector_binary(mach, inst, micro_udiv, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
4048      break;
4049
4050   case TGSI_OPCODE_UMAD:
4051      exec_vector_trinary(mach, inst, micro_umad, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
4052      break;
4053
4054   case TGSI_OPCODE_UMAX:
4055      exec_vector_binary(mach, inst, micro_umax, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
4056      break;
4057
4058   case TGSI_OPCODE_UMIN:
4059      exec_vector_binary(mach, inst, micro_umin, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
4060      break;
4061
4062   case TGSI_OPCODE_UMOD:
4063      exec_vector_binary(mach, inst, micro_umod, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
4064      break;
4065
4066   case TGSI_OPCODE_UMUL:
4067      exec_vector_binary(mach, inst, micro_umul, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
4068      break;
4069
4070   case TGSI_OPCODE_USEQ:
4071      exec_vector_binary(mach, inst, micro_useq, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
4072      break;
4073
4074   case TGSI_OPCODE_USGE:
4075      exec_vector_binary(mach, inst, micro_usge, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
4076      break;
4077
4078   case TGSI_OPCODE_USHR:
4079      exec_vector_binary(mach, inst, micro_ushr, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
4080      break;
4081
4082   case TGSI_OPCODE_USLT:
4083      exec_vector_binary(mach, inst, micro_uslt, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
4084      break;
4085
4086   case TGSI_OPCODE_USNE:
4087      exec_vector_binary(mach, inst, micro_usne, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
4088      break;
4089
4090   case TGSI_OPCODE_SWITCH:
4091      exec_switch(mach, inst);
4092      break;
4093
4094   case TGSI_OPCODE_CASE:
4095      exec_case(mach, inst);
4096      break;
4097
4098   case TGSI_OPCODE_DEFAULT:
4099      exec_default(mach);
4100      break;
4101
4102   case TGSI_OPCODE_ENDSWITCH:
4103      exec_endswitch(mach);
4104      break;
4105
4106   case TGSI_OPCODE_LOAD:
4107      assert(0);
4108      break;
4109
4110   case TGSI_OPCODE_LOAD_MS:
4111      assert(0);
4112      break;
4113
4114   case TGSI_OPCODE_SAMPLE:
4115      exec_sample(mach, inst, TEX_MODIFIER_NONE);
4116      break;
4117
4118   case TGSI_OPCODE_SAMPLE_B:
4119      exec_sample(mach, inst, TEX_MODIFIER_LOD_BIAS);
4120      break;
4121
4122   case TGSI_OPCODE_SAMPLE_C:
4123      exec_sample(mach, inst, TEX_MODIFIER_NONE);
4124      break;
4125
4126   case TGSI_OPCODE_SAMPLE_C_LZ:
4127      exec_sample(mach, inst, TEX_MODIFIER_LOD_BIAS);
4128      break;
4129
4130   case TGSI_OPCODE_SAMPLE_D:
4131      exec_sample_d(mach, inst);
4132      break;
4133
4134   case TGSI_OPCODE_SAMPLE_L:
4135      exec_sample(mach, inst, TEX_MODIFIER_EXPLICIT_LOD);
4136      break;
4137
4138   case TGSI_OPCODE_GATHER4:
4139      assert(0);
4140      break;
4141
4142   case TGSI_OPCODE_RESINFO:
4143      assert(0);
4144      break;
4145
4146   case TGSI_OPCODE_SAMPLE_POS:
4147      assert(0);
4148      break;
4149
4150   case TGSI_OPCODE_SAMPLE_INFO:
4151      assert(0);
4152      break;
4153
4154   case TGSI_OPCODE_UARL:
4155      exec_vector_unary(mach, inst, micro_uarl, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_UINT);
4156      break;
4157
4158   case TGSI_OPCODE_UCMP:
4159      exec_vector_trinary(mach, inst, micro_ucmp, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
4160      break;
4161
4162   default:
4163      assert( 0 );
4164   }
4165}
4166
4167
4168#define DEBUG_EXECUTION 0
4169
4170
4171/**
4172 * Run TGSI interpreter.
4173 * \return bitmask of "alive" quad components
4174 */
4175uint
4176tgsi_exec_machine_run( struct tgsi_exec_machine *mach )
4177{
4178   uint i;
4179   int pc = 0;
4180
4181   mach->CondMask = 0xf;
4182   mach->LoopMask = 0xf;
4183   mach->ContMask = 0xf;
4184   mach->FuncMask = 0xf;
4185   mach->ExecMask = 0xf;
4186
4187   mach->Switch.mask = 0xf;
4188
4189   assert(mach->CondStackTop == 0);
4190   assert(mach->LoopStackTop == 0);
4191   assert(mach->ContStackTop == 0);
4192   assert(mach->SwitchStackTop == 0);
4193   assert(mach->BreakStackTop == 0);
4194   assert(mach->CallStackTop == 0);
4195
4196   mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] = 0;
4197   mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] = 0;
4198
4199   if( mach->Processor == TGSI_PROCESSOR_GEOMETRY ) {
4200      mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0] = 0;
4201      mach->Primitives[0] = 0;
4202   }
4203
4204   /* execute declarations (interpolants) */
4205   for (i = 0; i < mach->NumDeclarations; i++) {
4206      exec_declaration( mach, mach->Declarations+i );
4207   }
4208
4209   {
4210#if DEBUG_EXECUTION
4211      struct tgsi_exec_vector temps[TGSI_EXEC_NUM_TEMPS + TGSI_EXEC_NUM_TEMP_EXTRAS];
4212      struct tgsi_exec_vector outputs[PIPE_MAX_ATTRIBS];
4213      uint inst = 1;
4214
4215      memcpy(temps, mach->Temps, sizeof(temps));
4216      memcpy(outputs, mach->Outputs, sizeof(outputs));
4217#endif
4218
4219      /* execute instructions, until pc is set to -1 */
4220      while (pc != -1) {
4221
4222#if DEBUG_EXECUTION
4223         uint i;
4224
4225         tgsi_dump_instruction(&mach->Instructions[pc], inst++);
4226#endif
4227
4228         assert(pc < (int) mach->NumInstructions);
4229         exec_instruction(mach, mach->Instructions + pc, &pc);
4230
4231#if DEBUG_EXECUTION
4232         for (i = 0; i < TGSI_EXEC_NUM_TEMPS + TGSI_EXEC_NUM_TEMP_EXTRAS; i++) {
4233            if (memcmp(&temps[i], &mach->Temps[i], sizeof(temps[i]))) {
4234               uint j;
4235
4236               memcpy(&temps[i], &mach->Temps[i], sizeof(temps[i]));
4237               debug_printf("TEMP[%2u] = ", i);
4238               for (j = 0; j < 4; j++) {
4239                  if (j > 0) {
4240                     debug_printf("           ");
4241                  }
4242                  debug_printf("(%6f %u, %6f %u, %6f %u, %6f %u)\n",
4243                               temps[i].xyzw[0].f[j], temps[i].xyzw[0].u[j],
4244                               temps[i].xyzw[1].f[j], temps[i].xyzw[1].u[j],
4245                               temps[i].xyzw[2].f[j], temps[i].xyzw[2].u[j],
4246                               temps[i].xyzw[3].f[j], temps[i].xyzw[3].u[j]);
4247               }
4248            }
4249         }
4250         for (i = 0; i < PIPE_MAX_ATTRIBS; i++) {
4251            if (memcmp(&outputs[i], &mach->Outputs[i], sizeof(outputs[i]))) {
4252               uint j;
4253
4254               memcpy(&outputs[i], &mach->Outputs[i], sizeof(outputs[i]));
4255               debug_printf("OUT[%2u] =  ", i);
4256               for (j = 0; j < 4; j++) {
4257                  if (j > 0) {
4258                     debug_printf("           ");
4259                  }
4260                  debug_printf("(%6f %u, %6f %u, %6f %u, %6f %u)\n",
4261                               outputs[i].xyzw[0].f[j], outputs[i].xyzw[0].u[j],
4262                               outputs[i].xyzw[1].f[j], outputs[i].xyzw[1].u[j],
4263                               outputs[i].xyzw[2].f[j], outputs[i].xyzw[2].u[j],
4264                               outputs[i].xyzw[3].f[j], outputs[i].xyzw[3].u[j]);
4265               }
4266            }
4267         }
4268#endif
4269      }
4270   }
4271
4272#if 0
4273   /* we scale from floats in [0,1] to Zbuffer ints in sp_quad_depth_test.c */
4274   if (mach->Processor == TGSI_PROCESSOR_FRAGMENT) {
4275      /*
4276       * Scale back depth component.
4277       */
4278      for (i = 0; i < 4; i++)
4279         mach->Outputs[0].xyzw[2].f[i] *= ctx->DrawBuffer->_DepthMaxF;
4280   }
4281#endif
4282
4283   /* Strictly speaking, these assertions aren't really needed but they
4284    * can potentially catch some bugs in the control flow code.
4285    */
4286   assert(mach->CondStackTop == 0);
4287   assert(mach->LoopStackTop == 0);
4288   assert(mach->ContStackTop == 0);
4289   assert(mach->SwitchStackTop == 0);
4290   assert(mach->BreakStackTop == 0);
4291   assert(mach->CallStackTop == 0);
4292
4293   return ~mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
4294}
4295